diff --git a/.gitignore b/.gitignore index a81c8ee121952cf06bfaf9ff9988edd8cded763c..c70c40e0f527c8c20a6bf994bcb8070b95e13e27 100644 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,10 @@ dmypy.json # Cython debug symbols cython_debug/ + +# vscode settings and analysis output +.vscode/ +att_advisor*.html +*.xlsx +operator_tuning_file*.cfg +.ipynb_checkpoints/ \ No newline at end of file diff --git a/OWNERS b/OWNERS index 6ad393ee68f77d0647ee159b27e3a2a6893fedb8..7b721dd643e3399d29ff649e2f76182de72421a4 100644 --- a/OWNERS +++ b/OWNERS @@ -41,4 +41,7 @@ reviewers: - zhengweifeng6 - gong-siwei - uniteone -- binghamhuang \ No newline at end of file +- binghamhuang +- wjchuee +- zhou-xianqi +- stby11 \ No newline at end of file diff --git a/README.md b/README.md index 87a1a03725d6778b24ab322b7ee8a3725c303e4a..ef54de8f6b64f1d9e06744d9c6b6cb1e0034fc9b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # ATT -Ascend Training Tools,昇腾训练工具链。针对训练&大模型场景,提供端到端命令行&可视化调试调优工具,帮助用户快速提高模型开发效率。 +Ascend Training Tools,昇腾训练工具链。【Powered by MindStudio】 + +针对训练&大模型场景,提供端到端命令行&可视化调试调优工具,帮助用户快速提高模型开发效率。 ## 模型训练迁移全流程 ![输入图片说明](debug/resources/model_training_migration_process.png) @@ -45,6 +47,10 @@ Ascend Training Tools,昇腾训练工具链。针对训练&大模型场景, 提供多机多卡的集群分析能力(基于通信域的通信分析和迭代耗时分析), 当前需要配合Ascend Insight的集群分析功能使用。 +3. [affinity_cpu_bind (亲和性cpu绑核工具) ](https://gitee.com/ascend/att/tree/master/profiler/affinity_cpu_bind) + + 提供亲和性CPU绑核能力,改善host_bound调度问题。 + ### [Tensorboard](https://gitee.com/ascend/att/tree/master/plugins/tensorboard-plugins/tb_plugin) Tensorboard支持NPU性能数据可视化插件PyTorch Profiler TensorBoard NPU Plugin。 @@ -79,3 +85,7 @@ ATT分支名称格式为:版本号-ATT,而版本号命名规则如下: 2. 新建 xxx 分支 3. 提交代码 4. 新建 Pull Request + +## 版本过渡提示 + +当前版本预检和ptdbg维护到2024/09/30,准备于2024/09/30下线,相关目录att/debug/accuracy_tools/api_accuracy_checker和att/debug/accuracy_tools/ptdbg_ascend将于2024/09/30删除。新版本的预检和ptdbg已经合到att/debug/accuracy_tools/atat目录下。 diff --git a/debug/OWNERS b/debug/OWNERS new file mode 100644 index 0000000000000000000000000000000000000000..09121722c9d7147133c6f111cd10b279979ebdb3 --- /dev/null +++ b/debug/OWNERS @@ -0,0 +1,11 @@ +options: + no_parent_owners: true +approvers: +- wangchao285 +- kun_8 +- binghamhuang +- brightlyking +reviewers: +- lv-kaimeng +- litian_drinksnow +- binghamhuang diff --git a/debug/accuracy_tools/MANIFEST.in b/debug/accuracy_tools/MANIFEST.in index 21075694b9b36d38ea5d5fea29e00bfbd6d9e538..a0aeb46bbc6b35a3ecb0015e137fadf595a65c4d 100644 --- a/debug/accuracy_tools/MANIFEST.in +++ b/debug/accuracy_tools/MANIFEST.in @@ -1,8 +1,5 @@ recursive-include ptdbg_ascend/src/python/ptdbg_ascend/ *.py recursive-include ptdbg_ascend/src/python/ptdbg_ascend/ *.yaml recursive-include ptdbg_ascend/src/python/ptdbg_ascend/ *.template -recursive-include api_accuracy_checker/ *.py -recursive-include api_accuracy_checker/ *.yaml -recursive-include api_accuracy_checker/ *.json recursive-include atat/ * recursive-exclude api_accuracy_checker/test * diff --git a/debug/accuracy_tools/api_accuracy_checker/README.md b/debug/accuracy_tools/api_accuracy_checker/README.md index a3f7ea68ca248cce237b037f163097cd16de5215..e06c262de0e402aed0868a880fe3bd69b18fdf39 100644 --- a/debug/accuracy_tools/api_accuracy_checker/README.md +++ b/debug/accuracy_tools/api_accuracy_checker/README.md @@ -1,5 +1,9 @@ # Ascend模型精度预检工具 +## 版本过渡提示 + +当前版本预检维护到2024/09/30,准备于2024/09/30下线,相关目录att/debug/accuracy_tools/api_accuracy_checker将于2024/09/30删除。新版本的预检已经合到att/debug/accuracy_tools/atat目录下。 + 
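This transition is also signalled at runtime: the diff below adds a `WarningManager` and a `Const.VERSION_MESSAGE` to `api_accuracy_checker/common/utils.py` and calls them from each entry point. A condensed, self-contained sketch of that pattern — only the standard `warnings` module is assumed, and the message text here paraphrases `Const.VERSION_MESSAGE`:

```python
import warnings

# Paraphrase of Const.VERSION_MESSAGE added in this diff.
VERSION_MESSAGE = ("The current version of api_accuracy_checker will be deprecated "
                   "on September 30, 2024. Please use the version merged into "
                   "att/debug/accuracy_tools/atat instead.")


class WarningManager:
    def warn(self, message=None, enable_warnings=True):
        # Plain UserWarning: shown once per call site under the default filter,
        # and silenceable by passing enable_warnings=False.
        if enable_warnings:
            warnings.warn(message)


# Each entry point (run_ut, multi_run_ut, dump, api_precision_compare) invokes
# this once before starting its real work.
WarningManager().warn(message=VERSION_MESSAGE, enable_warnings=True)
```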
Ascend模型精度预检工具能在昇腾NPU上扫描用户训练模型中所有API,输出精度情况的诊断和分析。工具通过dump模型中所有的API前反向信息;构造相应的API单元测试,将NPU输出与标杆(CPU高精度)比对,从而计算对应的精度指标,该过程称为run_ut;将NPU环境下dump的预检数据拷贝至GPU环境,同样执行run_ut;最后通过新精度标准比对法将NPU和GPU的预检结果进行比对,从而找出NPU中存在精度问题的API。 **新精度标准比对法**:依据新精度标准,对不同的API采取不同的比对算法进行比对(包括绝对阈值法,标杆比对法、二进制一致法、ULP误差比对法和双千指标法),最终给定预检判定结果。 diff --git a/debug/accuracy_tools/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/api_accuracy_checker/common/utils.py index 39905b39fc395b6a10990e6d1061c0f2990e42c4..5af80a1fff26f683adaf88b78c187c290821231f 100644 --- a/debug/accuracy_tools/api_accuracy_checker/common/utils.py +++ b/debug/accuracy_tools/api_accuracy_checker/common/utils.py @@ -25,6 +25,7 @@ import sys import time import csv import logging +import warnings from datetime import datetime, timezone import numpy as np @@ -105,6 +106,10 @@ class Const: "int32_to_int64": ["cross_entropy"] } + VERSION_MESSAGE = """The current version of api_accuracy_checker will be deprecated on September 30, 2024. + The att/debug/accuracy_tools/api_accuracy_checker directory will be deleted on September 30, 2024. + Please use the api_accuracy_checker in the att/debug/accuracy_tools/atat directory.""" + class CompareConst: """ @@ -414,7 +419,7 @@ def save_numpy_data(file_path, data): save_numpy_data """ if not os.path.exists(os.path.dirname(file_path)): - os.makedirs(os.path.dirname(file_path)) + create_directory(os.path.dirname(file_path)) np.save(file_path, data) @@ -689,5 +694,11 @@ def _create_logger(level=logging.INFO): return logger + +class WarningManager: + def warn(self, message=None, enable_warnings=True): + if enable_warnings: + warnings.warn(message) + + log_level = logging.DEBUG if os.environ.get("API_ACCUCARY_CHECK_LOG_LEVEL") == "1" else logging.INFO logger = _create_logger(log_level) diff --git a/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_compare.py b/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_compare.py index 0af8fd1d3f592836d664dbe7701ffa2715c825fa..163668ffe63810647c4c98c6ed41d0c41e6a7383 100644 --- a/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_compare.py +++ b/debug/accuracy_tools/api_accuracy_checker/compare/api_precision_compare.py @@ -6,9 +6,8 @@ import math from collections import namedtuple import torch import pandas as pd - from api_accuracy_checker.common.utils import print_info_log, print_warn_log, print_error_log, write_csv, \ - CompareException, create_directory + CompareException, create_directory, Const, WarningManager from api_accuracy_checker.common.config import msCheckerConfig from api_accuracy_checker.compare.compare_utils import CompareConst, API_PRECISION_COMPARE_RESULT_FILE_NAME, \ API_PRECISION_COMPARE_DETAILS_FILE_NAME, BENCHMARK_COMPARE_SUPPORT_LIST, API_PRECISION_COMPARE_UNSUPPORT_LIST, \ @@ -558,6 +557,8 @@ def _api_precision_compare_parser(parser): if __name__ == '__main__': + wm = WarningManager() + wm.warn(message=Const.VERSION_MESSAGE, enable_warnings=True) _api_precision_compare() print_info_log("Compare task completed.") \ No newline at end of file diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/dump.py b/debug/accuracy_tools/api_accuracy_checker/dump/dump.py index a0a771c9b39d9466c5d62f3f097efaa2187bc0ac..25ff7029e592906d016989c4e97c8164fc68ce2b 100644 --- a/debug/accuracy_tools/api_accuracy_checker/dump/dump.py +++ b/debug/accuracy_tools/api_accuracy_checker/dump/dump.py @@ -16,13 +16,12 @@ """ import os import time - import torch.distributed as dist from api_accuracy_checker.dump.api_info import 
ForwardAPIInfo, BackwardAPIInfo from api_accuracy_checker.dump.info_dump import write_api_info_json, initialize_output_json from api_accuracy_checker.common.utils import print_error_log, CompareException, print_info_log, \ - get_tensor_rank, logger, Const + get_tensor_rank, logger, Const, WarningManager from api_accuracy_checker.hook_module.register_hook import initialize_hook from api_accuracy_checker.common.config import msCheckerConfig @@ -49,6 +48,8 @@ def check_dataloader_status(): def start(): + wm = WarningManager() + wm.warn(message=Const.VERSION_MESSAGE, enable_warnings=True) check_dataloader_status() if not DumpUtil.get_dump_switch(): DumpUtil.incr_iter_num_maybe_exit() diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/multi_run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/multi_run_ut.py index 760e088eb38a26ba01fd25ac579130240a76a9e8..df6c99a567c98056e2ee725f77b6b0922c034aac 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/multi_run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/multi_run_ut.py @@ -13,7 +13,7 @@ from ptdbg_ascend.src.python.ptdbg_ascend.common.file_check_util import FileChec check_file_suffix, check_link, FileOpen from api_accuracy_checker.compare.compare import Comparator from api_accuracy_checker.run_ut.run_ut import _run_ut_parser, get_validated_result_csv_path, get_validated_details_csv_path, preprocess_forward_content -from api_accuracy_checker.common.utils import print_error_log, print_warn_log, print_info_log, create_directory +from api_accuracy_checker.common.utils import print_error_log, print_warn_log, print_info_log, create_directory, Const, WarningManager from ptdbg_ascend.src.python.ptdbg_ascend.common.utils import check_path_before_create @@ -179,4 +179,6 @@ def main(): run_parallel_ut(config) if __name__ == '__main__': + wm = WarningManager() + wm.warn(message=Const.VERSION_MESSAGE, enable_warnings=True) main() diff --git a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py index 4f5a85e2d64e9e80ea1098d078d0df4304b24cfb..9c1c3073a219f7962b2b97516465281d67b9bafa 100644 --- a/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/run_ut/run_ut.py @@ -19,7 +19,7 @@ from tqdm import tqdm from api_accuracy_checker.run_ut.data_generate import gen_api_params, gen_args from api_accuracy_checker.run_ut.run_ut_utils import Backward_Message, hf_32_standard_api from api_accuracy_checker.common.utils import print_info_log, print_warn_log, get_json_contents, api_info_preprocess, \ - print_error_log, initialize_save_path, Const, create_directory, Const + print_error_log, initialize_save_path, Const, create_directory, WarningManager from api_accuracy_checker.compare.compare import Comparator from api_accuracy_checker.compare.compare_column import CompareColumn from api_accuracy_checker.compare.compare_utils import CompareConst @@ -584,5 +584,7 @@ class UtAPIInfo(APIInfo): if __name__ == '__main__': + wm = WarningManager() + wm.warn(message=Const.VERSION_MESSAGE, enable_warnings=True) _run_ut() print_info_log("UT task completed.") diff --git a/debug/accuracy_tools/api_accuracy_checker/test/run_ut.py b/debug/accuracy_tools/api_accuracy_checker/test/run_ut.py index c73949697941d84782c4983aa484c06b1a7cbcc2..de6358f49ae9612f5c560d7efa120e03c2794a3d 100644 --- a/debug/accuracy_tools/api_accuracy_checker/test/run_ut.py +++ b/debug/accuracy_tools/api_accuracy_checker/test/run_ut.py @@ -13,7 
+13,7 @@ def run_ut(): if os.path.exists(report_dir): shutil.rmtree(report_dir) - os.makedirs(report_dir) + os.makedirs(report_dir, mode=0o750) cmd = ["python3", "-m", "pytest", ut_path, "--junitxml=" + report_dir + "/final.xml", "--cov=" + src_dir, "--cov-branch", "--cov-report=xml:" + report_dir + "/coverage.xml"] diff --git a/debug/accuracy_tools/api_checker/api_mapping.json b/debug/accuracy_tools/api_checker/api_mapping.json deleted file mode 100644 index f41b2b0f8d00ca603c55565ee0e8d853663e0485..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/api_mapping.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "Sqrt": "sqrt", - "Square": "square", - "Relu": "relu", - "Mul": "mul", - "Reshape": "reshape", - "ExpandDims": "expanddims", - "IsFinite": "isfinite", - "Cos": "cos", - "Transponse": "permute", - "ReadDiv": "true_divide", - "FloorDiv": "floor_divide", - "Neg": "neg", - "Reciprocal": "reciprocal", - "LogicalNot": "logical_not", - "Ceil": "ceil", - "Exp": "exp", - "Pow": "pow", - "Log": "log", - "Select": "where", - "Tile": "tile", - "Rsqrt": "rsqrt" -} \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/common/json_parser.py b/debug/accuracy_tools/api_checker/common/json_parser.py deleted file mode 100644 index ee163192d882827256a6db5dac872b1691b27533..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/common/json_parser.py +++ /dev/null @@ -1,19 +0,0 @@ -import json -import ast - -def parse_value(value): - try: - parsed_value = ast.literal_eval(value) - except (ValueError, SyntaxError): - parsed_value = value - return parsed_value - -def convert_json(json_obj): - if isinstance(json_obj, dict): - return {k: convert_json(v) for k, v in json_obj.items()} - elif isinstance(json_obj, list): - return [convert_json(elem) for elem in json_obj] - elif isinstance(json_obj, str): - return parse_value(json_obj) - else: - return json_obj diff --git a/debug/accuracy_tools/api_checker/common/logger.py b/debug/accuracy_tools/api_checker/common/logger.py deleted file mode 100644 index 5d5016af72718a32cface7ece7b2c7e5348fc687..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/common/logger.py +++ /dev/null @@ -1,30 +0,0 @@ -import logging -from datetime import datetime - -class SingletonLogger: - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super(SingletonLogger, cls).__new__(cls) - cls._instance._initialize_logger() - return cls._instance - - def _initialize_logger(self): - self.logger = logging.getLogger("singleton_logger") - handler = logging.StreamHandler() - handler.setFormatter(CustomFormatter()) - self.logger.addHandler(handler) - self.logger.setLevel(logging.DEBUG) - - def get_logger(self): - return self.logger - -class CustomFormatter(logging.Formatter): - def format(self, record): - level = record.levelname - time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - message = record.getMessage() - return f"[{level}] {time} - {message}" - -logger = SingletonLogger().get_logger() \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/common/utils.py b/debug/accuracy_tools/api_checker/common/utils.py deleted file mode 100644 index 4012021a63cd6eb5aa8d43f18166ac1a49f7e719..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/common/utils.py +++ /dev/null @@ -1,202 +0,0 @@ -import os -import json -import csv -import sys -import time -import stat -import inspect - -import numpy as np -import torch - -class Const: - 
""" - Class for const - """ - DIRECTORY_LENGTH = 4096 - FILE_NAME_LENGTH = 255 - FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$' - MODEL_TYPE = ['.onnx', '.pb', '.om'] - SEMICOLON = ";" - COLON = ":" - EQUAL = "=" - COMMA = "," - DOT = "." - DUMP_RATIO_MAX = 100 - SUMMERY_DATA_NUMS = 256 - ONE_HUNDRED_MB = 100 * 1024 * 1024 - FLOAT_EPSILON = np.finfo(float).eps - SUPPORT_DUMP_MODE = ['api', 'acl'] - ON = 'ON' - OFF = 'OFF' - BACKWARD = 'backward' - FORWARD = 'forward' - FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble, np.float32, np.float16] - BOOL_TYPE = [bool, np.uint8] - INT_TYPE = [np.int32, np.int64] - - INPUT = "input" - OUTPUT = "output" - - # dump mode - ALL = "all" - LIST = "list" - RANGE = "range" - STACK = "stack" - ACL = "acl" - API_LIST = "api_list" - API_STACK = "api_stack" - DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] - - WRITE_FLAGS = os.O_WRONLY | os.O_CREAT - WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR - - RAISE_PRECISION = { - torch.float16: torch.float32, - torch.bfloat16: torch.float32, - torch.float32: torch.float64 - } - CONVERT = { - "int32_to_int64": ["torch.int32", "torch.int64"], - } - - CONVERT_API = { - "int32_to_int64": ["cross_entropy"] - } - - -def _print_log(level, msg): - current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) - pid = os.getgid() - print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg) - sys.stdout.flush() - -def print_info_log(info_msg): - """ - Function Description: - print info log. - Parameter: - info_msg: the info message. - """ - _print_log("INFO", info_msg) - - -def print_error_log(error_msg): - """ - Function Description: - print error log. - Parameter: - error_msg: the error message. - """ - _print_log("ERROR", error_msg) - -def print_warn_log(warn_msg): - """ - Function Description: - print warn log. - Parameter: - warn_msg: the warning message. - """ - _print_log("WARNING", warn_msg) - - -def get_json_contents(file_path): - with open(file_path, "r") as f: - ops = f.read() - #ops = get_file_content_bytes(file_path) - try: - json_obj = json.loads(ops) - except ValueError as error: - print_error_log('Failed to load "%s". %s' % (file_path, str(error))) - #raise CompareException(CompareException.INVALID_FILE_ERROR) from error - if not isinstance(json_obj, dict): - print_error_log('Json file %s, content is not a dictionary!' % file_path) - #raise CompareException(CompareException.INVALID_FILE_ERROR) - return json_obj - - -def write_csv(data, filepath): - with open(filepath, 'a', encoding='utf-8-sig') as f: - writer = csv.writer(f) - writer.writerow(data) - - -def check_file_or_directory_path(path, isdir=False): - """ - Function Description: - check whether the path is valid - Parameter: - path: the path to check - isdir: the path is dir or file - Exception Description: - when invalid data throw exception - """ - if isdir: - if not os.path.exists(path): - print_error_log('The path {} is not exist.'.format(path)) - #raise CompareException(CompareException.INVALID_PATH_ERROR) - - if not os.path.isdir(path): - print_error_log('The path {} is not a directory.'.format(path)) - #raise CompareException(CompareException.INVALID_PATH_ERROR) - - if not os.access(path, os.W_OK): - print_error_log( - 'The path {} does not have permission to write. 
Please check the path permission'.format(path)) - #raise CompareException(CompareException.INVALID_PATH_ERROR) - else: - if not os.path.isfile(path): - print_error_log('{} is an invalid file or non-exist.'.format(path)) - #raise CompareException(CompareException.INVALID_PATH_ERROR) - - if not os.access(path, os.R_OK): - print_error_log( - 'The path {} does not have permission to read. Please check the path permission'.format(path)) - #raise CompareException(CompareException.INVALID_PATH_ERROR) - - -def get_stack(): - stack_str = [] - try: - for (_, path, line, func, code, _) in inspect.stack()[3:]: - if code: - stack_line = [path, str(line), func, code[0].strip() if code else code] - else: - stack_line = [path, str(line), func, code] - stack_str.append(stack_line) - except Exception as e: - print("Dump stack info failed, error: {}".format(e)) - stack_str.append('') - return stack_str - - -dtype_map = { - "Float32": np.float32, - "Float16": np.float16, - "Float64": np.float64, - "Int8": np.int8, - "Int16": np.int16, - "Int32": np.int32, - "Int64": np.int64, - "Bool_": np.bool_, - "Uint8": np.uint8, - "Uint16": np.uint16, - "Uint32": np.uint32, - "Uint64": np.uint64, - "Bool": np.bool_, - "Complex64": np.complex64, - "Complex128": np.complex128 -} - - -np_scalar_type = [ - bool, - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, -] \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/compare/compare.py b/debug/accuracy_tools/api_checker/compare/compare.py deleted file mode 100644 index 4688bb8d3476201ee0f0f65dd3d456509ee12b5b..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/compare/compare.py +++ /dev/null @@ -1,76 +0,0 @@ -import os -from datetime import datetime, timezone -from compare.algorithm import CompareColumn, compare_float_tensor, compare_bool_tensor -from common.utils import get_json_contents, write_csv, np_scalar_type - - -class Comparator: - # consts for result csv - RESULT_CSV_PATH = "_result.csv" - DETAILS_CSV_PATH = "_detail.csv" - - def __init__(self, outpath, is_continue_run_ut, stack_info_json_path=None): - time = datetime.now(tz=timezone.utc).strftime("%Y%m%d%H%M%S") - self.save_path = os.path.join(outpath, time + self.RESULT_CSV_PATH) - self.detail_save_path = os.path.join(outpath, time + self.DETAILS_CSV_PATH) - if not is_continue_run_ut and not os.path.exists(self.save_path) and not os.path.exists(self.detail_save_path): - self.write_csv_title() - if stack_info_json_path: - self.stack_info = get_json_contents(stack_info_json_path) - else: - self.stack_info = None - - self.test_result_cnt = { - "forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, "success_num": 0, - "total_num": 0, "forward_or_backward_fail_num": 0 - } - - def compare(self, bench_out, npu_out, api_name): - compareColumn = CompareColumn() - compareColumn.bench_type = bench_out.dtype - compareColumn.npu_type = npu_out.dtype - compareColumn.shape = npu_out.shape - if npu_out.dtype in np_scalar_type: - err_rate, status, message = compare_bool_tensor(bench_out, npu_out) - compareColumn.err_rate = err_rate - else: - status, compareColumn, message = compare_float_tensor(bench_out, npu_out, compareColumn) - result_list = [api_name, status] - write_csv(result_list, self.save_path) - detail_list = [api_name] - detail_temp = compareColumn.to_column_value(status, message) - for detail in detail_temp: - detail_list.append(detail) - write_csv(detail_list, self.detail_save_path) - - def 
write_result_csv(self, detail_dict): - result_list = [detail_dict["api_name"], detail_dict["status"]] - write_csv(result_list, self.save_path) - - def write_csv_title(self): - result_test_rows = [ - "API name", - "Forward Test Success", - "Backward Test Success", - "Message" - ] - write_csv(result_test_rows, self.save_path) - - detail_test_rows = [ - "API Name", "Bench Dtype", "NPU Dtype", "Shape", - "余弦相似度", - "最大绝对误差", - "双百指标", - "双千指标", - "双万指标", - "错误率", - "误差均衡性", - "均方根误差", - "小值域错误占比", - "相对误差最大值", - "相对误差平均值", - "Status", - "Message" - ] - write_csv(detail_test_rows, self.detail_save_path) - diff --git a/debug/accuracy_tools/api_checker/compare/compare_utils.py b/debug/accuracy_tools/api_checker/compare/compare_utils.py deleted file mode 100644 index 6dea015d7bf3d2e663e455c985df569292566473..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/compare/compare_utils.py +++ /dev/null @@ -1,50 +0,0 @@ -from common.utils import Const, print_warn_log -import numpy as np - - -class CompareConst: - NAN = np.nan - NA = "N/A" - PASS = 'pass' - WARNING = 'warning' - ERROR = 'error' - SKIP = 'SKIP' - TRUE = 'TRUE' - FALSE = 'FALSE' - - -def check_dtype_comparable(x, y): - if x.dtype in Const.FLOAT_TYPE: - if y.dtype in Const.FLOAT_TYPE: - return True - return False - if x.dtype in Const.BOOL_TYPE: - if y.dtype in Const.BOOL_TYPE: - return True - return False - if x.dtype in Const.INT_TYPE: - if y.dtype in Const.INT_TYPE: - return True - return False - print_warn_log(f"Compare: Unexpected dtype {x.dtype}, {y.dtype}") - return False - - -precision_configs = { - 'float16' : { - 'small_value' : [ - 1e-3 - ], - 'small_value_atol' : [ - 1e-5 - ] - }, - 'float32':{ - 'small_value' : [ - 1e-6 - ], - 'small_value_atol' : [ - 1e-9 - ] - } -} diff --git a/debug/accuracy_tools/api_checker/main.py b/debug/accuracy_tools/api_checker/main.py deleted file mode 100644 index 86d2f4a5959c24f7b997494c02225c53ded4b0b7..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/main.py +++ /dev/null @@ -1,50 +0,0 @@ -import argparse -import json -import os -from common.logger import logger - - -def main(): - parser = argparse.ArgumentParser(description='Generate JSON based on user input') - - parser.add_argument('--mode', type=int, default=0, help='Dump mode (0 for all data and 1 for select api)') - parser.add_argument('--out', type=str, default="./data", help='Dump data output dir') - parser.add_argument('--net_name', type=str, default="MyNet", help='Network name (e.g., MyNet)') - parser.add_argument('--iteration', type=str, default="0", help='Iteration range (e.g., "0|5-8|100-120")') - parser.add_argument('--saved_data', type=str, default="tensor", help='Saved data type ("tensor" for dump tensor and "statistic" for statistic data)') - parser.add_argument('--input_output', type=int, default="0", help='Input output flag (0 for all and 1 for input and 2 for output)') - parser.add_argument('--kernels', type=str, nargs='+', default="", help='List of selected kernels only valid when mode is 1(e.g., "Default/Conv-op12")') - parser.add_argument('--support_device', type=int, nargs='+', required=True, help='List of supported devices (e.g., 0 1 2 3 4 5 6 7)') - parser.add_argument('--e2e_enable', type=bool, default=True, help='Enable end-to-end dump (true/false)') - parser.add_argument('--e2e_trans_flag', type=bool, default=True, help='End-to-end trans flag (true/false)') - parser.add_argument('--output_dir', type=str, default="./", help='Output JSON file path') - - args = 
parser.parse_args() - - json_data = { - "common_dump_settings": { - "dump_mode": args.mode, - "path": os.path.realpath(args.out), - "net_name": args.net_name, - "iteration": args.iteration, - "saved_data": args.saved_data, - "input_output": args.input_output, - "kernels": list(args.kernels), - "support_device": list(args.support_device), - "op_debug_mode": 0, - "file_format": "npy" - }, - "e2e_dump_settings": { - "enable": args.e2e_enable, - "trans_flag": args.e2e_trans_flag, - "save_kernel_args": True - } - } - dump_json_path = os.path.realpath(args.output_dir) - output_json_path = os.path.join(args.output_dir, "dump.json") - with open(output_json_path, 'w') as f: - json.dump(json_data, f, indent=4) - - logger.info(f"JSON data saved to {output_json_path}") -if __name__ == '__main__': - main() diff --git a/debug/accuracy_tools/api_checker/run_ut.py b/debug/accuracy_tools/api_checker/run_ut.py deleted file mode 100644 index 692dda30978c85fb3a414f7248089f6637e95631..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/run_ut.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import sys -import json -import importlib -import inspect -import argparse -from collections import defaultdict -import numpy as np -import pandas as pd -from compare.compare import Comparator -from common.logger import logger -from common.json_parser import convert_json - - -def _run_ut_parser(parser): - parser.add_argument( - "-i", "--input", dest="input_file", required=True, type=str, - help=" Input josn file containing the API information" - ) - parser.add_argument( - "-o", "--output", dest="output_path", required=False, type=str, - help=" Output path to store the comparison result" - ) - - -def get_ops_ut(module): - for name, obj in inspect.getmembers(module): - if inspect.isclass(obj) and name.endswith("UT"): - return obj - - -def load_json(file_path): - with open(file_path, 'r') as f: - return json.load(f) - - -def load_npy(file_path): - return np.load(file_path) - - -def _run_ut(): - parser = argparse.ArgumentParser() - _run_ut_parser(parser) - args = parser.parse_args(sys.argv[1:]) - out_path = os.path.realpath(args.output_path) if args.output_path else "./" - data_path = os.path.realpath(args.input_file) - comparator = Comparator(out_path, False) - cur_path = os.path.dirname(os.path.realpath(__file__)) - api_mapping_path = os.path.join(cur_path, "api_mapping.json") - - with open(api_mapping_path, 'r') as f: - api_mapping_dict = json.load(f) - - - file_groups = defaultdict(lambda: {'json': None, 'args': [], 'output': []}) - - for dirpath, _, filenames in os.walk(data_path): - for filename in filenames: - file_path = os.path.join(dirpath, filename) - base_name, ext = os.path.splitext(filename) - if ext == ".csv": - continue - real_name = base_name - if real_name.isdigit(): - mapping_csv_path = os.path.join(dirpath, "mapping.csv") - mapping_df = pd.read_csv(mapping_csv_path, header=None, names=['filename', 'realname']) - mapping_col = mapping_df[mapping_df['filename'] == filename] - real_name = mapping_col['realname'].tolist()[0] - - parts = real_name.split('.') - op_type = parts[0] - op_name = parts[1] - file_groups[op_name]['type'] = op_type - if ext == ".json": - file_groups[op_name]['json'] = file_path - elif ext == ".npy": - if 'input' in real_name: - file_groups[op_name]['args'].append(file_path) - elif 'output' in real_name: - file_groups[op_name]['output'].append(file_path) - - for op_name, files in file_groups.items(): - json_path = files['json'] - input_paths = files['args'] - 
output_paths = files['output'] - op_type = files['type'] - if op_type in api_mapping_dict: - module_name = "common_ut" - api_name = op_type + "_" + api_mapping_dict[op_type] + "_" + op_name - else: - module_name = op_type + "_ut" - api_name = op_name - - if not os.path.exists(f"ut_case/{module_name}.py"): - logger.warning(f"{op_type} not support compare now") - continue - - if json_path: - kwargs = convert_json(load_json(json_path)) - args = [load_npy(npy_path) for npy_path in sorted(input_paths)] - output = [load_npy(npy_path) for npy_path in sorted(output_paths)] - module = importlib.import_module(f"ut_case.{module_name}") - if not module: - logger.warning(f"load {module} failed") - continue - - ops_ut = get_ops_ut(module) - try: - ops_ut( - api_name, - args, - kwargs, - output, - real_data=True, - stack=None, - comparator=comparator - ).compare() - except Exception as e: - logger.warning(f">>>[{op_name}] Compare failed.Reason: {e}") - - -if __name__ == '__main__': - _run_ut() \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/start.sh b/debug/accuracy_tools/api_checker/start.sh deleted file mode 100644 index f2ae9b44450d8eec37b5becb6685a5ec3bf81203..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/start.sh +++ /dev/null @@ -1,2 +0,0 @@ -CUR_DIR=$(dirname "$(readlink -f "$0")") -export MINDSPORE_DUMP_CONFIG=${CUR_DIR}/dump.json \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_base.py b/debug/accuracy_tools/api_checker/ut_base.py deleted file mode 100644 index 305678790ab7902d5ad49684c16462033046e395..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_base.py +++ /dev/null @@ -1,79 +0,0 @@ -import numpy as np -import os -import mindspore as ms -import torch -from common.utils import Const, dtype_map -from collections import deque -from collections import defaultdict - - -class UTBase: - def __init__(self, name, args, kwargs, output=None, real_data=False, stack=None, comparator=None): - self.name = name - self.args = args - self.kwargs = kwargs - self.output = output - self.real_data = real_data - self.stack = stack - self.comparator = comparator - - @staticmethod - def convert_list_to_tuple(data): - for key, value in data.items(): - if not isinstance(value, list): - continue - data[key] = tuple(value) - return data - - def insert_into_dict(self, dic, keys, value): - key = keys.pop(0) - if not keys: - if key in dic: - dic[key].append(value) - else: - dic[key] = [value] - else: - if key not in dic: - dic[key] = defaultdict(list) - self.insert_into_dict(dic[key], keys, value) - - def forward_mindspore_impl(self, *input): - pass - - def forward_pytorch_impl(self, *input): - pass - - def forward_cmp_real(self): - data_input = self.args - input_pt = [] - for data in data_input: - if data.shape: - input_pt.append(torch.from_numpy(data)) - else: - origin_data = data.item() - if data.dtype == np.int64: - origin_data = int(origin_data) - elif data.dtype == np.float64: - origin_data = float(origin_data) - input_pt.append(origin_data) - - output_pt = self.forward_pytorch_impl(*input_pt) - - if isinstance(output_pt, torch.Tensor): - output_ms = self.output[0] - output_pt = output_pt.numpy() - self.comparator.compare(output_pt, output_ms, self.name + "." 
+ Const.OUTPUT) - else: - for index_output, (output_p, output_ms) in enumerate(zip(output_pt, self.output)): - output_p = output_p.numpy() - output_ms = np.load(os.path.join(self.save_path, output_ms)) - self.comparator.compare(output_p, output_ms, self.name + "." + Const.OUTPUT + "." + str(index_output)) - - def forward_cmp_random(self): - pass - - def compare(self): - if self.real_data: - self.forward_cmp_real() - else: - self.forward_cmp_random() \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/Add_ut.py b/debug/accuracy_tools/api_checker/ut_case/Add_ut.py deleted file mode 100644 index 02915b0ebda8d2c095024599b05cf4c6c7a31583..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/Add_ut.py +++ /dev/null @@ -1,37 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase -from common.logger import logger - - -class Add(Cell): - def __init__(self): - super().__init__() - self.add = P.Add() - - def construct(self, input_x, input_y): - return self.add(input_x, input_y) - - -class AddUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = Add() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - output = torch.add(input_pt_x, input_pt_y) - if output.dtype == torch.bfloat16: - return output.float() - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/ArgMaxWithValue_ut.py b/debug/accuracy_tools/api_checker/ut_case/ArgMaxWithValue_ut.py deleted file mode 100644 index 993b5e2cf170f72eced527da7ace93749249446c..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/ArgMaxWithValue_ut.py +++ /dev/null @@ -1,33 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase -from common.logger import logger - - -class ArgMaxWithValue(Cell): - def __init__(self, axis=0, keep_dims=False): - super().__init__() - self.argmaxwithvalue = P.ArgMaxWithValue(axis, keep_dims) - - def construct(self, input_x): - return self.argmaxwithvalue(input_x) - - -class ArgMaxWithValueUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - len_args = len(args) - self.axis = self.kwargs.get("axis") if self.kwargs else 0 - self.keep_dims = self.kwargs.get("keep_dims") if self.kwargs else False - def forward_mindspore_impl(self, *args): - x = args[0] - net = ArgMaxWithValue(self.axis, self.keep_dims) - out = net(x) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - value, index = torch.max(input_pt_x, self.axis, self.keep_dims) - return (index, value) \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/AssignAdd_ut.py b/debug/accuracy_tools/api_checker/ut_case/AssignAdd_ut.py deleted file mode 100644 index a6d80c4febdd053456de084fb225790d413a3c92..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/AssignAdd_ut.py +++ 
/dev/null @@ -1,35 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase -from common.logger import logger - - -class AssignAdd(Cell): - def __init__(self): - super().__init__() - self.assignadd = P.AssignAdd() - - def construct(self, variable, value): - return self.assignadd(variable, value) - - -class AssignAddUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = AssignAdd() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - variable = args[0] - value = args[1] - if not isinstance(value, torch.Tensor): - value = torch.Tensor(value) - out = torch.add(variable, value) - return out \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/Assign_ut.py b/debug/accuracy_tools/api_checker/ut_case/Assign_ut.py deleted file mode 100644 index f1a49fdc1e16929be2ea07c07810f62bde80abc3..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/Assign_ut.py +++ /dev/null @@ -1,33 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase -from common.logger import logger - - -class Assign(Cell): - def __init__(self): - super().__init__() - self.assign = P.Assign() - - def construct(self, variable, value): - return self.assign(variable, value) - - -class AssignUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - variable = args[0] - value = args[1] - net = Assign() - out = net(variable, value) - return out - - def forward_pytorch_impl(self, *args): - variable = args[0] - value = args[1] - out = variable.copy(value) - return out \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/BatchMatMul_ut.py b/debug/accuracy_tools/api_checker/ut_case/BatchMatMul_ut.py deleted file mode 100644 index 4a3587017782761064cff9e58e4e501db16d36e4..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/BatchMatMul_ut.py +++ /dev/null @@ -1,42 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -from mindspore.common import dtype as mstype -import torch -from ut_base import UTBase - - -class BatchMatMul(Cell): - def __init__(self, transponse_a=False, transponse_b=False): - super().__init__() - self.batchmatmul = P.BatchMatMul(transponse_a, transponse_b) - - def construct(self, input_x, input_y): - return self.batchmatmul(input_x, input_y) - -class BatchMatMulUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - self.transpose_a = self.kwargs.get("transpose_a") if self.kwargs else False - self.transpose_b = self.kwargs.get("transpose_b") if self.kwargs else False - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = BatchMatMul(self.transpose_a, self.transpose_b) - out = net(x, y) - if out.dtype == mstype.bfloat16: - return out.float() - return out - - def forward_pytorch_impl(self, *args): - 
input_pt_x = args[0] - input_pt_y = args[1] - if self.transpose_a: - input_pt_x = torch.transpose(input_pt_x, -1, -2) - if self.transpose_b: - input_pt_y = torch.transpose(input_pt_y, -1, -2) - output = torch.matmul(input_pt_x, input_pt_y) - if output.dtype == torch.bfloat16: - return output.float() - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/BroadcastTo_ut.py b/debug/accuracy_tools/api_checker/ut_case/BroadcastTo_ut.py deleted file mode 100644 index d2de14b53060748a2a771ab6a52cdbf075ed690f..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/BroadcastTo_ut.py +++ /dev/null @@ -1,30 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class BroadcastTo(Cell): - def __init__(self, shape): - super().__init__() - self.broadcastto = P.BroadcastTo(shape) - - def construct(self, input_x): - return self.broadcastto(input_x) - -class BroadcastToUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - self.shape = self.kwargs.get("shape") - - def forward_mindspore_impl(self, *args): - x = args[0] - net = BroadcastTo(self.shape) - out = net(x) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - output = torch.broadcast_to(input_pt_x, self.shape) - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/Concat_ut.py b/debug/accuracy_tools/api_checker/ut_case/Concat_ut.py deleted file mode 100644 index ccc83cc375cd7b5eb59c67133413050d6fe9e071..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/Concat_ut.py +++ /dev/null @@ -1,30 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class Concat(Cell): - def __init__(self, axis=0): - super().__init__() - self.concat = P.Concat(axis) - - def construct(self, input_x): - return self.concat(input_x) - -class ConcatUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - self.axis = self.kwargs.get("axis") if self.kwargs else 0 - - def forward_mindspore_impl(self, *args): - x = args[0] - net = Concat(self.axis) - out = net(x) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - output = torch.cat(input_pt_x, self.axis) - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/Conv2D_ut.py b/debug/accuracy_tools/api_checker/ut_case/Conv2D_ut.py deleted file mode 100644 index b84e841d062341a26a2d1a92d07e5e65c7c93f51..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/Conv2D_ut.py +++ /dev/null @@ -1,119 +0,0 @@ -import numpy as np -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -import torch.nn as nn -import torch.nn.functional as F -from ut_base import UTBase - -class Conv2D(Cell): - def __init__(self, out_channel, kernel_size, mode=1, pad_mode="valid", pad=0, stride=1, dilation=1, group=1, data_format="NCHW"): - super(Conv2D, self).__init__() - self.conv2d = P.Conv2D(out_channel, kernel_size, mode, pad_mode, pad, stride, dilation, 
group, data_format) - - def construct(self, input_x, weight): - return self.conv2d(input_x, weight) - -class Conv2DPytorch(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False): - super(Conv2DPytorch, self).__init__() - self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, - kernel_size=kernel_size, stride=self._ensure_tuple(stride, kernel_size), - padding=0, dilation=self._ensure_tuple(dilation, kernel_size), groups=groups, bias=bias) - self.padding_torch = self._to_padding(padding) - self.flag = int(any(self.padding_torch)) - - def forward(self, x): - if self.flag: - x = F.pad(x, self.padding_torch) - return self.conv(x) - - def _ensure_tuple(self, value, ref_value): - if isinstance(value, int): - return (value,) * len(ref_value) - if isinstance(value, (tuple, list)) and len(value) == len(ref_value): - return tuple(value) - elif isinstance(value, (tuple, list)) and len(value) == 4: - return (value[2], value[3]) - raise ValueError(f"Invalid value for conversion to tuple: {value}") - - def _to_padding(self, padding): - if isinstance(padding, int): - return [padding] * 4 - if isinstance(padding, (tuple, list)) and len(padding) == 2: - return [padding[0], padding[0], padding[1], padding[1]] - if isinstance(padding, (tuple, list)) and len(padding) == 4: - return list(padding) - raise ValueError(f"Invalid padding value: {padding}") - -class Conv2DUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - input_shape = self.args[0].shape - weight_shape = self.args[1].shape - - self.out_channel = self.kwargs.get("out_channel", self.args[0]) - self.kernel_size = self.kwargs.get("kernel_size", self.args[1]) - self.mode = self.kwargs.get("mode", 1) - self.pad_mode = self.kwargs.get("pad_mode", "valid") - self.pad = self.kwargs.get("pad", 0) - self.stride = self.kwargs.get("stride", 1) - self.dilation = self.kwargs.get("dilation", 1) - self.group = self.kwargs.get("group", 1) - self.data_format = self.kwargs.get("format", "NCHW") - - if self.data_format == "NCHW": - self.in_n, self.in_c, self.in_h, self.in_w = input_shape - self.out_c, _, self.kernel_h, self.kernel_w = weight_shape - else: # NHWC - self.in_n, self.in_h, self.in_w, self.in_c = input_shape - self.out_c, self.kernel_h, self.kernel_w, _ = weight_shape - - self.stride = self._ensure_tuple(self.stride, (self.kernel_h, self.kernel_w)) - self.dilation = self._ensure_tuple(self.dilation, (self.kernel_h, self.kernel_w)) - - if self.pad_mode == 'same': - self.padding_torch = self._compute_same_padding(self.in_h, self.in_w, self.kernel_h, self.kernel_w, self.stride, self.dilation) - else: - self.padding_torch = [self.pad] * 4 - - def _ensure_tuple(self, value, ref_value): - if isinstance(value, int): - return (value,) * len(ref_value) - if isinstance(value, (tuple, list)) and len(value) == len(ref_value): - return tuple(value) - raise ValueError(f"Invalid value for conversion to tuple: {value}") - - def _compute_same_padding(self, h, w, kh, kw, stride, dilation): - def compute_pad(dim, kernel, stride, dilation): - pad = max((dim + stride - 1) // stride * stride - dim + (kernel - 1) * dilation, 0) - return pad // 2, pad - pad // 2 - pad_h = compute_pad(h, kh, stride[0], dilation[0]) - pad_w = compute_pad(w, kw, stride[1], dilation[1]) - return [pad_w[0], pad_w[1], pad_h[0], pad_h[1]] - - def forward_mindspore_impl(self, 
*args): - x = ms.Tensor(args[0]) - weight = ms.Tensor(args[1]) - net = Conv2D(self.out_channel, self.kernel_size, self.mode, self.pad_mode, self.pad, self.stride, self.dilation, self.group, self.data_format) - out = net(x, weight) - return out.asnumpy() - - def forward_pytorch_impl(self, *args): - x, weight = args - if self.data_format == 'NHWC': - x = x.permute(0, 3, 1, 2) - weight = weight.permute(0, 3, 1, 2) - - net = Conv2DPytorch(in_channels=self.in_c, out_channels=self.out_c, - kernel_size=(self.kernel_h, self.kernel_w), - stride=self.stride, padding=self.padding_torch, - dilation=self.dilation, groups=self.group, bias=False) - - net.conv.weight = nn.Parameter(weight) - output = net(x) - if self.data_format == 'NHWC': - output = output.permute(0, 2, 3, 1) - return output.detach() diff --git a/debug/accuracy_tools/api_checker/ut_case/Equal_ut.py b/debug/accuracy_tools/api_checker/ut_case/Equal_ut.py deleted file mode 100644 index 6c651202b51d3f951e1f34ef486ceaf810da66ec..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/Equal_ut.py +++ /dev/null @@ -1,33 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class Equal(Cell): - def __init__(self): - super().__init__() - self.equal = P.Equal() - - def construct(self, input_x, input_y): - return self.equal(input_x, input_y) - -class EqualUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = Equal() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - out = torch.eq(input_pt_x, input_pt_y) - return out \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/GreaterEqual_ut.py b/debug/accuracy_tools/api_checker/ut_case/GreaterEqual_ut.py deleted file mode 100644 index 99e4172239e1b95bd0fccc133f09fb7711b6ace0..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/GreaterEqual_ut.py +++ /dev/null @@ -1,33 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class GreaterEqual(Cell): - def __init__(self): - super().__init__() - self.greaterequal = P.GreaterEqual() - - def construct(self, input_x, input_y): - return self.greaterequal(input_x, input_y) - -class GreaterEqualUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = GreaterEqual() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - output = torch.ge(input_pt_x, input_pt_y) - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/Greater_ut.py b/debug/accuracy_tools/api_checker/ut_case/Greater_ut.py deleted file mode 100644 index 20f699e3dd1bdd7c6c2bfe4bd3563d981aeb704e..0000000000000000000000000000000000000000 --- 
a/debug/accuracy_tools/api_checker/ut_case/Greater_ut.py +++ /dev/null @@ -1,33 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class Greater(Cell): - def __init__(self): - super().__init__() - self.greater = P.Greater() - - def construct(self, input_x, input_y): - return self.greater(input_x, input_y) - -class GreaterUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = Greater() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - out = torch.gt(input_pt_x, input_pt_y) - return out \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/LessEqual_ut.py b/debug/accuracy_tools/api_checker/ut_case/LessEqual_ut.py deleted file mode 100644 index 7d0a2222e8114a9286ee56d611b636bdee0a85bf..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/LessEqual_ut.py +++ /dev/null @@ -1,33 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class LessEqual(Cell): - def __init__(self): - super().__init__() - self.lessequal = P.LessEqual() - - def construct(self, input_x, input_y): - return self.lessequal(input_x, input_y) - -class LessEqualUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = LessEqual() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - output = torch.le(input_pt_x, input_pt_y) - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/Less_ut.py b/debug/accuracy_tools/api_checker/ut_case/Less_ut.py deleted file mode 100644 index 6d0b456e7c0999c560522f2b7d587ab04f87fca1..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/Less_ut.py +++ /dev/null @@ -1,33 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class Less(Cell): - def __init__(self): - super().__init__() - self.less = P.Less() - - def construct(self, input_x, input_y): - return self.less(input_x, input_y) - -class LessUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = Less() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - output = torch.lt(input_pt_x, input_pt_y) - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/LogicalAnd_ut.py 
b/debug/accuracy_tools/api_checker/ut_case/LogicalAnd_ut.py deleted file mode 100644 index 739bf96fa6a52b341d1a1f3f7efea1d2af218d99..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/LogicalAnd_ut.py +++ /dev/null @@ -1,36 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class LogicalAnd(Cell): - def __init__(self): - super().__init__() - self.logicaland = P.LogicalAnd() - - def construct(self, input_x, input_y): - return self.logicaland(input_x, input_y) - -class LogicalAndUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = LogicalAnd() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - if not isinstance(input_pt_y, torch.Tensor): - input_pt_y = torch.tensor(input_pt_y) - output = torch.logical_and(input_pt_x, input_pt_y) - return output - diff --git a/debug/accuracy_tools/api_checker/ut_case/LogicalOr_ut.py b/debug/accuracy_tools/api_checker/ut_case/LogicalOr_ut.py deleted file mode 100644 index 3dade286528bfe90aa6eb617454321d0a2c7cdb9..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/LogicalOr_ut.py +++ /dev/null @@ -1,36 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class LogicalOr(Cell): - def __init__(self): - super().__init__() - self.logicalor = P.LogicalOr() - - def construct(self, input_x, input_y): - return self.logicalor(input_x, input_y) - -class LogicalOrUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = LogicalOr() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - if not isinstance(input_pt_y, torch.Tensor): - input_pt_y = torch.tensor(input_pt_y) - output = torch.logical_or(input_pt_x, input_pt_y) - return output - diff --git a/debug/accuracy_tools/api_checker/ut_case/MatMul_ut.py b/debug/accuracy_tools/api_checker/ut_case/MatMul_ut.py deleted file mode 100644 index 40373334b2dc8b1a31f8a22e978fcd6dd324e312..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/MatMul_ut.py +++ /dev/null @@ -1,44 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -from mindspore.common import dtype as mstype -import torch -from ut_base import UTBase - - -class MatMul(Cell): - def __init__(self, transponse_a=False, transponse_b=False): - super().__init__() - self.matmul = P.MatMul(transponse_a, transponse_b) - - def construct(self, input_x, input_y): - return self.matmul(input_x, input_y) - - -class MatMulUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, 
comparator) - len_args = len(args) - self.transpose_a = self.kwargs.get("transpose_a") if self.kwargs else False - self.transpose_b = self.kwargs.get("transpose_b") if self.kwargs else False - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = MatMul(self.transpose_a, self.transpose_a) - out = net(x, y) - if out.dtype == mstype.bfloat16: - return out.float().asnumpy() - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if self.transpose_a: - input_pt_x = torch.transpose(input_pt_x, 0, 1) - if self.transpose_b: - input_pt_y = torch.transpose(input_pt_y, 0, 1) - output = torch.matmul(input_pt_x, input_pt_y) - if output.dtype == torch.bfloat16: - return output.float().numpy() - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/Maximum_ut.py b/debug/accuracy_tools/api_checker/ut_case/Maximum_ut.py deleted file mode 100644 index 996fa26f9dff39af6a89ab5c9b2cb018fc1ec1eb..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/Maximum_ut.py +++ /dev/null @@ -1,36 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class Maximum(Cell): - def __init__(self): - super().__init__() - self.maximum = P.Maximum() - - def construct(self, input_x, input_y): - return self.maximum(input_x, input_y) - -class MaximumUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = Maximum() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - if not isinstance(input_pt_y, torch.Tensor): - input_pt_y = torch.tensor(input_pt_y) - output = torch.maximum(input_pt_x, input_pt_y) - return output - diff --git a/debug/accuracy_tools/api_checker/ut_case/Minimum_ut.py b/debug/accuracy_tools/api_checker/ut_case/Minimum_ut.py deleted file mode 100644 index a49f21998a75c1342695b24775b58d076269c9ee..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/Minimum_ut.py +++ /dev/null @@ -1,36 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class Minimum(Cell): - def __init__(self): - super().__init__() - self.minimum = P.Minimum() - - def construct(self, input_x, input_y): - return self.minimum(input_x, input_y) - -class MinimumUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = Minimum() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - if not isinstance(input_pt_y, torch.Tensor): - input_pt_y = torch.tensor(input_pt_y) - output = torch.minimum(input_pt_x, input_pt_y) - return output - diff --git a/debug/accuracy_tools/api_checker/ut_case/NotEqual_ut.py b/debug/accuracy_tools/api_checker/ut_case/NotEqual_ut.py 
deleted file mode 100644 index 5137e39d209d8ffe2dc347fb52802046ceea97d2..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/NotEqual_ut.py +++ /dev/null @@ -1,34 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class NotEqual(Cell): - def __init__(self): - super().__init__() - self.notequal = P.NotEqual() - - def construct(self, input_x, input_y): - return self.notequal(input_x, input_y) - -class NotEqualUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = NotEqual() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if not isinstance(input_pt_x, torch.Tensor): - input_pt_x = torch.tensor(input_pt_x) - output = torch.ne(input_pt_x, input_pt_y) - return output - diff --git a/debug/accuracy_tools/api_checker/ut_case/OneHot_ut.py b/debug/accuracy_tools/api_checker/ut_case/OneHot_ut.py deleted file mode 100644 index 335eff8a03ed25f39b3d00a015a0eb70d48fc601..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/OneHot_ut.py +++ /dev/null @@ -1,49 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class OneHot(Cell): - def __init__(self, axis=0): - super.__init__() - self.onehot = P.OneHot(axis) - - def construct(self, indices, depth, on_value, off_value): - return self.onehot(indices, depth, on_value, off_value) - -class OneHotUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - self.axis = self.kwargs.get("axis") if self.kwargs else -1 - - def forward_mindspore_impl(self, *args): - indices = args[0] - depth = args[1] - on_value = args[2] - off_value = args[3] - net = OneHot(self.axis) - out = net(indices, depth, on_value, off_value) - return out - - def forward_pytorch_impl(self, *args): - indices = args[0] - num_classes = args[1] - on_value = args[2] - off_value = args[3] - dim = indices.ndim - output = torch.nn.functional.one_hot(indices, num_classes) - mask_for_ones = output == 1 - mask_for_zeros = output == 0 - output[mask_for_ones] = on_value - output[mask_for_zeros] = off_value - - dims = tuple(range(dim)) - axis = self.axis - if axis < 0: - axis = axis + dim + 1 - dims = dims[:axis] + (dim,) + dims[axis:] - output = output.permute(dims) - return output - diff --git a/debug/accuracy_tools/api_checker/ut_case/ReLU_ut.py b/debug/accuracy_tools/api_checker/ut_case/ReLU_ut.py deleted file mode 100644 index 92bd0894f4037790e62c10228523fd5648181801..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/ReLU_ut.py +++ /dev/null @@ -1,33 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -import torch.nn as nn -from ut_base import UTBase -from common.logger import logger - - -class ReLU(Cell): - def __init__(self): - super().__init__() - self.relu = P.ReLU() - - def construct(self, input_x): - return self.relu(input_x) - - -class ReLUUT(UTBase): - def __init__(self, name, args, kwargs, output, 
real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - net = ReLU() - out = net(x) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - net = nn.ReLU() - output = net(input_pt_x) - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/ReduceAll_ut.py b/debug/accuracy_tools/api_checker/ut_case/ReduceAll_ut.py deleted file mode 100644 index 391d0af59f2f41ba5e176e8fc8973214feab08b8..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/ReduceAll_ut.py +++ /dev/null @@ -1,35 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase -from common.logger import logger - - -class ReduceAll(Cell): - def __init__(self, axis, keep_dims=False): - super().__init__() - self.reduceall = P.ReduceAll(keep_dims=keep_dims) - self.axis = axis - - def construct(self, x): - return self.reduceall(x, self.axis) - - -class ReduceAllUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - self.keep_dims = self.kwargs.get("keep_dims") if self.kwargs else False - - def forward_mindspore_impl(self, *args): - x = args[0] - axis = args[1] - net = ReduceAll(axis=axis, keep_dims=self.keep_dims) - out = net(x) - return out - - def forward_pytorch_impl(self, *args): - x = args[0] - axis = args[1] - output = torch.all(x, dim=axis, keepdim=self.keep_dims) - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/ReduceMean_ut.py b/debug/accuracy_tools/api_checker/ut_case/ReduceMean_ut.py deleted file mode 100644 index 675efde7b9250200ebdbd228bf054fa47dc25f92..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/ReduceMean_ut.py +++ /dev/null @@ -1,34 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase -from common.logger import logger - - -class ReduceMean(Cell): - def __init__(self, keep_dims=False): - super().__init__() - self.reducemean = P.ReduceMean(keep_dims) - - def construct(self, x, axis): - return self.reducemean(x, axis) - - -class ReduceMeanUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - self.keep_dims = self.kwargs.get("keep_dims") if self.kwargs else False - - def forward_mindspore_impl(self, *args): - x = args[0] - axis = args[1] - net = ReduceMean(self.keep_dims) - out = net(x, axis) - return out - - def forward_pytorch_impl(self, *args): - x = args[0] - axis = args[1] - output = torch.mean(x, axis, self.keep_dims) - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/Squeeze_ut.py b/debug/accuracy_tools/api_checker/ut_case/Squeeze_ut.py deleted file mode 100644 index 5e1bb498d9e5246919307684adfbbdac07efb66d..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/Squeeze_ut.py +++ /dev/null @@ -1,33 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class Squeeze(Cell): 
- def __init__(self, axis=()): - super().__init__() - self.squeeze = P.Squeeze(axis) - - def construct(self, input_x): - return self.squeeze(input_x) - -class SqueezeUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - self.axis = self.kwargs.get("axis") if self.kwargs else () - - def forward_mindspore_impl(self, *args): - x = args[0] - net = Squeeze(self.axis) - out = net(x) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - if self.axis == (): - output = torch.squeeze(input_pt_x) - else: - output = torch.squeeze(input_pt_x, self.axis) - return output \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/Sub_ut.py b/debug/accuracy_tools/api_checker/ut_case/Sub_ut.py deleted file mode 100644 index 14e156fcd822881b8bbb6f6675b6e404c19f62ac..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/Sub_ut.py +++ /dev/null @@ -1,39 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -from ut_base import UTBase - - -class Sub(Cell): - def __init__(self): - super().__init__() - self.sub = P.Sub() - - def construct(self, input_x, input_y): - return self.sub(input_x, input_y) - -class SubUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - def forward_mindspore_impl(self, *args): - x = args[0] - y = args[1] - net = Sub() - out = net(x, y) - return out - - def forward_pytorch_impl(self, *args): - input_pt_x = args[0] - input_pt_y = args[1] - if isinstance(input_pt_x, bool): - input_pt_x = int(input_pt_x) - elif isinstance(input_pt_x, torch.Tensor) and input_pt_x.dtype == torch.bool: - input_pt_x = input_pt_x.int() - if isinstance(input_pt_y, bool): - input_pt_y = int(input_pt_y) - elif isinstance(input_pt_y, torch.Tensor) and input_pt_y.dtype == torch.bool: - input_pt_y = input_pt_y.int() - out = torch.sub(input_pt_x, input_pt_y) - return out \ No newline at end of file diff --git a/debug/accuracy_tools/api_checker/ut_case/common_ut.py b/debug/accuracy_tools/api_checker/ut_case/common_ut.py deleted file mode 100644 index c5377248c305d6d3ff082274921f2b83221d3c0e..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/api_checker/ut_case/common_ut.py +++ /dev/null @@ -1,39 +0,0 @@ -import mindspore as ms -from mindspore.nn.cell import Cell -from mindspore.ops import operations as P -import torch -import re -import inspect -from ut_base import UTBase -from common.logger import logger - - -class Common(Cell): - def __init__(self): - super().__init__() - - -class CommonUT(UTBase): - def __init__(self, name, args, kwargs, output, real_data=False, stack=None, comparator=None): - super().__init__(name, args, kwargs, output, real_data, stack, comparator) - - pattern = re.compile(r"^(.*?)_(.*?)_(.*)$") - match = pattern.match(self.name) - - if match: - self.name_ms = match.group(1) - logger.info(f"Common UT compare mindspore api: {self.name_ms}") - self.name_py = match.group(2) - self.name = match.group(3) - else: - logger.warning("No match UT found") - - def forward_mindspore_impl(self, *args): - output_ms = getattr(P, self.name_ms)()(*args) - return output_ms - - def forward_pytorch_impl(self, *args): - args_len = len(inspect.getfullargspec(getattr(P, 
self.name_ms)()).args) - 1 - tensor_list = args[:args_len] - output_py = getattr(torch,self.name_py)(*tensor_list) - return output_py \ No newline at end of file diff --git a/debug/accuracy_tools/atat/README.md b/debug/accuracy_tools/atat/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6a1fbb473c21c3a922d781a52a300818d03a063c --- /dev/null +++ b/debug/accuracy_tools/atat/README.md @@ -0,0 +1,124 @@ +# MindStudio精度调试工具 + +MindStudio精度调试工具(ascend_training_accuracy_tools),简称atat,是ATT工具链下精度调试部分的工具包。主要包括精度预检和精度比对等子工具,当前适配场景包括PyTorch和MindSpore。 + +## 工具安装 + +精度工具合一软件包名称:`ascend_training_accuracy_tools-{version}-py3-none-any.whl` + +1. whl包获取。 + + 请通过下表链接下载工具whl包。 + + | 版本 | 发布日期 | 支持PyTorch版本 | 下载链接 | 校验码 | + | ----- | ---------- | ------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | + | 0.0.3 | 2024-06-11 | 1.11.0/2.0/2.1/2.2 | [ascend_training_accuracy_tools-0.0.3-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/att/0.0/ascend_training_accuracy_tools-0.0.3-py3-none-any.whl) | f46d9714704859e2d67861a65bbb3c76b0a250cf6e238b978b5b959ab1fe125a | + | 0.0.2 | 2024-05-23 | 1.11.0/2.0/2.1/2.2 | [ascend_training_accuracy_tools-0.0.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/att/0.0/ascend_training_accuracy_tools-0.0.2-py3-none-any.whl) | 2e35809bde559e9c4d2f16a02ccde779ed9e436bb65fded0b7ebaf6ac2c88d93 | + | 0.0.1 | 2024-03-15 | 1.11.0/2.0/2.1 | [ascend_training_accuracy_tools-0.0.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/att/0.0/ascend_training_accuracy_tools-0.0.1-py3-none-any.whl) | 5801510d4e827e4859bc9a5aca021e4d30c2ea42d60a4c8ad0c2baab1b7782c9 | + +2. whl包校验。 + + 1. 根据以上下载链接下载whl包到Linux安装环境。 + + 2. 进入whl包所在目录,执行如下命令。 + + ```bash + sha256sum {name}.whl + ``` + + {name}为whl包名称。 + + 若回显呈现对应版本whl包一致的**校验码**,则表示下载了正确的ptdbg_ascend精度工具whl安装包。示例如下: + + ```bash + sha256sum ascend_training_accuracy_tools-0.0.1-py3-none-any.whl + 5801510d4e827e4859bc9a5aca021e4d30c2ea42d60a4c8ad0c2baab1b7782c9 *ascend_training_accuracy_tools-0.0.1-py3-none-any.whl + ``` + +3. 执行如下命令进行安装。 + + ```bash + pip3 install ./ascend_training_accuracy_tools-{version}-py3-none-any.whl + ``` + + 若为覆盖安装,请在命令行末尾增加“--force-reinstall”参数强制安装,例如: + + ```bash + pip3 install ./ascend_training_accuracy_tools-{version}-py3-none-any.whl --force-reinstall + ``` + + 提示如下信息则表示安装成功。 + + ```bash + Successfully installed ascend_training_accuracy_tools-{version} + ``` + + +## 工具使用 + +安装atat工具后,可以按照如下思路选择合适的子工具进行精度调试: + +1. 判断框架场景。 + + 当前支持PyTorch和MindSpore场景。 + +2. 执行数据采集。 + + 工具通过在训练脚本中添加PrecisionDebugger接口的方式对API执行精度数据dump操作。 + + PyTorch场景:详见[PyTorch_精度数据采集](./pytorch/doc/dump.md)。 + + MindSpore场景:详见[MindSpore_精度数据采集](./mindspore/doc/dump.md)。 + +3. 执行精度预检。 + + 在昇腾NPU上扫描用户训练模型中所有API,进行API复现,给出精度情况的诊断和分析。 + + PyTorch场景:详见[PyTorch_精度预检工具](./pytorch/doc/api_accuracy_checker.md)。 + + MindSpore场景:暂不支持。 + +4. 执行精度比对。 + + 进行PyTorch整网API粒度的数据dump、精度比对和溢出检测,从而定位训练场景下的精度问题。 + + PyTorch场景:详见[PyTorch_精度比对工具](./pytorch/doc/ptdbg_ascend_overview.md)。 + + MindSpore场景:暂不支持。 + +5. 执行溢出解析。 + + 溢出解析是在执行精度数据dump时,配置了溢出检测dump,那么对于输入正常但输出存在溢出的API,可以判断是否为正常溢出。 + + PyTorch场景:详见[PyTorch_溢出解析工具](./pytorch/doc/run_overflow_check.md)。(暂不支持) + + MindSpore场景:暂不支持。 + +6. 
执行数据解析。 + + 用于比对前后两次NPU ACL层级dump数据的一致性。 + + PyTorch场景:详见[PyTorch_数据解析工具](./pytorch/doc/parse_tool.md)。 + + MindSpore场景:暂不支持。 + +上述流程中的工具均为atat工具的子工具,使用相同的命令行,格式如下: + +```bash +atat [-h] -f parse run_ut multi_run_ut api_precision_compare run_overflow_check +``` + +| 参数 | 说明 | +| ---- | ---------------------------------------- | +| -f | 框架,当前支持配置为pytorch和mindspore。 | +| -h | 帮助信息。 | + +其他参数在上述对应的工具手册中详细介绍。 + +## 贡献 + +push代码前,请务必保证已经完成了基础功能测试和网络测试。 + +## Release Notes + +Release Notes请参见[RELEASE](RELEASE.md)。 \ No newline at end of file diff --git a/debug/accuracy_tools/atat/atat.py b/debug/accuracy_tools/atat/atat.py index 4f69afd2349f211d1c6e17ab9386de3c8fcd6909..799200ae41c76ac41be8e467910c19a772f9db74 100644 --- a/debug/accuracy_tools/atat/atat.py +++ b/debug/accuracy_tools/atat/atat.py @@ -15,11 +15,11 @@ import argparse import sys -from api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command +from atat.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command from ptdbg_ascend.src.python.ptdbg_ascend.parse_tool.cli import parse as cli_parse -from api_accuracy_checker.run_ut.multi_run_ut import prepare_config, run_parallel_ut -from api_accuracy_checker.compare.api_precision_compare import _api_precision_compare_parser, _api_precision_compare_command -from api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, _run_overflow_check_command +from atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut import prepare_config, run_parallel_ut +from atat.pytorch.api_accuracy_checker.compare.api_precision_compare import _api_precision_compare_parser, _api_precision_compare_command +from atat.pytorch.api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, _run_overflow_check_command def main(): @@ -30,6 +30,8 @@ def main(): f"For any issue, refer README.md first", ) parser.set_defaults(print_help=parser.print_help) + parser.add_argument('-f', '--framework', required=True, choices=['pytorch'], + help='Deep learning framework.') subparsers = parser.add_subparsers() subparsers.add_parser('parse') run_ut_cmd_parser = subparsers.add_parser('run_ut') @@ -46,16 +48,16 @@ def main(): parser.print_help() sys.exit(0) args = parser.parse_args(sys.argv[1:]) - if sys.argv[1] == "run_ut": + if sys.argv[3] == "run_ut": run_ut_command(args) - elif sys.argv[1] == "parse": + elif sys.argv[3] == "parse": cli_parse() - elif sys.argv[1] == "multi_run_ut": + elif sys.argv[3] == "multi_run_ut": config = prepare_config(args) run_parallel_ut(config) - elif sys.argv[1] == "api_precision_compare": + elif sys.argv[3] == "api_precision_compare": _api_precision_compare_command(args) - elif sys.argv[1] == "run_overflow_check": + elif sys.argv[3] == "run_overflow_check": _run_overflow_check_command(args) diff --git a/debug/accuracy_tools/atat/config/README.md b/debug/accuracy_tools/atat/config/README.md new file mode 100644 index 0000000000000000000000000000000000000000..66429b54fc5e716bec8c70c932232546bae1b55e --- /dev/null +++ b/debug/accuracy_tools/atat/config/README.md @@ -0,0 +1,293 @@ +# 配置文件说明 + +当前配置文件主要为PrecisionDebugger接口执行dump或无标杆比对操作时调用的配置,当PrecisionDebugger接口未指定该配置文件时,使用该文件的默认配置。配置文件详见[config.json](./config.json)。 + +## 参数说明 + +### **通用配置参数** + +| 参数名 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| task | dump的任务类型,str类型。可取值"free_benchmark"(无标杆比对,仅PyTorch场景支持)、"statistics"(仅dump API统计信息,默认值)、"tensor"(dump 
API统计信息和完全复刻整网的API运行情况的真实数据)、"overflow_check"(溢出检测)。配置示例:"task": "tensor"。根据task参数取值的不同,可以配置不同场景参数,详见:“**task配置为free_benchmark**”,“**task配置为statistics**”,“**task配置为tensor**”,“**task配置为overflow_check**”。 | 否 | +| dump_path | 设置dump数据目录路径,str类型。配置示例:"dump_path": "./dump_path"。MindSpore场景仅支持绝对路径。 | 是 | +| rank | 指定对某张卡上的数据进行dump,list[int]类型,默认未配置(表示dump所有卡的数据),应配置为大于等于0的整数,且须配置实际可用的Rank ID。配置示例:"rank": [1]。
对于PyTorch场景,Rank ID从0开始计数,最大取值为所有节点可用卡总数-1,若所配置的值大于实际训练所运行的卡的Rank ID,则dump数据为空。比如当前环境Rank ID为0到7,但实际训练仅运行0到3卡,此时若配置Rank ID为4或不存在的10等其他值,dump数据即为空。
对于MindSpore场景,所有节点的Rank ID均从0开始计数,最大取值为每个节点可用卡总数-1,config.json配置一次rank参数对所有节点同时生效。 | 否 | +| step | 指定dump某个step的数据,list[int]类型。默认未配置,表示dump所有step数据。dump特定step时,须指定为训练脚本中存在的step。step为list格式,可配置逐个step,例如:"step": [0,1,2]。 | 否 | +| level | dump级别,str类型,根据不同级别dump不同数据。可取值"L0"(dump module模块级精度数据,仅PyTorch场景支持,使用背景详见“**模块级精度数据dump说明**”)、"L1"(dump API级精度数据,默认值)、"L2"(dump kernel级精度数据)、"mix"(dump module模块级和API级精度数据,即"L0"+"L1",仅PyTorch场景支持)。配置示例:"level": "L1"。 | 否 | +| seed | 随机种子数,int类型,默认值为:1234,仅PyTorch场景支持。通过固定随机数保证模型的输入或输出一致,可固定的随机数详见“**固定随机数范围**”。配置示例:"seed": 1234。 | 否 | +| is_deterministic | 确定性计算模式,bool类型,仅PyTorch场景支持。可取值true(开启)或false(关闭),默认关闭。配置示例:"is_deterministic": true。
默认情况下,即使硬件和输入完全相同,API多次执行的结果也可能不同;开启确定性计算就是为了保证在相同的硬件和输入下,API多次执行的结果相同。
确定性计算会导致API执行性能降低,建议在发现模型多次执行结果不同的情况下开启。
rnn类算子、ReduceSum、ReduceMean等算子可能与确定性计算存在冲突,若开启确定性计算后多次执行的结果不相同,则考虑存在这些算子。 | 否 | +| enable_dataloader | 自动控制开关,bool类型,仅PyTorch场景支持。可取值true(开启)或false(关闭),默认为false。配置为True后自动识别step参数指定的迭代,并在该迭代执行完成后退出训练,此时start、stop和step函数可不配置,开启该开关要求训练脚本是通过torch.utils.data.dataloader方式加载数据。仅支持PyTorch单卡训练使用,分布式训练场景下存在数据dump不全问题,**下个版本即将废弃该功能**。 | 否 | + +### task配置为free_benchmark + +仅PyTorch场景支持。 + +task配置为free_benchmark时,开启**无标杆比对**,在NPU环境下通过对当前模型API的输入添加扰动因子,二次执行,将得到的输出与未添加扰动因子前的输出进行比对,从而**得出该模型中可能因迁移等变化导致精度降低的API**。 + +无标杆比对优势在于省去了从GPU环境获取dump数据并执行的步骤,也省去了在NPU环境执行dump的操作,降低了精度比对的操作难度。 + +建议配置白名单(配置scope或list)控制少量API进行无标杆比对,一次对过多API执行无标杆比对可能导致显存溢出或性能膨胀。 + +| 参数名 | 说明 | 是否必选 | +| ------------ | ------------------------------------------------------------ | -------- | +| scope | PyTorch场景dump范围,list[str]类型,默认未配置(list也未配置时表示dump所有API的数据)。需要在[]内配置两个模块名或API名,用于锁定区间,dump该范围内的数据。配置示例:"scope": ["MyModuleOP1", "MyModuleOP2"]。与level参数取值相关,level为L0和mix级别时,可配置模块名;level为L1级别时,可配置API名。与list参数不能同时配置。 | 否 | +| list | 自定义dump范围,list[str]类型,默认未配置(scope也未配置时表示dump所有API的数据)。包含如下配置方法:
PyTorch场景配置具体的API全称,dump该API数据。配置示例:"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]。
PyTorch场景指定某一类API,dump某一类的API级别输入输出数据。配置示例:"list": ["relu"]。
PyTorch场景配置kernel_api,dump前向和反向API的kernel_api级别数据,其中dump反向API时需要配置**backward_input**参数。前向API配置示例:"list": ["Tensor.permute.1.forward"];反向API配置示例:"list": ["Tensor.permute.1.forward"], "backward_input": "./npu_dump/step0/rank0/Functional.conv2d.1.backward.input.0.pt"。
与scope参数不能同时配置。 | 否 | +| fuzz_device | 标杆设备,str类型。可取值:
"npu":无标杆,通过添加扰动因子进行比对,默认值。
"cpu":以CPU为标杆,pert_mode须配置为"to_cpu"。
配置示例:"fuzz_device": "cpu"。 | 否 | +| pert_mode | 无标杆扰动因子,str类型。可取值:
"improve_precision":对输入做升精度,默认值。
"add_noise":对输入增加噪声。
"no_change":不加扰动直接二次执行。
"bit_noise":输入的末位比特翻转。
"change_value":输入的张量首尾值调换。
"to_cpu":在CPU等价执行。
配置示例:"pert_mode": "to_cpu"。 | 否 | +| handler_type | 处理类型,可取值:"check"(进行无标杆比对检查,默认值)、"fix"(将扰动后的API输出结果覆盖原始API输出结果,尝试将Loss曲线恢复正常,该模式下不支持预热if_preheat)。配置示例:"handler_type": "fix"。 | 否 | +| fuzz_level | 无标杆数据dump级别,即选择比对结果文件应输出的表头属性,当前仅支持取值为:"L1"。输出结果详见“**无标杆比对数据存盘格式**”。 | 否 | +| fuzz_stage | 前反向,选择对API前向或反向进行无标杆比对,可取值:"forward"(前向,默认值)、"backward"(反向)。配置示例:"fuzz_stage": "backward"。 | 否 | +| if_preheat | 预热功能,开启功能后工具可以根据每次迭代的输出调整精度算法的阈值,从而更准确找出存在精度问题的API,bool类型。可取值true(开启)或false(关闭),默认关闭。配置示例:"if_preheat": "true"。"handler_type": "fix"不支持预热。 | 否 | +| preheat_step | 开启预热的迭代数量,int类型,默认值为15。须配置"if_preheat": "true"。 | 否 | +| max_sample | 每个算子预热的采样次数的最大阈值,int类型,默认值为20。须配置"if_preheat": "true"。 | 否 | + +#### 无标杆比对数据存盘格式 + +无标杆比对在dump_path目录下输出结果文件free_benchmark.csv,如下示例: + +![free_benchmark](./img/free_benchmark.png) + +| 字段 | 说明 | +| ------------ | ------------------------------------------------------------ | +| rank | Rank ID,int类型。 | +| pert_mode | 扰动因子的类型,string类型。 | +| stage | 前向或反向,string类型。 | +| step | 迭代数,int类型。 | +| api_name | API名称,string类型。 | +| max_rel | 输出对比最大相对误差,float类型。 | +| dtype | 输入的dtype,string类型。 | +| shape | 输入的shape,tuple类型。 | +| Output_index | 如果输出为列表或元组,其中一个元素检测不一致,则会有该元素的index,否则为空,int类型。 | + +### task配置为statistics + +| 参数名 | 说明 | 是否必选 | +| ------------ | ------------------------------------------------------------ | -------- | +| scope | PyTorch场景dump范围,list[str]类型,默认未配置(list也未配置时表示dump所有API的数据)。需要在[]内配置两个模块名或API名,用于锁定区间,dump该范围内的数据。配置示例:"scope": ["MyModuleOP1", "MyModuleOP2"]。与level参数取值相关,level为L0和mix级别时,可配置模块名;level为L1级别时,可配置API名。 | 否 | +| list | 自定义dump范围,list[str]类型,默认未配置(scope也未配置时表示dump所有API的数据)。包含如下配置方法:
PyTorch场景配置具体的API全称,dump该API数据。配置示例:"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]。
PyTorch场景指定某一类API,dump某一类的API级别输入输出数据。配置示例:"list": ["relu"]。
MindSpore场景配置kernel_name,可以是算子的名称列表,也可以指定算子类型("level": "L2"时不支持),还可以配置算子名称的正则表达式(当字符串符合"name-regex(xxx)"格式时,后台则会将其作为正则表达式。例如,"name-regex(Default/.+)"可匹配算子名称以"Default/"开头的所有算子)。 | 否 | +| data_mode | dump数据过滤,str类型。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的dump文件。配置示例:"data_mode": ["backward"]或"data_mode": ["forward", "backward"]。默认为["all"],即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。
MindSpore场景仅支持"all"、"input"和"output"参数,且各参数只能单独配置,不支持自由组合。 | 否 | +| summary_mode | 控制dump文件输出的模式,str类型,仅PyTorch场景支持,可取值md5(dump输出包含md5值以及API统计信息的dump.json文件,用于验证数据的完整性)、statistics(dump仅输出包含API统计信息的dump.json文件,默认值)。配置示例:"summary_mode": "md5"。 | 否 | + +### task配置为tensor + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| scope | PyTorch场景dump范围,list[str]类型,默认未配置(list也未配置时表示dump所有API的数据)。需要在[]内配置两个模块名或API名,用于锁定区间,dump该范围内的数据。配置示例:"scope": ["MyModuleOP1", "MyModuleOP2"]。与level参数取值相关,level为L0和mix级别时,可配置模块名;level为L1级别时,可配置API名。 | 否 | +| list | 自定义dump范围,list[str]类型,默认未配置(scope也未配置时表示dump所有API的数据)。包含如下配置方法:
PyTorch场景配置具体的API全称,dump该API数据。配置示例:"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]。
PyTorch场景指定某一类API,dump某一类的API级别输入输出数据。配置示例:"list": ["relu"]。
PyTorch场景配置kernel_api,dump前向和反向API的kernel_api级别数据,其中dump反向API时需要配置**backward_input**参数。前向API配置示例:"list": ["Tensor.permute.1.forward"];反向API配置示例:"list": ["Tensor.permute.1.forward"], "backward_input": "./npu_dump/step0/rank0/Functional.conv2d.1.backward.input.0.pt"。
MindSpore场景配置kernel_name,可以是算子的名称列表,也可以指定算子类型("level": "L2"时不支持),还可以配置算子名称的正则表达式(当字符串符合"name-regex(xxx)"格式时,后台则会将其作为正则表达式。例如,"name-regex(Default/.+)"可匹配算子名称以"Default/"开头的所有算子)。 | 否 | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的dump文件,str类型,仅PyTorch场景支持,默认未配置。例如若需要dump Functional.conv2d.1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional.conv2d.1、backward和input字段的dump文件。配置示例:"backward_input": "./npu_dump/step0/rank0/Functional.conv2d.1.backward.input.0.pt" | 否 | +| data_mode | dump数据过滤,str类型。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的dump文件。配置示例:"data_mode": ["backward"]或"data_mode": ["forward", "backward"]。默认为["all"],即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。
MindSpore场景仅支持"all"、"input"和"output"参数,且各参数只能单独配置,不支持自由组合。 | 否 | +| file_format | MindSpore场景真实tensor数据的保存格式,str类型,可取值"bin"(dump的tensor文件为二进制格式,"level": "L1"时不支持)、"npy"(dump的tensor文件后缀为.npy,默认值)。 | 否 | + +### task配置为overflow_check + +| 参数名 | 说明 | 是否必选 | +| ------------- | ------------------------------------------------------------ | -------- | +| overflow_nums | 控制溢出次数,int类型,仅PyTorch场景支持,表示第N次溢出时,停止训练,过程中检测到溢出API对应kernel数据均dump。配置示例:"overflow_nums": 3。默认为1,即检测到1次溢出,训练停止,配置为-1时,表示持续检测溢出直到训练结束。 | 否 | +| check_mode | MindSpore场景kernel级别的溢出检测,str类型,可取值"aicore"(开启AI Core的溢出检测)、"atomic"(开启Atomic的溢出检测)、"all"(开启AI Core和Atomic的溢出检测,默认值)。配置示例"check_mode": "aicore"。 | 否 | + +## 配置示例 + +以下示例包含当前支持的所有场景可配置的完整参数。 + +### PyTorch场景task配置为free_benchmark + +```json +{ + "task": "free_benchmark", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + "enable_dataloader": false, + + "free_benchmark": { + "scope": [], + "list": ["conv2d"], + "fuzz_device": "npu", + "pert_mode": "improve_precision", + "handler_type": "check", + "fuzz_level": "L1", + "fuzz_stage": "forward", + "if_preheat": false, + "preheat_step": 15, + "max_sample": 20 + } +} +``` + +### PyTorch场景task配置为statistics + +```json +{ + "task": "statistics", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + "enable_dataloader": false, + + "statistics": { + "scope": [], + "list": [], + "data_mode": ["all"], + "summary_mode": "statistics" + } +} +``` + +### PyTorch场景task配置为tensor + +```json +{ + "task": "tensor", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + "enable_dataloader": false, + + "tensor": { + "scope": [], + "list":[], + "data_mode": ["all"], + "backward_input": "" + } +} +``` + +### PyTorch场景task配置为overflow_check + +```json +{ + "task": "overflow_check", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + "enable_dataloader": false, + + "overflow_check": { + "overflow_nums": 1 + } +} +``` + +### MindSpore场景task配置为statistics + +```json +{ + "task": "statistics", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "statistics": { + "list": [], + "data_mode": ["all"], + "summary_mode": "statistics" + } +} +``` + +### MindSpore场景task配置为tensor + +```json +{ + "task": "tensor", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "tensor": { + "list":[], + "data_mode": ["all"], + "backward_input": "" + } +} +``` + +### MindSpore场景task配置为overflow_check + +```json +{ + "task": "overflow_check", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "overflow_check": { + "overflow_nums": 1, + "check_mode": "all" + } +} +``` + +## 附录 + +### 模块级精度数据dump说明 + +仅PyTorch场景支持。 + +大模型场景下,通常不是简单的利用自动迁移能力实现GPU到NPU的训练脚本迁移,而是会对NPU网络进行一系列针对性的适配,因此,常常会造成迁移后的NPU模型存在部分子结构不能与GPU原始模型完全对应。模型结构不一致导致API调用类型及数量不一致,若直接按照API粒度进行精度数据dump和比对,则无法完全比对所有的API。 + +本节介绍的功能是对模型中的大粒度模块进行数据dump,使其比对时,对于无法以API粒度比对的模块可以直接以模块粒度进行比对。 + +模块指的是继承自nn.Module类模块,通常情况下这类模块就是一个小模型,可以被视为一个整体,dump数据时以模块为粒度进行dump。 + +### 固定随机数范围 + +仅PyTorch场景支持。 + +seed_all函数可固定随机数的范围如下表。 + +| API | 固定随机数 | +| ---------------------------------------- | --------------------------- | +| 
os.environ['PYTHONHASHSEED'] = str(seed) | 禁止Python中的hash随机化 | +| random.seed(seed) | 设置random随机生成器的种子 | +| np.random.seed(seed) | 设置numpy中随机生成器的种子 | +| torch.manual_seed(seed) | 设置当前CPU的随机种子 | +| torch.cuda.manual_seed(seed) | 设置当前GPU的随机种子 | +| torch.cuda.manual_seed_all(seed) | 设置所有GPU的随机种子 | +| torch_npu.npu.manual_seed(seed) | 设置当前NPU的随机种子 | +| torch_npu.npu.manual_seed_all(seed) | 设置所有NPU的随机种子 | +| torch.backends.cudnn.enable=False | 关闭cuDNN | +| torch.backends.cudnn.benchmark=False | cuDNN确定性地选择算法 | +| torch.backends.cudnn.deterministic=True | cuDNN仅使用确定性的卷积算法 | + +需要保证CPU或GPU以及NPU的模型输入完全一致,dump数据的比对才有意义,seed_all并不能保证模型输入完全一致,如下表所示场景需要保证输入的一致性。 + +| 场景 | 固定方法 | +| --------------- | ------------- | +| 数据集的shuffle | 关闭shuffle。 | +| dropout | 关闭dropout。 | + +关闭shuffle示例: + +```Python +train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size = batch_size, + shuffle = False, + num_workers = num_workers +) +``` + +关闭dropout: + +在使用from ptdbg import *后,工具会自动将torch.nn.functional.dropout、torch.nn.functional.dropout2d、torch.nn.functional.dropout3d、torch.nn.Dropout、torch.nn.Dropout2d、torch.nn.Dropout3d的接口参数p置为0。 diff --git a/debug/accuracy_tools/atat/config/config.json b/debug/accuracy_tools/atat/config/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ba13898090c802ed814694da70e5c415222f6c35 --- /dev/null +++ b/debug/accuracy_tools/atat/config/config.json @@ -0,0 +1,28 @@ +{ + "task": "statistics", + "dump_path": "", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + "enable_dataloader": false, + "acl_config": "", + "tensor": { + "scope": [], + "list":[], + "data_mode": ["all"], + "backward_input": [], + "file_format": "npy" + }, + "statistics": { + "scope": [], + "list":[], + "data_mode": ["all"], + "summary_mode": "statistics" + }, + "overflow_check": { + "overflow_nums": 1, + "check_mode":"all" + } +} \ No newline at end of file diff --git a/debug/accuracy_tools/atat/config/img/free_benchmark.png b/debug/accuracy_tools/atat/config/img/free_benchmark.png new file mode 100644 index 0000000000000000000000000000000000000000..83cea25228919de2adec8a9695d89a6f0f99fc8f Binary files /dev/null and b/debug/accuracy_tools/atat/config/img/free_benchmark.png differ diff --git a/debug/accuracy_tools/atat/core/common_config.py b/debug/accuracy_tools/atat/core/common_config.py new file mode 100644 index 0000000000000000000000000000000000000000..ee045d3c520f9418191daaedec2830e8f9248435 --- /dev/null +++ b/debug/accuracy_tools/atat/core/common_config.py @@ -0,0 +1,54 @@ +from .utils import Const + + +# 公共配置类 +class CommonConfig: + def __init__(self, json_config): + self.task = json_config.get('task') + self.dump_path = json_config.get('dump_path') + self.rank = json_config.get('rank') + self.step = json_config.get('step') + self.level = json_config.get('level') + self.seed = json_config.get('seed') + self.acl_config = json_config.get('acl_config') + self.is_deterministic = json_config.get('is_deterministic', False) + self.enable_dataloader = json_config.get('enable_dataloader', False) + self._check_config() + + def _check_config(self): + if self.task and self.task not in Const.TASK_LIST: + raise Exception("task is invalid") + if self.rank is not None and not isinstance(self.rank, list): + raise Exception("rank is invalid") + if self.step is not None and not isinstance(self.step, list): + raise Exception("step is invalid") + if self.level and self.level not in Const.LEVEL_LIST: + raise Exception("level is invalid") + if 
self.seed is not None and not isinstance(self.seed, int): + raise Exception("seed is invalid") + if not isinstance(self.is_deterministic, bool): + raise Exception("is_deterministic is invalid") + if not isinstance(self.enable_dataloader, bool): + raise Exception("enable_dataloader is invalid") + + +# 基础配置类 +class BaseConfig: + def __init__(self, json_config): + self.scope = json_config.get('scope') + self.list = json_config.get('list') + self.data_mode = json_config.get('data_mode') + self.backward_input = json_config.get("backward_input") + self.file_format = json_config.get("file_format") + self.summary_mode = json_config.get("summary_mode") + self.overflow_num = json_config.get("overflow_nums") + self.check_mode = json_config.get("check_mode") + + def check_config(self): + if self.scope is not None and not isinstance(self.scope, list): + raise Exception("scope is invalid") + if self.list is not None and not isinstance(self.list, list): + raise Exception("list is invalid") + if self.data_mode is not None and not isinstance(self.data_mode, list): + raise Exception("data_mode is invalid") + \ No newline at end of file diff --git a/debug/accuracy_tools/atat/core/file_check_util.py b/debug/accuracy_tools/atat/core/file_check_util.py new file mode 100644 index 0000000000000000000000000000000000000000..b10cdd61049ad9a87e91d910e89b121557a58a7f --- /dev/null +++ b/debug/accuracy_tools/atat/core/file_check_util.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
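+ +This module provides the common file-safety utilities of atat: path validation (existence, length, soft links, ownership, special characters, size), permission checks, and the FileChecker/FileOpen helpers for checking and opening files safely.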
+""" +import os +import re + +from .log import print_warn_log, print_error_log + + +class FileCheckConst: + """ + Class for file check const + """ + READ_ABLE = "read" + WRITE_ABLE = "write" + READ_WRITE_ABLE = "read and write" + DIRECTORY_LENGTH = 4096 + FILE_NAME_LENGTH = 255 + FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$" + PKL_SUFFIX = ".pkl" + NUMPY_SUFFIX = ".npy" + JSON_SUFFIX = ".json" + PT_SUFFIX = ".pt" + CSV_SUFFIX = ".csv" + YAML_SUFFIX = ".yaml" + MAX_PKL_SIZE = 1 * 1024 * 1024 * 1024 + MAX_NUMPY_SIZE = 10 * 1024 * 1024 * 1024 + MAX_JSON_SIZE = 1 * 1024 * 1024 * 1024 + MAX_PT_SIZE = 10 * 1024 * 1024 * 1024 + MAX_CSV_SIZE = 1 * 1024 * 1024 * 1024 + MAX_YAML_SIZE = 10 * 1024 * 1024 + DIR = "dir" + FILE = "file" + DATA_DIR_AUTHORITY = 0o750 + DATA_FILE_AUTHORITY = 0o640 + FILE_SIZE_DICT = { + PKL_SUFFIX: MAX_PKL_SIZE, + NUMPY_SUFFIX: MAX_NUMPY_SIZE, + JSON_SUFFIX: MAX_JSON_SIZE, + PT_SUFFIX: MAX_PT_SIZE, + CSV_SUFFIX: MAX_CSV_SIZE, + YAML_SUFFIX: MAX_YAML_SIZE + } + + +class FileCheckException(Exception): + """ + Class for File Check Exception + """ + NONE_ERROR = 0 + INVALID_PATH_ERROR = 1 + INVALID_FILE_TYPE_ERROR = 2 + INVALID_PARAM_ERROR = 3 + INVALID_PERMISSION_ERROR = 3 + + def __init__(self, code, error_info: str = ""): + super(FileCheckException, self).__init__() + self.code = code + self.error_info = error_info + + def __str__(self): + return self.error_info + + +class FileChecker: + """ + The class for check file. + + Attributes: + file_path: The file or dictionary path to be verified. + path_type: file or dictionary + ability(str): FileCheckConst.WRITE_ABLE or FileCheckConst.READ_ABLE to set file has writability or readability + file_type(str): The correct file type for file + """ + def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True): + self.file_path = file_path + self.path_type = self._check_path_type(path_type) + self.ability = ability + self.file_type = file_type + self.is_script = is_script + + @staticmethod + def _check_path_type(path_type): + if path_type not in [FileCheckConst.DIR, FileCheckConst.FILE]: + print_error_log(f'The path_type must be {FileCheckConst.DIR} or {FileCheckConst.FILE}.') + raise FileCheckException(FileCheckException.INVALID_PARAM_ERROR) + return path_type + + def common_check(self): + """ + 功能:用户校验基本文件权限:软连接、文件长度、是否存在、读写权限、文件属组、文件特殊字符 + 注意:文件后缀的合法性,非通用操作,可使用其他独立接口实现 + """ + check_path_exists(self.file_path) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + check_path_type(self.file_path, self.path_type) + self.check_path_ability() + if self.is_script: + check_path_owner_consistent(self.file_path) + check_path_pattern_vaild(self.file_path) + check_common_file_size(self.file_path) + check_file_suffix(self.file_path, self.file_type) + return self.file_path + + def check_path_ability(self): + if self.ability == FileCheckConst.WRITE_ABLE: + check_path_writability(self.file_path) + if self.ability == FileCheckConst.READ_ABLE: + check_path_readability(self.file_path) + if self.ability == FileCheckConst.READ_WRITE_ABLE: + check_path_readability(self.file_path) + check_path_writability(self.file_path) + + +class FileOpen: + """ + The class for open file by a safe way. + + Attributes: + file_path: The file or dictionary path to be opened. 
+ mode(str): The file open mode + """ + SUPPORT_READ_MODE = ["r", "rb"] + SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"] + SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"] + + def __init__(self, file_path, mode, encoding='utf-8'): + self.file_path = file_path + self.mode = mode + self.encoding = encoding + self._handle = None + + def __enter__(self): + self.check_file_path() + binary_mode = "b" + if binary_mode not in self.mode: + self._handle = open(self.file_path, self.mode, encoding=self.encoding) + else: + self._handle = open(self.file_path, self.mode) + return self._handle + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._handle: + self._handle.close() + + def check_file_path(self): + support_mode = self.SUPPORT_READ_MODE + self.SUPPORT_WRITE_MODE + self.SUPPORT_READ_WRITE_MODE + if self.mode not in support_mode: + print_error_log("File open not support %s mode" % self.mode) + raise FileCheckException(FileCheckException.INVALID_PARAM_ERROR) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + self.check_ability_and_owner() + check_path_pattern_vaild(self.file_path) + if os.path.exists(self.file_path): + check_common_file_size(self.file_path) + + def check_ability_and_owner(self): + if self.mode in self.SUPPORT_READ_MODE: + check_path_exists(self.file_path) + check_path_readability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_WRITE_MODE and os.path.exists(self.file_path): + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_READ_WRITE_MODE and os.path.exists(self.file_path): + check_path_readability(self.file_path) + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + + +def check_link(path): + abs_path = os.path.abspath(path) + if os.path.islink(abs_path): + print_error_log('The file path {} is a soft link.'.format(path)) + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) + + +def check_path_length(path, name_length=None): + file_max_name_length = name_length if name_length else FileCheckConst.FILE_NAME_LENGTH + if len(path) > FileCheckConst.DIRECTORY_LENGTH or \ + len(os.path.basename(path)) > file_max_name_length: + print_error_log('The file path length exceeds limit.') + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) + + +def check_path_exists(path): + if not os.path.exists(path): + print_error_log('The file path %s does not exist.' % path) + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) + + +def check_path_readability(path): + if not os.access(path, os.R_OK): + print_error_log('The file path %s is not readable.' % path) + raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) + + +def check_path_writability(path): + if not os.access(path, os.W_OK): + print_error_log('The file path %s is not writable.' % path) + raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) + + +def check_path_executable(path): + if not os.access(path, os.X_OK): + print_error_log('The file path %s is not executable.' % path) + raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) + + +def check_other_user_writable(path): + st = os.stat(path) + if st.st_mode & 0o002: + _user_interactive_confirm( + 'The file path %s may be insecure because other users have write permissions. ' + 'Do you want to continue?' 
% path) + + +def _user_interactive_confirm(message): + while True: + check_message = input(message + " Enter 'c' to continue or enter 'e' to exit: ") + if check_message == "c": + break + elif check_message == "e": + print_warn_log("User canceled.") + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) + else: + print("Input is error, please enter 'c' or 'e'.") + + +def check_path_owner_consistent(path): + file_owner = os.stat(path).st_uid + if file_owner != os.getuid(): + print_error_log('The file path %s may be insecure because is does not belong to you.' % path) + raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) + + +def check_path_pattern_vaild(path): + if not re.match(FileCheckConst.FILE_VALID_PATTERN, path): + print_error_log('The file path {} contains special characters.'.format(path)) + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) + + +def check_file_size(file_path, max_size): + file_size = os.path.getsize(file_path) + if file_size >= max_size: + _user_interactive_confirm(f'The size of file path {file_path} exceeds {max_size} bytes.' + f'Do you want to continue?') + + +def check_common_file_size(file_path): + if os.path.isfile(file_path): + for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items(): + if file_path.endswith(suffix): + check_file_size(file_path, max_size) + break + + +def check_file_suffix(file_path, file_suffix): + if file_suffix: + if not file_path.endswith(file_suffix): + print_error_log(f"The {file_path} should be a {file_suffix} file!") + raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR) + + +def check_path_type(file_path, file_type): + if file_type == FileCheckConst.FILE: + if not os.path.isfile(file_path): + print_error_log(f"The {file_path} should be a file!") + raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR) + if file_type == FileCheckConst.DIR: + if not os.path.isdir(file_path): + print_error_log(f"The {file_path} should be a dictionary!") + raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR) + + +def create_directory(dir_path): + """ + Function Description: + creating a directory with specified permissions + Parameter: + dir_path: directory path + Exception Description: + when invalid data throw exception + """ + dir_path = os.path.realpath(dir_path) + try: + os.makedirs(dir_path, mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + except OSError as ex: + print_error_log( + 'Failed to create {}.Please check the path permission or disk space .{}'.format(dir_path, str(ex))) + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) from ex + + +def change_mode(path, mode): + if not os.path.exists(path) or os.path.islink(path): + return + try: + os.chmod(path, mode) + except PermissionError as ex: + print_error_log('Failed to change {} authority. {}'.format(path, str(ex))) + raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) from ex + diff --git a/debug/accuracy_tools/atat/core/log.py b/debug/accuracy_tools/atat/core/log.py new file mode 100644 index 0000000000000000000000000000000000000000..b9ac8f5edfb18286aff317b5440bb99a92dd2486 --- /dev/null +++ b/debug/accuracy_tools/atat/core/log.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import os +import time +import sys + + +def _print_log(level, msg, end='\n'): + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) + pid = os.getgid() + print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg, end=end) + sys.stdout.flush() + + +def print_info_log(info_msg, end='\n'): + """ + Function Description: + print info log. + Parameter: + info_msg: the info message. + """ + _print_log("INFO", info_msg, end=end) + + +def print_error_log(error_msg): + """ + Function Description: + print error log. + Parameter: + error_msg: the error message. + """ + _print_log("ERROR", error_msg) + + +def print_warn_log(warn_msg): + """ + Function Description: + print warn log. + Parameter: + warn_msg: the warning message. + """ + _print_log("WARNING", warn_msg) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/core/utils.py b/debug/accuracy_tools/atat/core/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fdaa33e3ceda02b4200c0d1b26f756361780987e --- /dev/null +++ b/debug/accuracy_tools/atat/core/utils.py @@ -0,0 +1,723 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import collections +import os +import re +import shutil +import stat +import subprocess +import sys +import time +import json +from json.decoder import JSONDecodeError +from datetime import datetime, timezone +from pathlib import Path +import numpy as np + +from .file_check_util import FileOpen, FileChecker, FileCheckConst + + +device = collections.namedtuple('device', ['type', 'index']) +prefixes = ['api_stack', 'list', 'range', 'acl'] + + +class Const: + """ + Class for const + """ + MODEL_TYPE = ['.onnx', '.pb', '.om'] + DIM_PATTERN = r"^(-?[0-9]+)(,-?[0-9]+)*" + REGEX_PREFIX_MAX_LENGTH = 20 + REGEX_PREFIX_PATTERN = r"^[a-zA-Z0-9_-]+$" + SEMICOLON = ";" + COLON = ":" + EQUAL = "=" + COMMA = "," + DOT = "." 
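+ # dump ratio is expressed as a percentage, so its upper bound is 100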
+ DUMP_RATIO_MAX = 100 + SUMMERY_DATA_NUMS = 256 + FLOAT_EPSILON = np.finfo(float).eps + SUPPORT_DUMP_MODE = ['api', 'acl'] + ON = 'ON' + OFF = 'OFF' + BACKWARD = 'backward' + FORWARD = 'forward' + PRE_FORWARD = "pre_forward" + + # dump mode + ALL = "all" + LIST = "list" + RANGE = "range" + STACK = "stack" + ACL = "acl" + API_LIST = "api_list" + API_STACK = "api_stack" + DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] + AUTO = "auto" + ONLINE_DUMP_MODE = [ALL, LIST, AUTO, OFF] + SUMMARY = "summary" + MD5 = "md5" + SUMMARY_MODE = [ALL, SUMMARY, MD5] + + WRITE_FLAGS = os.O_WRONLY | os.O_CREAT + WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR + + PKL_SUFFIX = ".pkl" + NUMPY_SUFFIX = ".npy" + ONE_GB = 1 * 1024 * 1024 * 1024 + TEN_GB = 10 * 1024 * 1024 * 1024 + FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$' + FILE_NAME_LENGTH = 255 + DIRECTORY_LENGTH = 4096 + DISTRIBUTED_PREFIX_LENGTH = 60 + SUMMARY_COLUMN_NUM = 6 + STACK_COLUMN_NUM = 2 + # env dump path + ASCEND_WORK_PATH = "ASCEND_WORK_PATH" + DUMP_DIR = "dump_data" + + ENV_ENABLE = "1" + ENV_DISABLE = "0" + + MAX_SEED_VALUE = 2**32 - 1 + + INPLACE_LIST = ["broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter", + "_reduce_scatter_base", "_all_gather_base"] + + TASK_LIST = ["tensor", "statistics", "overflow_check", "free_benchmark"] + LEVEL_LIST = ["L0", "L1", "L2", "mix"] + STATISTICS = "statistics" + TENSOR = "tensor" + OVERFLOW_CHECK = "overflow_check" + FREE_BENCHMARK = "free_benchmark" + +class CompareConst: + """ + Class for compare module const + """ + # compare result column name + NPU_NAME = "NPU Name" + BENCH_NAME = "Bench Name" + NPU_DTYPE = "NPU Dtype" + BENCH_DTYPE = "Bench Dtype" + NPU_SHAPE = "NPU Tensor Shape" + BENCH_SHAPE = "Bench Tensor Shape" + NPU_MAX = "NPU max" + NPU_MIN = "NPU min" + NPU_MEAN = "NPU mean" + NPU_NORM = "NPU l2norm" + BENCH_MAX = "Bench max" + BENCH_MIN = "Bench min" + BENCH_MEAN = "Bench mean" + BENCH_NORM = "Bench l2norm" + MAX_DIFF = "Max diff" + MIN_DIFF = "Min diff" + MEAN_DIFF = "Mean diff" + NORM_DIFF = "L2norm diff" + COSINE = "Cosine" + MAX_ABS_ERR = "MaxAbsErr" + MAX_RELATIVE_ERR = "MaxRelativeErr" + MIN_RELATIVE_ERR = "MinRelativeErr" + MEAN_RELATIVE_ERR = "MeanRelativeErr" + NORM_RELATIVE_ERR = "NormRelativeErr" + ACCURACY = "Accuracy Reached or Not" + STACK = "NPU_Stack_Info" + DATA_NAME = "Data_name" + ERROR_MESSAGE = "Err_message" + ONE_THOUSANDTH_ERR_RATIO = "One Thousandth Err Ratio" + FIVE_THOUSANDTHS_ERR_RATIO = "Five Thousandths Err Ratio" + NPU_MD5 = "NPU MD5" + BENCH_MD5 = "BENCH MD5" + RESULT = "Result" + + COMPARE_RESULT_HEADER = [ + NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, COSINE, MAX_ABS_ERR, MAX_RELATIVE_ERR, + ONE_THOUSANDTH_ERR_RATIO, FIVE_THOUSANDTHS_ERR_RATIO, + NPU_MAX, NPU_MIN, NPU_MEAN, NPU_NORM, BENCH_MAX, BENCH_MIN, BENCH_MEAN, BENCH_NORM, ACCURACY, ERROR_MESSAGE + ] + + SUMMARY_COMPARE_RESULT_HEADER = [ + NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, MAX_DIFF, MIN_DIFF, MEAN_DIFF, NORM_DIFF, + MAX_RELATIVE_ERR, MIN_RELATIVE_ERR, MEAN_RELATIVE_ERR, NORM_RELATIVE_ERR, + NPU_MAX, NPU_MIN, NPU_MEAN, NPU_NORM, BENCH_MAX, BENCH_MIN, BENCH_MEAN, BENCH_NORM, RESULT, ERROR_MESSAGE + ] + + MD5_COMPARE_RESULT_HEADER = [ + NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, NPU_MD5, BENCH_MD5, RESULT + ] + + # compare result data + NAN = 'Nan' + NONE = 'None' + SHAPE_UNMATCH = 'shape unmatched' + DTYPE_UNMATCH = 'dtype unmatched' + PASS = 'Pass' + WARNING = 'Warning' + DIFF = 
'Different' + + # accuracy standards + COS_THRESHOLD = 0.99 + MAX_ABS_ERR_THRESHOLD = 0.001 + COS_MAX_THRESHOLD = 0.9 + MAX_ABS_ERR_MAX_THRESHOLD = 1 + ACCURACY_CHECK_YES = "Yes" + ACCURACY_CHECK_NO = "No" + ACCURACY_CHECK_UNMATCH = "Unmatched" + + # error message + NO_BENCH = "No bench data matched." + + # compare const + FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble] + + # highlight xlsx color const + RED = "FFFF0000" + YELLOW = "FFFF00" + BLUE = "0000FF" + + # highlight rules const + OVERFLOW_LIST = ['nan\t', 'inf\t', '-inf\t', 'nan', 'inf', '-inf'] + MAX_DIFF_RED = 1e+10 + ORDER_MAGNITUDE_DIFF_YELLOW = 1 + ONE_THOUSAND_ERROR_IN_RED = 0.9 + ONE_THOUSAND_ERROR_OUT_RED = 0.6 + ONE_THOUSAND_ERROR_DIFF_YELLOW = 0.1 + COSINE_DIFF_YELLOW = 0.1 + MAX_RELATIVE_OUT_RED = 0.5 + MAX_RELATIVE_OUT_YELLOW = 0.1 + MAX_RELATIVE_IN_YELLOW = 0.01 + + +class CompareException(Exception): + """ + Class for Accuracy Compare Exception + """ + NONE_ERROR = 0 + INVALID_PATH_ERROR = 1 + OPEN_FILE_ERROR = 2 + CLOSE_FILE_ERROR = 3 + READ_FILE_ERROR = 4 + WRITE_FILE_ERROR = 5 + INVALID_FILE_ERROR = 6 + PERMISSION_ERROR = 7 + INDEX_OUT_OF_BOUNDS_ERROR = 8 + NO_DUMP_FILE_ERROR = 9 + INVALID_DATA_ERROR = 10 + INVALID_PARAM_ERROR = 11 + INVALID_DUMP_RATIO = 12 + INVALID_DUMP_FILE = 13 + UNKNOWN_ERROR = 14 + INVALID_DUMP_MODE = 15 + PARSE_FILE_ERROR = 16 + INVALID_COMPARE_MODE = 17 + OVER_SIZE_FILE_ERROR = 18 + INVALID_SUMMARY_MODE = 19 + INVALID_TASK_ERROR = 20 + + + def __init__(self, code, error_info: str = ""): + super(CompareException, self).__init__() + self.code = code + self.error_info = error_info + + def __str__(self): + return self.error_info + + +class DumpException(CompareException): + pass + + +class OverflowConst: + """ + Class for Overflow + """ + OVERFLOW_DEBUG_MODE_ENABLE = "OVERFLOW_DEBUG_MODE_ENABLE" + OVERFLOW_ORIGINAL_MODE = 0 + OVERFLOW_DEBUG_MODE = 1 + + +def make_dump_path_if_not_exists(dump_path): + if not os.path.exists(dump_path): + try: + Path(dump_path).mkdir(mode=0o750, exist_ok=True, parents=True) + except OSError as ex: + print_error_log( + 'Failed to create {}.Please check the path permission or disk space .{}'.format(dump_path, str(ex))) + raise CompareException(CompareException.INVALID_PATH_ERROR) from ex + else: + if not os.path.isdir(dump_path): + print_error_log('{} already exists and is not a directory.'.format(dump_path)) + + +def _print_log(level, msg, end='\n'): + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) + pid = os.getgid() + print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg, end=end) + sys.stdout.flush() + + +def print_info_log(info_msg, end='\n'): + """ + Function Description: + print info log. + Parameter: + info_msg: the info message. + """ + _print_log("INFO", info_msg, end=end) + + +def print_error_log(error_msg): + """ + Function Description: + print error log. + Parameter: + error_msg: the error message. + """ + _print_log("ERROR", error_msg) + + +def print_warn_log(warn_msg): + """ + Function Description: + print warn log. + Parameter: + warn_msg: the warning message. 
+ """ + _print_log("WARNING", warn_msg) + + +def check_mode_valid(mode, scope=None, api_list=None): + if scope is None: + scope = [] + if api_list is None: + api_list = [] + if not isinstance(scope, list): + raise ValueError("scope param set invalid, it's must be a list.") + if not isinstance(api_list, list): + raise ValueError("api_list param set invalid, it's must be a list.") + mode_check = { + Const.ALL: lambda: None, + Const.RANGE: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end].") if len(scope) != 2 else None, + Const.LIST: lambda: ValueError("set_dump_switch, scope param set invalid, it's should not be an empty list.") if len(scope) == 0 else None, + Const.STACK: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end] or [].") if len(scope) > 2 else None, + Const.ACL: lambda: ValueError("set_dump_switch, scope param set invalid, only one api name is supported in acl mode.") if len(scope) != 1 else None, + Const.API_LIST: lambda: ValueError("Current dump mode is 'api_list', but the content of api_list parameter is empty or valid.") if len(api_list) < 1 else None, + Const.API_STACK: lambda: None, + } + if mode not in Const.DUMP_MODE: + msg = "Current mode '%s' is not supported. Please use the field in %s" % \ + (mode, Const.DUMP_MODE) + raise CompareException(CompareException.INVALID_DUMP_MODE, msg) + + if mode_check.get(mode)() is not None: + raise mode_check.get(mode)() + + +def check_switch_valid(switch): + if switch not in ["ON", "OFF"]: + print_error_log("Please set switch with 'ON' or 'OFF'.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + + +def check_dump_mode_valid(dump_mode): + if not isinstance(dump_mode, list): + print_warn_log("Please set dump_mode as a list.") + dump_mode = [dump_mode] + if not all(mode in ["all", "forward", "backward", "input", "output"] for mode in dump_mode): + raise ValueError("Please set dump_mode as a list containing one or more of the following: 'all', 'forward', 'backward', 'input', 'output'.") + if 'input' not in dump_mode and 'output' not in dump_mode: + dump_mode.extend(['input', 'output']) + if 'forward' not in dump_mode and 'backward' not in dump_mode: + dump_mode.extend(['forward', 'backward']) + if 'all' in dump_mode or set(["forward", "backward", "input", "output"]).issubset(set(dump_mode)): + return ["forward", "backward", "input", "output"] + return dump_mode + + +def check_summary_mode_valid(summary_mode): + if summary_mode not in Const.SUMMARY_MODE: + msg = "The summary_mode is not valid" + raise CompareException(CompareException.INVALID_SUMMARY_MODE, msg) + + +def check_summary_only_valid(summary_only): + if not isinstance(summary_only, bool): + print_error_log("Params summary_only only support True or False.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return summary_only + + +def check_compare_param(input_parma, output_path, stack_mode=False, summary_compare=False, md5_compare=False): + if not (isinstance(input_parma, dict) and isinstance(output_path, str)): + print_error_log("Invalid input parameters") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + check_file_or_directory_path(input_parma.get("npu_json_path"), False) + check_file_or_directory_path(input_parma.get("bench_json_path"), False) + check_file_or_directory_path(input_parma.get("stack_json_path"), False) + if not summary_compare and not md5_compare: + check_file_or_directory_path(input_parma.get("npu_dump_data_dir"), True) + 
check_file_or_directory_path(input_parma.get("bench_dump_data_dir"), True) + check_file_or_directory_path(output_path, True) + with FileOpen(input_parma.get("npu_json_path"), "r") as npu_json, \ + FileOpen(input_parma.get("bench_json_path"), "r") as bench_json, \ + FileOpen(input_parma.get("stack_json_path"), "r") as stack_json: + check_json_file(input_parma, npu_json, bench_json, stack_json) + + +def check_configuration_param(stack_mode=False, auto_analyze=True, fuzzy_match=False): + if not (isinstance(stack_mode, bool) and isinstance(auto_analyze, bool) and isinstance(fuzzy_match, bool)): + print_error_log("Invalid input parameters which should be only bool type.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + + +def check_file_or_directory_path(path, isdir=False): + """ + Function Description: + check whether the path is valid + Parameter: + path: the path to check + isdir: the path is dir or file + Exception Description: + when invalid data throw exception + """ + if isdir: + path_checker = FileChecker(path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + else: + path_checker = FileChecker(path, FileCheckConst.FILE, FileCheckConst.READ_ABLE) + path_checker.common_check() + + +def is_starts_with(string, prefix_list): + return any(string.startswith(prefix) for prefix in prefix_list) + + +def _check_json(json_file_handle, file_name): + tensor_line = json_file_handle.readline() + if not tensor_line: + print_error_log("dump file {} have empty line!".format(file_name)) + raise CompareException(CompareException.INVALID_DUMP_FILE) + json_file_handle.seek(0, 0) + + +def check_json_file(input_param, npu_json, bench_json, stack_json): + _check_json(npu_json, input_param.get("npu_json_path")) + _check_json(bench_json, input_param.get("bench_json_path")) + _check_json(stack_json, input_param.get("stack_json_path")) + + +def check_file_size(input_file, max_size): + try: + file_size = os.path.getsize(input_file) + except OSError as os_error: + print_error_log('Failed to open "%s". %s' % (input_file, str(os_error))) + raise CompareException(CompareException.INVALID_FILE_ERROR) from os_error + if file_size > max_size: + print_error_log('The size (%d) of %s exceeds (%d) bytes, tools not support.' + % (file_size, input_file, max_size)) + raise CompareException(CompareException.INVALID_FILE_ERROR) + + +def check_file_not_exists(file_path): + if os.path.exists(file_path) or os.path.islink(file_path): + remove_path(file_path) + + +def check_regex_prefix_format_valid(prefix): + """ + validate the format of the regex prefix + + Args: + prefix (str): The prefix string to validate. + + Returns: + no returns + + Raises: + ValueError: if the prefix length exceeds Const.REGEX_PREFIX_MAX_LENGTH characters or the prefix do not match + the given pattern Const.REGEX_PREFIX_PATTERN + """ + if len(prefix) > Const.REGEX_PREFIX_MAX_LENGTH: + raise ValueError(f"Maximum length of prefix is {Const.REGEX_PREFIX_MAX_LENGTH}, while current length " + f"is {len(prefix)}") + if not re.match(Const.REGEX_PREFIX_PATTERN, prefix): + raise ValueError(f"prefix contains invalid characters, prefix pattern {Const.REGEX_PREFIX_PATTERN}") + + +def remove_path(path): + if not os.path.exists(path): + return + try: + if os.path.islink(path) or os.path.isfile(path): + os.remove(path) + else: + shutil.rmtree(path) + except PermissionError as err: + print_error_log("Failed to delete {}. 
Please check the permission.".format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) from err + + +def get_dump_data_path(dump_dir): + """ + Function Description: + traverse directories and obtain the absolute path of dump data + Parameter: + dump_dir: dump data directory + Return Value: + dump data path,file is exist or file is not exist + """ + dump_data_path = None + file_is_exist = False + + check_file_or_directory_path(dump_dir, True) + for dir_path, sub_paths, files in os.walk(dump_dir): + if len(files) != 0: + dump_data_path = dir_path + file_is_exist = True + break + dump_data_path = dir_path + return dump_data_path, file_is_exist + + +def modify_dump_path(dump_path, mode): + if mode == Const.ALL: + return dump_path + file_name = os.path.split(dump_path) + mode_file_name = mode + "_" + file_name[-1] + return os.path.join(file_name[0], mode_file_name) + + +def create_directory(dir_path): + """ + Function Description: + creating a directory with specified permissions + Parameter: + dir_path: directory path + Exception Description: + when invalid data throw exception + """ + if not os.path.exists(dir_path): + try: + os.makedirs(dir_path, mode=0o700) + except OSError as ex: + print_error_log( + 'Failed to create {}.Please check the path permission or disk space .{}'.format(dir_path, str(ex))) + raise CompareException(CompareException.INVALID_PATH_ERROR) from ex + + +def execute_command(cmd): + """ + Function Description: + run the following command + Parameter: + cmd: command + Exception Description: + when invalid command throw exception + """ + print_info_log('Execute command:%s' % cmd) + process = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + while process.poll() is None: + line = process.stdout.readline() + line = line.strip() + if line: + print(line) + if process.returncode != 0: + print_error_log('Failed to execute command:%s' % " ".join(cmd)) + raise CompareException(CompareException.INVALID_DATA_ERROR) + + +def save_numpy_data(file_path, data): + """ + save_numpy_data + """ + if not os.path.exists(os.path.dirname(file_path)): + os.makedirs(os.path.dirname(file_path)) + np.save(file_path, data) + + +def parse_value_by_comma(value): + """ + parse value by comma, like '1,2,4,8' + """ + value_list = [] + value_str_list = value.split(Const.COMMA) + for value_str in value_str_list: + value_str = value_str.strip() + if value_str.isdigit() or value_str == '-1': + value_list.append(int(value_str)) + else: + print_error_log("please check your input shape.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return value_list + + +def get_data_len_by_shape(shape): + data_len = 1 + for item in shape: + if item == -1: + print_error_log("please check your input shape, one dim in shape is -1.") + return -1 + data_len = data_len * item + return data_len + + +def add_time_as_suffix(name): + return '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + + +def add_time_with_xlsx(name): + return '{}_{}.xlsx'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + + +def get_time(): + return datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + + +def format_value(value): + return float('{:.12f}'.format(value)) + + +def check_seed_all(seed, mode): + if isinstance(seed, int): + if seed < 0 or seed > Const.MAX_SEED_VALUE: + print_error_log(f"Seed must be between 0 and {Const.MAX_SEED_VALUE}.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + else: + 
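# non-integer seeds (e.g. a float or str) are rejected outright:
+        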
print_error_log(f"Seed must be integer.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + if not isinstance(mode, bool): + print_error_log(f"seed_all mode must be bool.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + + +def get_process_rank(model): + print_info_log("Rank id is not provided. Trying to get the rank id of the model.") + try: + local_device = next(model.parameters()).device + except StopIteration: + print_warn_log('There is no parameter in the model. Fail to get rank id.') + return 0, False + if local_device.type == 'cpu': + print_warn_log("Warning: the debugger is unable to get the rank id. " + "This may cause the dumpped data to be corrupted in the " + "case of distributed training. (You may ignore this if you are using only one card.) " + "Transfer the model to npu or gpu before register_hook() to avoid this warning.") + return 0, False + else: + return local_device.index, True + + +def generate_compare_script(dump_path, pkl_file_path, dump_switch_mode): + template_path = os.path.join(os.path.dirname(__file__), "compare_script.template") + pkl_dir = os.path.dirname(pkl_file_path) + compare_script_path = os.path.join(pkl_dir, "compare_data.py") + is_api_stack = "True" if dump_switch_mode == Const.API_STACK else "False" + + try: + with FileOpen(template_path, 'r') as ftemp, \ + os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout: + code_temp = ftemp.read() + fout.write(code_temp % (pkl_file_path, dump_path, is_api_stack)) + except OSError: + print_error_log(f"Failed to open file. Please check file {template_path} or path {pkl_dir}.") + + print_info_log(f"Generate compare script successfully which is {compare_script_path}.") + + +def check_file_valid(file_path): + if os.path.islink(file_path): + print_error_log('The file path {} is a soft link.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if len(os.path.realpath(file_path)) > Const.DIRECTORY_LENGTH or len(os.path.basename(file_path)) > \ + Const.FILE_NAME_LENGTH: + print_error_log('The file path length exceeds limit.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not re.match(Const.FILE_PATTERN, os.path.realpath(file_path)): + print_error_log('The file path {} contains special characters.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if os.path.isfile(file_path): + file_size = os.path.getsize(file_path) + if file_path.endswith(Const.PKL_SUFFIX) and file_size > Const.ONE_GB: + print_error_log('The file {} size is greater than 1GB.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + if file_path.endswith(Const.NUMPY_SUFFIX) and file_size > Const.TEN_GB: + print_error_log('The file {} size is greater than 10GB.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def check_path_before_create(path): + if len(os.path.realpath(path)) > Const.DIRECTORY_LENGTH or len(os.path.basename(path)) > \ + Const.FILE_NAME_LENGTH: + print_error_log('The file path length exceeds limit.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not re.match(Const.FILE_PATTERN, os.path.realpath(path)): + print_error_log('The file path {} contains special characters.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def check_inplace_op(prefix): + if len(prefix) > Const.DISTRIBUTED_PREFIX_LENGTH: + return False + match_op = re.findall(r"Distributed_(.+?)_\d", 
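# e.g. "Distributed_all_reduce_0" -> "all_reduce"
+                             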
prefix) + op_name = match_op[0] if match_op else None + return op_name in Const.INPLACE_LIST + + +def md5_find(data): + for key_op in data: + for api_info in data[key_op]: + if isinstance(data[key_op][api_info], list): + for data_detail in data[key_op][api_info]: + if data_detail and 'md5' in data_detail: + return True + elif 'md5' in data[key_op][api_info]: + return True + return False + + +def task_dumppath_get(input_param): + npu_json_path = input_param.get("npu_json_path", None) + bench_json_path = input_param.get("bench_json_path", None) + if not npu_json_path or not bench_json_path: + print_error_log(f"Please check the json path is valid.") + raise CompareException(CompareException.INVALID_PATH_ERROR) + with FileOpen(npu_json_path, 'r') as npu_f: + npu_json_data = json.load(npu_f) + with FileOpen(bench_json_path, 'r') as bench_f: + bench_json_data = json.load(bench_f) + if npu_json_data['task'] != bench_json_data['task']: + print_error_log(f"Please check the dump task is consistent.") + raise CompareException(CompareException.INVALID_TASK_ERROR) + if npu_json_data['task'] == Const.TENSOR: + summary_compare = False + md5_compare = False + elif npu_json_data['task'] == Const.STATISTICS: + md5_compare = md5_find(npu_json_data['data']) + if md5_compare: + summary_compare = False + else: + summary_compare = True + else: + print_error_log(f"Compare is not required for overflow_check or free_benchmark.") + raise CompareException(CompareException.INVALID_TASK_ERROR) + input_param['npu_dump_data_dir'] = npu_json_data['dump_data_dir'] + input_param['bench_dump_data_dir'] = bench_json_data['dump_data_dir'] + return summary_compare, md5_compare diff --git a/debug/accuracy_tools/atat/mindspore/__init__.py b/debug/accuracy_tools/atat/mindspore/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bb3f93567542e93ff913edf3daabcd3aedb91ee3 --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/__init__.py @@ -0,0 +1 @@ +from atat.mindspore.debugger.precision_debugger import PrecisionDebugger diff --git a/debug/accuracy_tools/atat/mindspore/debugger/__init__.py b/debug/accuracy_tools/atat/mindspore/debugger/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py b/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py new file mode 100644 index 0000000000000000000000000000000000000000..56a4b9bf758197d77ef04874f2865e2136d6f67c --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py @@ -0,0 +1,51 @@ +import os + + +class DebuggerConfig: + convert_map = { + "L0": "cell", + "L1": "api", + "L2": 'kernel' + } + + def __init__(self, common_config, task_config): + self.dump_path = common_config.dump_path + self.task = common_config.task + self.rank = [] if not common_config.rank else common_config.rank + self.step = [] if not common_config.step else common_config.step + if not common_config.level: + common_config.level = "L1" + self.level = DebuggerConfig.convert_map[common_config.level] + self.list = [] if not task_config.list else task_config.list + self.data_mode = [] if not task_config.data_mode else task_config.data_mode + self.file_format = task_config.file_format + self.check_mode = task_config.check_mode + + self.check() + + def check(self): + if not self.dump_path: + raise Exception("Dump path is empty.") + if not os.path.isabs(self.dump_path): + raise Exception("Dump path must be absolute 
path.") + if not self.task: + self.task = "statistics" + if not self.level: + raise Exception("level must be L0, L1 or L2") + if not self.file_format: + self.file_format = "npy" + if not self.check_mode: + self.check_mode = "all" + self._check_rank() + self._check_step() + return True + + def _check_rank(self): + for rank_id in self.rank: + if not isinstance(rank_id, int) or rank_id < 0: + raise ValueError(f"rank {self.rank} must be a positive integer.") + + def _check_step(self): + for s in self.step: + if not isinstance(s, int): + raise ValueError(f"step element {s} should be int") diff --git a/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py b/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py new file mode 100644 index 0000000000000000000000000000000000000000..0099074762f0746c1bd8341047f37b3e5fe08855 --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py @@ -0,0 +1,32 @@ +import os +from atat.mindspore.ms_config import parse_json_config +from atat.mindspore.debugger.debugger_config import DebuggerConfig +from atat.mindspore.task_handler_factory import TaskHandlerFactory + + +class PrecisionDebugger: + _instance = None + + def __new__(cls, config_path=None): + if not cls._instance: + cls._instance = super().__new__(cls) + cls._instance.initialized = False + cls._instance.config = None + return cls._instance + + def __init__(self, config_path=None): + if self.initialized: + return + if not config_path: + config_path = os.path.join(os.path.dirname(__file__), "../../config/config.json") + common_config, task_config = parse_json_config(config_path) + self.config = DebuggerConfig(common_config, task_config) + self.initialized = True + + @classmethod + def start(cls, target=None): + instance = cls._instance + if not instance: + raise Exception("No instance of PrecisionDebugger found.") + handler = TaskHandlerFactory.create(instance.config) + handler.handle() diff --git a/debug/accuracy_tools/atat/mindspore/doc/dump.md b/debug/accuracy_tools/atat/mindspore/doc/dump.md new file mode 100644 index 0000000000000000000000000000000000000000..34529f580a7b2cb4961a2c992949cab89c15115e --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/doc/dump.md @@ -0,0 +1,65 @@ +# **精度数据采集** + +atat工具主要通过在训练脚本内添加dump接口并启动训练的方式来采集精度数据。 + +执行dump操作需要安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +## dump接口介绍 + +### PrecisionDebugger + +**功能说明** + +通过加载dump配置文件的方式来确定dump操作的详细配置。 + +可以在from atat.mindspore import PrecisionDebugger和模型初始化之间的任意位置添加该接口。 + +**原型** + +```Python +PrecisionDebugger(config_path=None) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ----------- | ------------------------------------------------------------ | -------- | +| config_path | 指定dump配置文件路径,String类型。参数示例:"./config.json"。未配置该路径时,默认使用../../config目录下的config.json文件的默认配置。config.json文件可以配置更多参数,若需要进行更多场景的精度数据dump,建议配置[config.json](../../config/config.json)文件。 | 否 | + +### start函数 + +**功能说明** + +启动函数。 + +**原型** + +```Python +debugger.start() +``` + +该函数为类函数,可以使用debugger.start()也可以使用PrecisionDebugger.start()。 + +## 示例代码 + +```Python +from atat.mindspore import PrecisionDebugger +debugger = PrecisionDebugger(config_path="./config.json") +# 请勿将以上初始化流程插入到循环代码中 +# 下面代码也可以用PrecisionDebugger.start() +debugger.start() +... 
+``` + +## dump结果文件介绍 + +训练结束后,工具将dump的数据保存在dump_path参数指定的目录下。 + +- level为L1时 + + dump结果目录请参见MindSpore官网中的《[同步Dump数据对象目录](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.3.0rc2/debug/dump.html#%E5%90%8C%E6%AD%A5dump%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95)》。 + +- level为L2时 + + dump结果目录请参见MindSpore官网中的《[异步Dump数据对象目录](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.3.0rc2/debug/dump.html#%E5%BC%82%E6%AD%A5dump%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95)》。 + diff --git a/debug/accuracy_tools/atat/mindspore/dump/__init__.py b/debug/accuracy_tools/atat/mindspore/dump/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py b/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..b0f80f40e553a8b136144f515015d0f94c635f5d --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py @@ -0,0 +1,55 @@ +import os +import json +from atat.core.utils import make_dump_path_if_not_exists +from atat.mindspore.debugger.debugger_config import DebuggerConfig +from atat.core.log import print_info_log +from atat.core.file_check_util import FileOpen + + +class ApiKbkDump: + def __init__(self, config: DebuggerConfig): + self.dump_json = dict() + self.dump_json["common_dump_settings"] = dict() + self.dump_json["common_dump_settings"]["dump_mode"] = 0 + self.dump_json["common_dump_settings"]["path"] = "" + self.dump_json["common_dump_settings"]["net_name"] = "Net" + self.dump_json["common_dump_settings"]["iteration"] = "all" + self.dump_json["common_dump_settings"]["saved_data"] = "statistic" + self.dump_json["common_dump_settings"]["input_output"] = 0 + self.dump_json["common_dump_settings"]["kernels"] = [] + self.dump_json["common_dump_settings"]["support_device"] = [0,1,2,3,4,5,6,7] + self.dump_json["e2e_dump_settings"] = dict() + self.dump_json["e2e_dump_settings"]["enable"] = True + self.dump_json["e2e_dump_settings"]["trans_flag"] = True + + + if len(config.list) > 0: + self.dump_json["common_dump_settings"]["dump_mode"] = 1 + self.dump_json["common_dump_settings"]["kernels"] = config.list + self.dump_json["common_dump_settings"]["path"] = config.dump_path + if len(config.step) > 0: + step_str = "" + for s in config.step: + step_str += (str(s) + '|') + self.dump_json["common_dump_settings"]["iteration"] = step_str[:-1] + if len(config.rank) > 0: + self.dump_json["common_dump_settings"]["support_device"] = config.rank + if config.task == "tensor": + self.dump_json["common_dump_settings"]["saved_data"] = "tensor" + if len(config.data_mode) == 1: + if config.data_mode[0] == "input": + self.dump_json["common_dump_settings"]["input_output"] = 1 + if config.data_mode[0] == "output": + self.dump_json["common_dump_settings"]["input_output"] = 2 + + def handle(self): + json_path = self.dump_json["common_dump_settings"]["path"] + make_dump_path_if_not_exists(json_path) + json_path = os.path.join(json_path, "api_kbk_dump.json") + with FileOpen(json_path, 'w') as f: + json.dump(self.dump_json, f) + print_info_log(json_path + " has been created.") + os.environ["GRAPH_OP_RUN"] = "1" + os.environ["MINDSPORE_DUMP_CONFIG"] = json_path + if "MS_ACL_DUMP_CFG_PATH" in os.environ: + del os.environ["MS_ACL_DUMP_CFG_PATH"] diff --git a/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py b/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py new file mode 100644 
index 0000000000000000000000000000000000000000..ab534edc243dfd5f44688358fe4ca8edb6a8a12d
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py
@@ -0,0 +1,38 @@
+from atat.mindspore.debugger.debugger_config import DebuggerConfig
+from atat.mindspore.dump.api_kbk_dump import ApiKbkDump
+from atat.mindspore.dump.kernel_graph_dump import KernelGraphDump
+
+
+class DumpToolFactory:
+    tools = {
+        "cell": {
+            "kbk": None,
+            "graph": None,
+            "pynative": None
+        },
+        "api": {
+            "kbk": ApiKbkDump,
+            "graph": None,
+            "pynative": None
+        },
+        "kernel": {
+            "kbk": None,
+            "graph": KernelGraphDump,
+            "pynative": None
+        }
+    }
+
+    @staticmethod
+    def create(config: DebuggerConfig):
+        tool = DumpToolFactory.tools.get(config.level)
+        if not tool:
+            raise Exception("A valid level is needed.")
+        if config.level == "api":
+            tool = tool.get("kbk")
+        elif config.level == "kernel":
+            tool = tool.get("graph")
+        elif config.level == "cell":
+            raise Exception("Cell dump is not supported yet.")
+        if not tool:
+            raise Exception("Data dump is not supported in this mode.")
+        return tool(config)
\ No newline at end of file
diff --git a/debug/accuracy_tools/atat/mindspore/dump/kernel_graph_dump.py b/debug/accuracy_tools/atat/mindspore/dump/kernel_graph_dump.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8a10ec1b1f690931871895a47014d44594ac80a
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/dump/kernel_graph_dump.py
@@ -0,0 +1,60 @@
+import os
+import json
+from atat.core.utils import make_dump_path_if_not_exists
+from atat.mindspore.debugger.debugger_config import DebuggerConfig
+from atat.core.log import print_info_log
+from atat.core.file_check_util import FileOpen
+
+
+class KernelGraphDump:
+    def __init__(self, config: DebuggerConfig):
+        self.dump_json = dict()
+        self.dump_json["common_dump_settings"] = dict()
+        self.dump_json["common_dump_settings"]["dump_mode"] = 0
+        self.dump_json["common_dump_settings"]["path"] = ""
+        self.dump_json["common_dump_settings"]["net_name"] = "Net"
+        self.dump_json["common_dump_settings"]["iteration"] = "all"
+        self.dump_json["common_dump_settings"]["saved_data"] = "statistic"
+        self.dump_json["common_dump_settings"]["input_output"] = 0
+        self.dump_json["common_dump_settings"]["kernels"] = []
+        self.dump_json["common_dump_settings"]["support_device"] = [0, 1, 2, 3, 4, 5, 6, 7]
+        self.dump_json["common_dump_settings"]["op_debug_mode"] = 0
+        self.dump_json["common_dump_settings"]["file_format"] = "npy"
+
+        if len(config.list) > 0:
+            self.dump_json["common_dump_settings"]["dump_mode"] = 1
+            self.dump_json["common_dump_settings"]["kernels"] = config.list
+        self.dump_json["common_dump_settings"]["path"] = config.dump_path
+        if len(config.step) > 0:
+            step_str = ""
+            for s in config.step:
+                step_str += (str(s) + '|')
+            self.dump_json["common_dump_settings"]["iteration"] = step_str[:-1]
+        if len(config.rank) > 0:
+            self.dump_json["common_dump_settings"]["support_device"] = config.rank
+        if config.task == "tensor":
+            self.dump_json["common_dump_settings"]["saved_data"] = "tensor"
+            self.dump_json["common_dump_settings"]["file_format"] = config.file_format
+        if len(config.data_mode) == 1:
+            if config.data_mode[0] == "input":
+                self.dump_json["common_dump_settings"]["input_output"] = 1
+            if config.data_mode[0] == "output":
+                self.dump_json["common_dump_settings"]["input_output"] = 2
+
+    def handle(self):
+        if os.getenv("GRAPH_OP_RUN") == "1":
+            raise Exception("Must run in graph mode, not kbk mode.")
+        json_path = 
self.dump_json["common_dump_settings"]["path"] + make_dump_path_if_not_exists(json_path) + json_path = os.path.join(json_path, "kernel_graph_dump.json") + with FileOpen(json_path, 'w') as f: + json.dump(self.dump_json, f) + print_info_log(json_path + " has been created.") + os.environ["MINDSPORE_DUMP_CONFIG"] = json_path + if self.dump_json["common_dump_settings"]["dump_mode"] == 0: + if self.dump_json["common_dump_settings"]["iteration"] != "all" or \ + len(self.dump_json["common_dump_settings"]["kernels"]) == 0: + os.environ["MS_ACL_DUMP_CFG_PATH"] = json_path + else: + if "MS_ACL_DUMP_CFG_PATH" in os.environ: + del os.environ["MS_ACL_DUMP_CFG_PATH"] diff --git a/debug/accuracy_tools/atat/mindspore/ms_config.py b/debug/accuracy_tools/atat/mindspore/ms_config.py new file mode 100644 index 0000000000000000000000000000000000000000..0d846c4771caca64443e170d580268ffbbdeff8e --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/ms_config.py @@ -0,0 +1,78 @@ +import json +from atat.core.common_config import CommonConfig, BaseConfig +from atat.core.file_check_util import FileOpen + + +class TensorConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.check_mode = None + self.file_format = json_config.get("file_format") + self.check_config() + self._check_config() + + def _check_config(self): + if self.data_mode is not None and len(self.data_mode) > 0: + if len(self.data_mode) > 1 or self.data_mode[0] not in ["all", "input", "output"]: + raise Exception("data_mode must be all, input or output") + if self.file_format and self.file_format not in ["npy", "bin"]: + raise Exception("file_format is invalid") + + +class StatisticsConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.file_format = None + self.check_mode = None + self.check_config() + self._check_config() + + def _check_config(self): + if self.data_mode is not None and len(self.data_mode) > 0: + if len(self.data_mode) > 1 or self.data_mode[0] not in ["all", "input", "output"]: + raise Exception("data_mode must be all, input or output") + + +class OverflowCheck(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.file_format = None + self.check_mode = json_config.get("check_mode") + self._check_config() + + def _check_config(self): + if self.data_mode is not None and len(self.data_mode) > 0: + if len(self.data_mode) > 1 or self.data_mode[0] not in ["all", "input", "output"]: + raise Exception("data_mode must be all, input or output") + if self.check_mode and self.check_mode not in ["all", "aicore", "atomic"]: + raise Exception("check_mode is invalid") + + +def parse_common_config(json_config): + return CommonConfig(json_config) + + +def parse_task_config(task, json_config): + task_map = json_config[task] + if not task_map: + task_map = dict() + if task == "tensor": + return TensorConfig(task_map) + elif task == "statistics": + return StatisticsConfig(task_map) + elif task == "overflow_check": + return OverflowCheck(task_map) + else: + raise Exception("task is invalid.") + + +def parse_json_config(json_file_path): + if not json_file_path: + raise Exception("json file path is None") + with FileOpen(json_file_path, 'r') as file: + json_config = json.load(file) + common_config = parse_common_config(json_config) + if not common_config.task: + common_config.task = "statistics" + task_config = parse_task_config(common_config.task, json_config) + return common_config, task_config diff --git 
a/debug/accuracy_tools/atat/mindspore/overflow_check/__init__.py b/debug/accuracy_tools/atat/mindspore/overflow_check/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/mindspore/overflow_check/kernel_graph_overflow_check.py b/debug/accuracy_tools/atat/mindspore/overflow_check/kernel_graph_overflow_check.py new file mode 100644 index 0000000000000000000000000000000000000000..5ef005e59e8839e19f9af600c168343251580936 --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/overflow_check/kernel_graph_overflow_check.py @@ -0,0 +1,45 @@ +import os +import json +from atat.core.utils import make_dump_path_if_not_exists +from atat.mindspore.debugger.debugger_config import DebuggerConfig +from atat.core.log import print_warn_log, print_info_log +from atat.core.file_check_util import FileOpen + + +class KernelGraphOverflowCheck: + def __init__(self, config: DebuggerConfig): + self.dump_json = dict() + self.dump_json["common_dump_settings"] = dict() + self.dump_json["common_dump_settings"]["dump_mode"] = 0 + self.dump_json["common_dump_settings"]["path"] = "" + self.dump_json["common_dump_settings"]["net_name"] = "Net" + self.dump_json["common_dump_settings"]["iteration"] = "all" + self.dump_json["common_dump_settings"]["saved_data"] = "full" + self.dump_json["common_dump_settings"]["input_output"] = 0 + self.dump_json["common_dump_settings"]["kernels"] = [] + self.dump_json["common_dump_settings"]["support_device"] = [0,1,2,3,4,5,6,7] + self.dump_json["common_dump_settings"]["op_debug_mode"] = 3 + self.dump_json["common_dump_settings"]["file_format"] = "npy" + + self.dump_json["common_dump_settings"]["path"] = config.dump_path + if len(config.step) > 0: + print_warn_log("Step would change to all in this task.") + if len(config.rank) > 0: + self.dump_json["common_dump_settings"]["support_device"] = config.rank + if config.check_mode == "aicore": + self.dump_json["common_dump_settings"]["op_debug_mode"] = 1 + elif config.check_mode == "atomic": + self.dump_json["common_dump_settings"]["op_debug_mode"] = 2 + + def handle(self): + if os.getenv("GRAPH_OP_RUN") == "1": + raise Exception("Must run in graph mode, not kbk mode") + json_path = self.dump_json["common_dump_settings"]["path"] + make_dump_path_if_not_exists(json_path) + json_path = os.path.join(json_path, "kernel_graph_overflow_check.json") + with FileOpen(json_path, 'w') as f: + json.dump(self.dump_json, f) + print_info_log(json_path + " has been created.") + os.environ["MINDSPORE_DUMP_CONFIG"] = json_path + if "MS_ACL_DUMP_CFG_PATH" in os.environ: + del os.environ["MS_ACL_DUMP_CFG_PATH"] diff --git a/debug/accuracy_tools/atat/mindspore/overflow_check/overflow_check_tool_factory.py b/debug/accuracy_tools/atat/mindspore/overflow_check/overflow_check_tool_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..fe53359be1ba1ecb73fb84138228415f68e1c2ce --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/overflow_check/overflow_check_tool_factory.py @@ -0,0 +1,32 @@ +from atat.mindspore.debugger.debugger_config import DebuggerConfig +from atat.mindspore.overflow_check.kernel_graph_overflow_check import KernelGraphOverflowCheck + + +class OverflowCheckToolFactory: + tools = { + "cell": { + "kbk": None, + "graph": None, + "pynative": None + }, + "api": { + "kbk": None, + "graph": None, + "pynative": None + }, + "kernel": { + "kbk": None, + "graph": KernelGraphOverflowCheck, + "pynative": None + } + } + + 
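# Only kernel-level overflow checking in graph mode is implemented so far; the
+    # other (level, mode) entries above are placeholders (None). KernelGraphOverflowCheck
+    # maps check_mode to op_debug_mode: "all" -> 3 (default), "aicore" -> 1, "atomic" -> 2.
+    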
@staticmethod
+    def create(config: DebuggerConfig):
+        tool = OverflowCheckToolFactory.tools.get(config.level)
+        if not tool:
+            raise Exception("A valid level is needed.")
+        tool = tool.get("graph")
+        if not tool:
+            raise Exception("Overflow check is not supported in this mode.")
+        return tool(config)
diff --git a/debug/accuracy_tools/atat/mindspore/task_handler_factory.py b/debug/accuracy_tools/atat/mindspore/task_handler_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f80e4e89c92156762ea0e4c4ed3302cc5c31f5f
--- /dev/null
+++ b/debug/accuracy_tools/atat/mindspore/task_handler_factory.py
@@ -0,0 +1,21 @@
+from atat.mindspore.debugger.debugger_config import DebuggerConfig
+from atat.mindspore.dump.dump_tool_factory import DumpToolFactory
+from atat.mindspore.overflow_check.overflow_check_tool_factory import OverflowCheckToolFactory
+
+
+class TaskHandlerFactory:
+    tasks = {
+        "tensor": DumpToolFactory,
+        "statistics": DumpToolFactory,
+        "overflow_check": OverflowCheckToolFactory
+    }
+
+    @staticmethod
+    def create(config: DebuggerConfig):
+        task = TaskHandlerFactory.tasks.get(config.task)
+        if not task:
+            raise Exception("A valid task is needed.")
+        handler = task.create(config)
+        if not handler:
+            raise Exception("Cannot find task handler.")
+        return handler
diff --git a/debug/accuracy_tools/atat/pytorch/__init__.py b/debug/accuracy_tools/atat/pytorch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..482e850f7baa845bd831e0d4728e841661b9345b
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/__init__.py
@@ -0,0 +1,4 @@
+from .debugger.precision_debugger import PrecisionDebugger
+from .common.utils import seed_all
+from .compare.acc_compare import compare
+from .compare.distributed_compare import compare_distributed
diff --git a/debug/accuracy_tools/atat/pytorch/advisor/advisor.py b/debug/accuracy_tools/atat/pytorch/advisor/advisor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ae692a998d933af01b6d8d0ebf60896fd685886
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/advisor/advisor.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+# Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+""" + +import os +import pandas as pd + +from .advisor_result import AdvisorResult +from .advisor_const import AdvisorConst +from ...core.utils import CompareException, CompareConst, Const, print_info_log, print_warn_log, print_error_log +from ...core.file_check_util import FileChecker, FileCheckConst + + +class Advisor: + """ + Class for generate advisor + """ + + def __init__(self, input_data, out_path=""): + self.input_data = input_data + self.out_path = os.path.realpath(out_path) + + def _parse_input_data(self): + data_columns = self.input_data.columns.values + if {CompareConst.ACCURACY, CompareConst.NPU_NAME}.issubset(data_columns): + self.file_type = Const.ALL + elif {CompareConst.RESULT, CompareConst.NPU_MD5}.issubset(data_columns): + self.file_type = Const.MD5 + elif {CompareConst.MAX_DIFF, CompareConst.RESULT}.issubset(data_columns): + self.file_type = Const.SUMMARY + else: + print_error_log('Compare result does not meet the required conditions.') + raise CompareException(CompareException.INVALID_DATA_ERROR) + df = self.input_data.reset_index() + return df + + def _check_path_vaild(self): + out_path_checker = FileChecker(self.out_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + out_path_checker.common_check() + + def gen_advisor_message(self, node_name): + if AdvisorConst.FORWARD in node_name: + if AdvisorConst.INPUT in node_name: + message = AdvisorConst.FORWARD_INPUT_SUGGEST + else: + message = AdvisorConst.FORWARD_OUTPUT_SUGGEST + message = self.deterministic_advisor(message, node_name) + else: + if AdvisorConst.INPUT in node_name: + message = AdvisorConst.BACKWARD_INPUT_SUGGEST + else: + message = AdvisorConst.BACKWARD_OUTPUT_SUGGEST + message = self.deterministic_advisor(message, node_name) + message = self.batch_norm_advisor(message, node_name) + return message + + @staticmethod + def deterministic_advisor(message, node_name): + for api_name in AdvisorConst.NEED_DETERMINISTIC_API: + if api_name in node_name: + return AdvisorConst.DETERMINISTIC_SUGGEST + return message + + @staticmethod + def batch_norm_advisor(message, node_name): + if AdvisorConst.FUNC_BATCH_NORM in node_name and AdvisorConst.FORWARD_INPUT_1 in node_name: + message = AdvisorConst.BATCH_NORM_SUGGEST + return message + + def analyze_unmatched(self, analyze_data): + if self.file_type == Const.ALL: + accuracy_unmatched = analyze_data[analyze_data[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_UNMATCH] + else: + accuracy_unmatched = analyze_data[(analyze_data[CompareConst.NPU_SHAPE] == CompareConst.NAN) | + (analyze_data[CompareConst.BENCH_SHAPE] == CompareConst.NAN)] + num_unmatch = len(accuracy_unmatched) + if num_unmatch != 0: + for i in range(len(accuracy_unmatched)): + item = accuracy_unmatched.iloc[i] + print_warn_log("The tensor name matches but the shape or dtype does not match: {}" + .format(item[CompareConst.NPU_NAME])) + + def gen_advisor_result(self, pd_data): + first_failing_data = pd_data.iloc[0] + node_name = first_failing_data[CompareConst.NPU_NAME] + index = first_failing_data['index'] + message = self.gen_advisor_message(node_name) + print_warn_log("Find %s accuracy not reached, the line is %s" % (node_name, index)) + result = AdvisorResult(node_name, index, message) + return result + + def analysis(self): + self._check_path_vaild() + analyze_data = self._parse_input_data() + print_info_log("Start analyzing the comparison result: %s" % self.file_type) + self.analyze_unmatched(analyze_data) + if self.file_type == Const.ALL: + failing_data = 
analyze_data[analyze_data[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_NO] + elif self.file_type == Const.MD5: + failing_data = analyze_data[analyze_data[CompareConst.RESULT] == CompareConst.DIFF] + elif self.file_type == Const.SUMMARY: + failing_data = analyze_data[analyze_data[CompareConst.RESULT] == CompareConst.WARNING] + if failing_data.empty: + print_info_log("All data from api input/output accuracy reached") + result = AdvisorResult(AdvisorConst.NO_ERROR_API, AdvisorConst.NO_ERROR_API, AdvisorConst.NO_ERR_SUGGEST) + else: + result = self.gen_advisor_result(failing_data) + message_list = result.print_advisor_log() + result.gen_summary_file(self.out_path, message_list) diff --git a/debug/accuracy_tools/atat/pytorch/advisor/advisor_const.py b/debug/accuracy_tools/atat/pytorch/advisor/advisor_const.py new file mode 100644 index 0000000000000000000000000000000000000000..737c675911216bad767eadf4ec8d95c92f8e9173 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/advisor/advisor_const.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + + +class AdvisorConst: + """ + Class for advisor const + """ + + # text symbol + NEW_LINE = "\n" + COLON = ": " + + # advisor summary key + SUSPECT_NODES = "Suspect Nodes" + LINE = "Line" + ADVISOR_SUGGEST = "Expert Advice" + + NO_ERROR_API = "NA" + + # advisor message + NO_ERR_SUGGEST = "All data in comparison result meets the accuracy requirements." + FORWARD_INPUT_SUGGEST = "1. Analyze the model to view the input source.\n" \ + "2. Check whether an inplace API causes the output result to overwrite the input result. That is, the fault is actually caused by a computation error.\n" \ + "3. The fault may be caused by memory corruption and further analysis is required." + FORWARD_OUTPUT_SUGGEST = "This is a forward API computation error. Check the computation implementation." + BACKWARD_INPUT_SUGGEST = "Check whether the forward computation result is affected." + BACKWARD_OUTPUT_SUGGEST = "This is a backward API computation error. Check the computation implementation." + BATCH_NORM_SUGGEST = "Torch API batch_norm input not fixed, the following suggestions may fix it:\n" \ + "1. If use torch.nn.functional.batch_norm, you can set parameter training=False.\n" \ + "2. If use torch.nn.BatchNormXXX, you can set parameter affine=False.\n" \ + "3. Use seed_all(mode=True) to enable deterministic computing." + DETERMINISTIC_SUGGEST = "This torch api may be uncertainty in the calculation, " \ + "can seed_all(mode=True) to enable deterministic computing." 
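+    # Message selection (see Advisor.gen_advisor_message): the FORWARD/BACKWARD and
+    # INPUT/OUTPUT keywords below pick one of the four *_SUGGEST messages above, and a
+    # hit in NEED_DETERMINISTIC_API or a Functional_batch_norm forward_input.1 match
+    # then swaps in the deterministic or batch_norm advice.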
+ + FUNC_BATCH_NORM = "Functional_batch_norm" + FORWARD_INPUT_1 = "forward_input.1" + NEED_DETERMINISTIC_API = ["conv2d", "conv3d", "matmul", "nll_loss", "layer_norm", "lstm"] + BATCH_NORM = "batch_norm" + + # name keyword + INPUT = "input" + OUTPUT = "output" + FORWARD = "forward" + BACKWARD = "backward" diff --git a/debug/accuracy_tools/atat/pytorch/advisor/advisor_result.py b/debug/accuracy_tools/atat/pytorch/advisor/advisor_result.py new file mode 100644 index 0000000000000000000000000000000000000000..f8a16d2a7067d7ef2fa0746e32258f9da17624df --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/advisor/advisor_result.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import os +import time + +from .advisor_const import AdvisorConst +from ...core.utils import Const, print_info_log, print_error_log +from ...core.file_check_util import FileCheckConst, change_mode + + +class AdvisorResult: + """ + Class for generate advisor result + """ + + def __init__(self, node, line, message): + self.suspect_node = node + self.line = line + self.advisor_message = message + + @staticmethod + def gen_summary_file(out_path, message_list): + file_name = 'advisor_{}.txt'.format(time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + result_file = os.path.join(out_path, file_name) + try: + with os.fdopen(os.open(result_file, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as output_file: + output_file.truncate(0) + message_list = [message + AdvisorConst.NEW_LINE for message in message_list] + output_file.writelines(message_list) + change_mode(result_file, FileCheckConst.DATA_FILE_AUTHORITY) + except IOError as io_error: + print_error_log("Failed to save %s, the reason is %s." 
% (result_file, io_error)) + else: + print_info_log("The advisor summary is saved in: %s" % result_file) + + def print_advisor_log(self): + print_info_log("The summary of the expert advice is as follows: ") + message_list = [AdvisorConst.LINE + AdvisorConst.COLON + str(self.line), + AdvisorConst.SUSPECT_NODES + AdvisorConst.COLON + self.suspect_node, + AdvisorConst.ADVISOR_SUGGEST + AdvisorConst.COLON + self.advisor_message] + for message in message_list: + print_info_log(message) + return message_list diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/.keep b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/README.md b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7738501db87b1cacbc9eb96687bf09aed3a5ed68 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/README.md @@ -0,0 +1,528 @@ +# Ascend模型精度预检工具 + +Ascend模型精度预检工具能在昇腾NPU上扫描用户训练模型中所有API,输出精度情况的诊断和分析。工具会提取模型中所有的API前反向信息,构造相应的API单元测试,将NPU输出与标杆(CPU高精度)比对,从而检测出精度有问题的API;另外工具还可以通过新精度标准比对法,从而确认NPU和GPU各自运行时的精度哪一方更接近标杆(CPU高精度)。 + +**新精度标准比对法**:依据新精度标准,对不同的API采取不同的比对算法进行比对(包括绝对阈值法,标杆比对法和二进制一致法),最终给定预检判定结果。 + +**真实数据模式**:精度预检工具支持随机生成模式和真实数据模式,即在预检dump时可以选择由工具构造随机数进行输入获得dump数据或选择获取真实输入数据进行预检dump操作;随机生成模式执行效率高,可以快速获得结果,但数据精度低,只能大致判断精度问题;真实数据模式执行效率略低于随机生成模式,但是数据精度高,可以准确判断精度问题。 + +工具支持PyTorch版本:1.11.0/2.0/2.1。 + +## 工具特性 + +1. 落盘数据小。 +2. 不依赖标杆侧GPU训练资源,本地即可完成预检(新精度标准比对法除外)。 +3. 支持随机生成模式和真实数据模式。 +4. 单API测试,排除整网中的累计误差问题。 + +## 预检流程 + +精度预检可以分为:标准模式(直接进行NPU vs CPU高精度的预检比对操作)和新精度标准比对法(将NPU vs CPU高精度的预检比对结果和GPU vs CPU高精度的预检比对结果进行比对汇总),两种模式操作流程如下。 + +### 标准模式 + +1. 在NPU环境下安装预检工具。详见“**工具安装**”。 +2. 在NPU环境下dump预检数据。详见“**dump预检数据**”。 +3. NPU环境下执行run_ut。详见“**run_ut预检操作**”。 +4. 查看“**预检结果**”。 + +### 新精度标准比对法 + +1. 在NPU和GPU环境下分别安装预检工具。详见“**工具安装**”。 +2. 在NPU环境下dump预检数据(使用msCheckerConfig.update_config开启真实数据模式)。详见“**dump预检数据**”。 +3. 将NPU环境下dump的预检数据拷贝至GPU环境。 +4. 在NPU和GPU环境下分别执行run_ut。详见“**run_ut预检操作**”。 +5. 将NPU和GPU执行run_ut生成的`accuracy_checking_details_{timestamp}.csv`结果文件拷贝至同一环境下。 +6. 运行api_precision_compare.py。详见“**预检结果比对**”。 + +## 工具安装 + +1. 将att仓代码下载到本地,并配置环境变量。假设下载后att仓路径为 $ATT_HOME,环境变量应配置为: + + ```bash + export PYTHONPATH=$PYTHONPATH:$ATT_HOME/debug/accuracy_tools/ + ``` + +2. 安装依赖tqdm、rich、pyyaml、pandas + + ```bash + pip3 install tqdm rich pyyaml pandas + ``` + +## 预检操作 + +### dump预检数据 + +#### dump操作 + +在训练脚本(如main.py)中加入以下代码导入工具dump模块,启动训练即可自动抓取网络所有API信息。 + +- 若训练脚本中的代码不是通过torch.utils.data.dataloader来加载数据或在部分流水并行、张量并行场景下,工具的开关无法在每张卡上自动打开,导致多卡训练dump结果只有一组json,那么需要在训练代码中添加打开工具开关的调用。 + + 在训练代码中添加数据dump操作如下: + + ```Python + import api_accuracy_checker.dump as DP + + # 需要先修改enable_dataloader参数值为False + # 关闭torch.utils.data.dataloader加载数据时,下列代码须在训练step代码内添加 + DP.dump.start() # 开启工具dump模块 + + ... 
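+  # one training step's forward and backward computation runs here, i.e. everything
+  # between DP.dump.start() and DP.dump.stop()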
+ + DP.dump.stop() # 控制dump结束 + DP.dump.step() # 在DP.dump.stop()后加入DP.dump.step()即可指定需要dump的step + ``` + + 上述代码要添加在迭代内,如对于[ModelLink](https://gitee.com/ascend/ModelLink)的LLAMA2-7B可以添加在training.py中train函数的iteration循环内。之后工具会适配这个场景开关的自动打开。 + +- 如果训练脚本是通过torch.utils.data.dataloader方式加载数据。 + + 首先,需要开启torch.utils.data.dataloader加载数据,操作如下: + + ```bash + cd att/debug/accuracy_tools/api_accuracy_checker + vi config.yaml + # 修改enable_dataloader参数值为True + ``` + + 其次,在训练脚本中加入以下代码导入工具dump模块,启动训练即可自动抓取网络所有API信息。 + + ```python + import api_accuracy_checker.dump + ``` + + 工具默认抓取训练的**第二个迭代**并且在第二个迭代后会报错退出训练进程,可通过target_iter参数配置。 + + **报错信息如下,这个报错仅用于停止训练,属于正常现象**: + + ```bash + Exception: Model pretest: exit after iteration 1. + ``` + + 若报错信息不一致,可能是由于服务器的其他错误信息覆盖导致,可以尝试查找报错信息中的Exception。 + +dump信息默认会存盘到“./step1”路径下(相对于启动训练的路径),包括: + +- forward_info_{pid}.json:前向API信息文件。 +- backward_info_{pid}.json:反向API信息文件。 +- stack_info_{pid}.json:调用栈信息文件。 + +forward_info与stack_info中的key值一一对应,用户可根据forward_info中API的key在stack_info中查询到其调用栈及代码行位置。 + +若有需要,用户可以通过msCheckerConfig.update_config来配置dump路径以及开启**真实数据模式**、指定dump某个step或配置**API dump白名单**,详见“**msCheckerConfig.update_config**”。 + +#### 真实数据模式 + +预检工具默认为随机数据模式,如果想要完全复刻整网的API运行情况,可以使用真实数据模式,添加以下代码即可: + +```python +from api_accuracy_checker.dump import msCheckerConfig +msCheckerConfig.update_config(real_data=True) +``` + +#### API dump白名单 + +精度预检工具可以对指定API进行预检操作,可以在dump时的训练脚本中直接添加白名单参数,只dump指定的API数据,示例代码如下: + +```python +from api_accuracy_checker.dump import msCheckerConfig +msCheckerConfig.update_config(white_list=["conv1d", "conv2d"]) +``` + +配置的API名称须存在于[support_wrap_ops.yaml](./hook_module/support_wrap_ops.yaml)文件下。 + +#### 工具支持的API列表 + +预检工具维护固定的API支持列表,若需要删除或增加dump的API,可以在[support_wrap_ops.yaml](./hook_module/support_wrap_ops.yaml)文件内手动修改,如下示例: + +```bash +functional: # functional为算子类别,找到对应的类别,在该类别下按照下列格式删除或添加API + - conv1d + - conv2d + - conv3d +``` + +#### msCheckerConfig.update_config + +**功能说明** + +配置精度预检dump时的属性。 + +可选配置。 + +**函数原型** + +```python +msCheckerConfig.update_config(dump_path="./", real_data=False, target_iter=[1], white_list=[], enable_dataloader=False) +``` + +**参数说明** + +| 参数名称 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| dump_path | 设置dump路径,默认为当前目录。若指定目录不存在,则自动创建。 | 否 | +| real_data | 真实数据模式,可取值True或False,默认为False,表示随机数据模式,配置为True后开启真实数据模式,dump信息增加forward_real_data和backward_real_data目录,目录下保存每个API输入的具体数值。 | 否 | +| target_iter | 指定dump某个step的数据,默认为[1],须指定为训练脚本中存在的step。target_iter为list格式,可配置逐个step,例如:target_iter=[0,1,2];也可以配置step范围,例如:target_iter=list(range(0,9)),表示dump第0到第8个step。 | 否 | +| white_list | API dump白名单,指定dump具体API数据,也可以直接配置预检的API白名单,详细请参见“**API预检白名单**”。参数示例:white_list=["conv1d", "conv2d"]。默认未配置白名单,即dump全量API数据。 | 否 | +| enable_dataloader | 自动dump数据开关,可取值True(开启)、False(关闭),默认关闭。 | 否 | + +### run_ut预检操作 + +完成“dump预检数据”后,仅仅获取了API的输入数据,为了得到NPU vs CPU高精度(标杆)的预检比对结果和GPU vs CPU高精度(标杆)的预检比对结果,还需要进行run_ut操作。 + +run_ut预检操作包括如下场景: + +- 使用run_ut.py执行预检:run_ut.py适用于数据量较小的单卡场景。 +- 使用multi_run_ut.py执行多线程预检:multi_run_ut.py适用于数据量较大的大模型场景。 + +#### 使用run_ut.py执行预检 + +1. 
将API信息输入给run_ut模块运行精度检测并比对,运行如下命令: + + ```bash + cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut + python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json + ``` + + 某些场景下(如推理),可以不指定backward_info_0.json,不影响预检功能。 + + | 参数名称 | 说明 | 是否必选 | + | -------------------------------- | ------------------------------------------------------------ | ---------------------------------- | + | -forward或--forward_input_file | 指定前向API信息文件forward_info_{pid}.json。 | 是 | + | -backward或--backward_input_file | 指定反向API信息文件backward_info_{pid}.json。 | 否 | + | -save_error_data | 保存精度未达标的API输入输出数据。 | 否 | + | -o或--out_path | 指定run_ut执行结果存盘路径,默认“./”(相对于run_ut的路径)。 | 否 | + | -j或--jit_compile | 开启jit编译。 | 否 | + | -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0。 | 否 | + | -csv_path或--result_csv_path | 指定本次运行中断时生成的`accuracy_checking_result_{timestamp}.csv`文件路径,执行run_ut中断时,若想从中断处继续执行,配置此参数即可。需要指定为上次中断的`accuracy_checking_result_{timestamp}.csv`文件。详见“**断点续检**”。 | run_ut操作中断后继续执行场景下必选 | + | -real_data_path | 指定run_ut操作的真实数据路径。真实数据dump模式通过**msCheckerConfig.update_config**接口的real_data参数开启。指定绝对路径为forward_real_data和backward_real_data目录的父目录。 | dump的数据为真实数据下必选 | + | -f或--filter_api | 过滤模型中除最大值和最小值以外其他参数和结构相同的API。适用于模型较大且重复API较多的场景。 | 否 | + + run_ut执行结果包括`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`两个文件。`accuracy_checking_result_{timestamp}.csv`是API粒度的,标明每个API是否通过测试。建议用户先查看`accuracy_checking_result_{timestamp}.csv`文件,对于其中没有通过测试的或者特定感兴趣的API,根据其API name字段在`accuracy_checking_details_{timestamp}.csv`中查询其各个输出的达标情况以及比较指标。详细介绍请参见“**预检结果**”。 + +2. (可选)如果需要保存比对不达标的输入和输出数据,可以在run_ut执行命令结尾添加-save_error_data,例如: + + ```bash + python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -save_error_data + ``` + + 数据默认会存盘到'./ut_error_data{timestamp}'路径下(相对于启动run_ut的路径),有需要的话,用户可以通过修改att/debug/accuracy_tools/api_accuracy_checker目录下,config.yaml文件的error_data_path参数来配置保存路径,详见“config.yaml文件说明”。。 + +3. 
(可选)如果dump的数据为真实数据,那么需要指定真实数据路径,例如: + + ```bash + python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -real_data_path /home/xxx/ut/real_data + ``` + +#### 使用multi_run_ut.py执行多线程预检 + +multi_run_ut.py脚本,可以并行执行多个run_ut操作,从而降低预检耗时。 + +命令示例如下: + +```bash +cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut +python multi_run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -n 32 -d 0 1 2 3 +``` + +某些场景下(如推理),可以不指定backward_info_0.json,不影响预检功能。 + +| 参数名称 | 说明 | 是否必选 | +| -------------------------------- | ------------------------------------------------------------ | ---------------------------------- | +| -forward或--forward_input_file | 指定前向API信息文件forward_info_{pid}.json。 | 是 | +| -backward或--backward_input_file | 指定反向API信息文件backward_info_{pid}.json。 | 否 | +| -save_error_data | 保存精度未达标的API输入输出数据。 | 否 | +| -o或--out_path | 指定run_ut执行结果存盘路径,默认“./”(相对于run_ut的路径)。 | 否 | +| -j或--jit_compile | 开启jit编译。 | 否 | +| -n | 同时执行run_ut线程的数量,默认为8,最大支持64,但每个Device最大支持8个线程,当指定多个线程和多个Device时,则线程数在每张卡上均分。 | 否 | +| -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0,支持同时指定0~7,共8个Device。 | 否 | +| -csv_path或--result_csv_path | 指定本次运行中断时生成的`accuracy_checking_result_{timestamp}.csv`文件路径,执行run_ut中断时,若想从中断处继续执行,配置此参数即可。需要指定为上次中断的`accuracy_checking_result_{timestamp}.csv`文件。详见“**断点续检**”。 | run_ut操作中断后继续执行场景下必选 | +| -real_data_path | 指定run_ut操作的真实数据路径。真实数据dump模式通过**msCheckerConfig.update_config**接口的real_data参数开启。指定绝对路径为forward_real_data和backward_real_data目录的父目录。 | dump的数据为真实数据下必选 | +| -f或--filter_api | 过滤模型中除最大值和最小值以外其他参数和结构相同的API。适用于模型较大且重复API较多的场景。 | 否 | + +#### 断点续检 + +精度预检run_ut过程中,若因环境、数据量过大等原因导致预检进程中断,那么当用户解决这些问题后,重新执行run_ut操作,可以通过断点续检操作继续前面未完成的预检,会在-csv_path指定的`accuracy_checking_result_{timestamp}.csv`文件以及对应的`accuracy_checking_details_{timestamp}.csv`文件中继续写入后续的结果,不会重新创建结果文件。 + +须指定为上次预检中断的`accuracy_checking_result_{timestamp}.csv`文件。请勿修改`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`文件名,包括时间戳,否则断点续检会因无法识别到文件名而失败。 + +断点续检操作通过如下命令执行: + +```bash +python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -csv_path /home/xxx/ut/accuracy_checking_result_{timestamp}.csv +``` + +#### API预检白名单 + +run_ut过程同样支持API预检白名单,操作方式如下: + +修改att/debug/accuracy_tools/api_accuracy_checker目录下config.yaml文件的white_list参数,配置需要预检的API名称,详见“config.yaml文件说明”。 + +### config.yaml文件说明 + +config.yaml文件可以通过配置参数来控制dump和run_ut操作的真实数据模式以及白名单等功能。 + +文件路径为:att/debug/accuracy_tools/api_accuracy_checker/config.yaml + +| 参数名称 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| dump_path | 设置dump路径,默认为当前目录。若指定目录不存在,则自动创建。 | 否 | +| real_data | 真实数据模式,可取值True或False,默认为False,表示随机数据模式,配置为True后开启真实数据模式,dump信息增加forward_real_data和backward_real_data目录,目录下保存每个API输入的具体数值。 | 否 | +| enable_dataloader | 自动dump数据开关,可取值True(开启)、False(关闭),默认关闭。 | 否 | +| target_iter | 指定dump某个step的数据,默认为[1],须指定为训练脚本中存在的step。target_iter为list格式,可配置逐个step,例如:target_iter=[0,1,2];也可以配置step范围,例如:target_iter=list(range(0,9)),表示dump第0到第8个step。 | 否 | +| white_list | API dump白名单,指定dump具体API数据,也可以直接配置预检的API白名单,详细请参见“**API预检白名单**”。参数示例:white_list=["conv1d", "conv2d"]。默认未配置白名单,即dump全量API数据。 | 否 | +| error_data_path | 配置保存精度未达标的API输入输出数据路径。 | 否 | +| jit_compile | 开启jit编译。 | 否 | +| precision | 浮点数表示位数,默认取小数点后14位。 | 否 | + +## 预检结果 + +精度预检生成的`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`文件示例如下: + +可以通过先查看`accuracy_checking_result_{timestamp}.csv`文件的Forward Test Success和Backward Test 
Success,判断是否存在未通过测试的API,再查看`accuracy_checking_details_{timestamp}.csv`文件的API详细达标情况,API达标情况介绍请参见“**API预检指标**”。 + +`accuracy_checking_result_{timestamp}.csv` + +![891a3bd8_12631423](img/accuracy_checking_result.png) + +| 字段 | 含义 | +| --------------------- | ------------------------------------------------------------ | +| API name | API名称。 | +| Forward Test Success | 前向API是否通过测试,pass为通过,warning为待观察,error为错误。 | +| Backward Test Success | 反向API是否通过测试,pass为通过,warning为待观察,error为错误,如果是空白的话代表该API没有反向输出。 | +| Message | 提示信息。 | + +Forward Test Success和Backward Test Success是否通过测试是由`accuracy_checking_details_{timestamp}.csv`中的余弦相似度、最大绝对误差、双百双千双万指标判定结果决定的。 + +需要注意的是`accuracy_checking_details_{timestamp}.csv`中可能存在一个API的前向(反向)有多个输出,那么每个输出记录一行,而在`accuracy_checking_result_{timestamp}.csv`中的结果需要该API的所有结果均为pass才能标记为TRUE,否则标记FALSE或WARING。 + +`accuracy_checking_details_{timestamp}.csv` + +![f07237b1_12631423](img/accuracy_checking_details.png) + +| 字段 | 含义 | +| ---------------- | ------------------------------------------------------------ | +| API name | NPU或GPU下的API名称。 | +| Bench Dtype | 标杆数据的API数据类型。 | +| Device Dtype | NPU或GPU数据的API数据类型。 | +| Shape | API的Shape信息。 | +| 余弦相似度 | NPU或GPU数据与标杆数据的余弦相似度。 | +| 最大绝对误差 | NPU或GPU数据与标杆数据的最大绝对误差。 | +| 双百指标 | 双百精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于百分之一的个数占总元素个数的比例。测试通过标准为相对误差大于百分之一的个数占总元素个数的比例小于百分之一。 | +| 双千指标 | 双千精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于千分之一的个数占总元素个数的比例。测试通过标准为相对误差大于千分之一的个数占总元素个数的比例小于千分之一。 | +| 双万指标 | 双万精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于万分之一的个数占总元素个数的比例。测试通过标准为相对误差大于万分之一的个数占总元素个数的比例小于万分之一。 | +| 二进制一致错误率 | NPU或GPU数据中每个Tensor精度不一致的数值的数量与Tensor中数值数量的比值。只有数据是builtin类型(bool、int、float、str)、torch.bool和torch的int类型才会展示。 | +| 误差均衡性 | NPU或GPU数据与标杆数据精度差的上下浮动情况。 | +| 均方根误差 | NPU或GPU数据与标杆数据的均方根误差。 | +| 小值域错误占比 | NPU或GPU Tensor中与标杆的绝对误差大于错误阈值的小值在小值域(小值的总数量)中的占比。判断为小值以及绝对误差的错误阈值见“**小值域阈值**”。 | +| 相对误差最大值 | NPU或GPU数据与标杆数据相对误差的最大值。 | +| 相对误差平均值 | NPU或GPU数据与标杆数据相对误差的平均值。 | +| inf/nan错误率 | NPU与标杆inf/nan计算不一致的元素个数占总元素的个数比例。 | +| 相对误差错误率 | NPU与标杆的正常值计算相对误差,其大于错误阈值的元素个数占正常值元素个数的比例。 | +| 绝对误差错误率 | NPU与标杆的小值计算绝对误差,其大于错误阈值的元素个数占小值元素个数的比例。 | +| Status | API预检通过状态,pass表示通过测试,error表示未通过,warning表示测试未通过双千或双万精度指标,SKIP表示该API的某个参数的反向不要计算梯度,所以没有任何计算过程,其他信息均为空。 | +| message | 提示信息。 | + +### 小值域阈值 + +判定为小值的阈值为: + +- torch.float32:e-6 +- torch.float16:e-3 +- torch.bfloat16:e-3 + +小值域的绝对误差阈值为: + +- torch.float32:e-9 +- torch.float16:e-5 +- torch.bfloat16:e-5 + +### API预检指标 + +API预检指标是通过对`accuracy_checking_details_{timestamp}.csv`中的余弦相似度、最大绝对误差双百、双千、双万精度指标的数值进行判断,得出该API是否符合精度标准的参考指标。 + +API预检通过测试,则在`accuracy_checking_details_{timestamp}.csv`文件中的“Status”列标记“pass”,否则标记“error”或“warning”,详细规则如下: + +1. 余弦相似度 > 0.99:≤ 0.99为不达标,标记“error”,> 0.99达标,进行下一步; +2. 最大绝对误差 < 0.001:< 0.001达标,标记“pass”,≥ 0.001为不达标,进行下一步; +3. 双百、双千、双万精度指标: + - 对于float16和bfloat16数据:双百指标不通过,标记“error”;双百指标通过,双千指标不通过,标记“warning”;双百、双千指标均通过,标记“pass”。 + - 对于float32和float64数据:双千指标不通过,标记“error”;双千指标通过,双万指标不通过,标记“warning”;双千、双万指标均通过,标记“pass”。 + +4. 
+### API pre-check metrics
+
+The API pre-check metrics judge the cosine similarity, maximum absolute error, and the one-percent, one-thousandth and one-ten-thousandth precision metrics in `accuracy_checking_details_{timestamp}.csv` to decide whether an API meets the accuracy standard.
+
+An API that passes the pre-check is marked "pass" in the "Status" column of `accuracy_checking_details_{timestamp}.csv`; otherwise it is marked "error" or "warning", according to the following rules:
+
+1. Cosine similarity > 0.99: ≤ 0.99 fails and is marked "error"; > 0.99 passes, continue to the next step;
+2. Maximum absolute error < 0.001: < 0.001 passes and is marked "pass"; ≥ 0.001 fails, continue to the next step;
+3. One-percent, one-thousandth and one-ten-thousandth metrics:
+   - For float16 and bfloat16 data: if the one-percent metric fails, mark "error"; if it passes but the one-thousandth metric fails, mark "warning"; if both pass, mark "pass".
+   - For float32 and float64 data: if the one-thousandth metric fails, mark "error"; if it passes but the one-ten-thousandth metric fails, mark "warning"; if both pass, mark "pass".
+
+4. The "Forward Test Success" and "Backward Test Success" fields of `accuracy_checking_result_{timestamp}.csv` aggregate the per-output results: an operator whose outputs are all marked "pass" is marked "TRUE" (test passed). Since an operator may have several forward or backward inputs or outputs, all of them must be "pass" for the operator to be marked "TRUE"; if even one is marked "error" or "warning", the operator is marked "FALSE" (test failed).
+
+## Comparing pre-check results
+
+This step is only required for the new accuracy-standard comparison method. It needs the `accuracy_checking_details_{timestamp}.csv` pre-check results from run_ut in both the NPU and GPU environments. Compare the NPU and GPU results with:
+
+```bash
+cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/compare
+python api_precision_compare.py -npu /home/xxx/npu/accuracy_checking_details_{timestamp}.csv -gpu /home/xxx/gpu/accuracy_checking_details_{timestamp}.csv -o /home/xxx/
+```
+
+| Parameter | Description | Required |
+| -------------------- | ------------------------------------------------------------ | -------- |
+| -npu or --npu_csv_path | Path of the NPU pre-check result `accuracy_checking_details_{timestamp}.csv`. By default the file is looked up in the current directory. | No |
+| -gpu or --gpu_csv_path | Path of the GPU pre-check result `accuracy_checking_details_{timestamp}.csv`. By default the file is looked up in the current directory. | No |
+| -o or --out_path | Output path for api_precision_compare.py results; defaults to the current directory. | No |
+
+On completion the tool writes `api_precision_compare_result_{timestamp}.csv` and `api_precision_compare_details_{timestamp}.csv`. Examples are shown below.
+
+First check the Forward Test Success and Backward Test Success columns of `api_precision_compare_result_{timestamp}.csv` to see whether any API failed, then inspect the detailed per-API results in `api_precision_compare_details_{timestamp}.csv`.
+
+`api_precision_compare_result_{timestamp}.csv`
+
+![api_precision_compare_result](img/api_precision_compare_result.png)
+
+| Field | Meaning |
+| --------------------- | ------------------------------------------------------------ |
+| API name | Name of the API. |
+| Forward Test Success | Whether the forward API passed: pass, warning (needs attention), or error; skip means the API's data type (such as float64) does not support comparison under the new accuracy standard. |
+| Backward Test Success | Whether the backward API passed: pass, warning (needs attention), or error; blank means the API has no backward output; skip means the API's data type (such as float64) does not support comparison under the new accuracy standard. |
+| Message | Additional information. |
+
+Forward Test Success and Backward Test Success are determined by the individual metric verdicts in `api_precision_compare_details_{timestamp}.csv`. Note that an API may have several forward (or backward) outputs in `api_precision_compare_details_{timestamp}.csv`, one row per output, whereas the entry in `api_precision_compare_result_{timestamp}.csv` is marked TRUE only if all of the API's results are pass; otherwise it is marked FALSE or WARNING.
+
+`api_precision_compare_details_{timestamp}.csv`
+
+![api_precision_compare_details](img/api_precision_compare_details.png)
+
+| Field | Meaning |
+| ------------------------ | ------------------------------------------------------------ |
+| API name | Name of the API on the NPU or GPU. |
+| Small-value error ratio | (NPU vs CPU small-value error rate) / (GPU vs CPU small-value error rate). |
+| Small-value error verdict | pass if the ratio is ≤ 1, warning if it is between 1 and 2, error if it is above 2. |
+| RMSE ratio | (NPU vs CPU RMSE) / (GPU vs CPU RMSE). |
+| RMSE verdict | pass if the ratio is ≤ 1, warning if it is between 1 and 2, error if it is above 2. |
+| Maximum relative error ratio | (NPU vs CPU maximum relative error) / (GPU vs CPU maximum relative error). |
+| Maximum relative error verdict | pass if the ratio is ≤ 1, warning if it is between 1 and 10, error if it is above 10. |
+| Mean relative error ratio | (NPU vs CPU mean relative error) / (GPU vs CPU mean relative error). |
+| Mean relative error verdict | pass if the ratio is ≤ 1, warning if it is between 1 and 2, error if it is above 2. |
+| Error balance ratio | (NPU vs CPU error balance) / (GPU vs CPU error balance). |
+| Error balance verdict | pass if the ratio is ≤ 1, warning if it is between 1 and 2, error if it is above 2. This field does not yet contribute to the api_precision_compare_result verdict. |
+| inf/nan error rate | Proportion of elements whose inf/nan results differ between the NPU and the benchmark. |
+| inf/nan verdict | pass if the inf/nan error rate equals 0, error otherwise. |
+| Relative error rate | For normal values, the proportion of elements whose relative error versus the benchmark exceeds the error threshold. |
+| Relative error verdict | pass if the relative error rate equals 0, error otherwise. |
+| Absolute error rate | For small values, the proportion of elements whose absolute error versus the benchmark exceeds the error threshold. |
+| Absolute error verdict | pass if the absolute error rate equals 0, error otherwise. |
+| Binary consistency error rate | Ratio of mismatching values to the total number of values in each NPU/GPU tensor. Shown only for builtin types (bool, int, float, str), torch.bool and torch integer types, or for APIs compared with the binary consistency algorithm under the new accuracy standard. |
+| Binary consistency verdict | pass if the binary consistency error rate equals 0, error otherwise. |
+| Compare result | Final verdict across all metrics: error if any metric is error, warning if any metric is warning, pass otherwise. |
+| Compare algorithm | Comparison algorithm used for the API: benchmark comparison, binary consistency, or absolute threshold. |
+| Message | Additional information. Currently lists the failing metrics when the API's compare result is error or warning. |
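+The ratio-based verdicts in the table can be abstracted as the following sketch (a hypothetical example; the thresholds match the table above and the benchmark_algorithms_thresholds table in the tool's source):
+
+```python
+# (error threshold, warning threshold) per metric; ratio = NPU-side metric / GPU-side metric
+THRESHOLDS = {
+    "small_value_err_ratio": (2, 1),
+    "rmse_ratio": (2, 1),
+    "max_rel_err_ratio": (10, 1),
+    "mean_rel_err_ratio": (2, 1),
+    "error_balance_ratio": (2, 1),
+}
+
+def judge(metric, ratio):
+    error_thd, warning_thd = THRESHOLDS[metric]
+    if ratio > error_thd:
+        return "error"
+    if ratio > warning_thd:
+        return "warning"
+    return "pass"
+```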
+# Overflow analysis tool
+
+For the overflow detection scenario during training (see "overflow detection" in the [ptdbg_ascend documentation](https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/ptdbg_ascend/doc)), APIs whose inputs are normal but whose outputs overflow are dumped into the training working directory, classified into forward and backward, and saved as `forward_info_{pid}.json`. This tool parses `forward_info_{pid}.json` for APIs that overflow in the forward pass and reports whether each overflow is normal or abnormal, helping users triage quickly.
+
+Supported PyTorch versions: 1.8.1/1.11.0/2.0/2.1.
+
+If the overflow detection dump produced a `forward_info_{pid}.json` file, parse it with this tool as follows:
+
+1. Install the pre-check tool
+
+   Download the att repository and configure the environment variable. Assuming the local path of the repository is $ATT_HOME, the variable should be set to
+
+   ```bash
+   export PYTHONPATH=$PYTHONPATH:$ATT_HOME/debug/accuracy_tools/
+   ```
+
+   Install the dependencies tqdm, rich and pyyaml:
+
+   ```bash
+   pip3 install tqdm rich pyyaml
+   ```
+
+2. Run the overflow API analysis
+
+   **forward_info_0.json is generated by the overflow detection dump described in the [ptdbg_ascend documentation](https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/ptdbg_ascend/doc), not by the accuracy pre-check tool.**
+
+   ```bash
+   cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut
+   python run_overflow_check.py -forward ./forward_info_0.json
+   ```
+
+   | Parameter | Description | Required |
+   | ------------------------------ | -------------------------------------------------- | -------- |
+   | -forward or --forward_input_file | Forward API information file forward_info_{pid}.json. | Yes |
+   | -j or --jit_compile | Enable jit compilation. | No |
+   | -d or --device | Device ID on which the UT code runs; defaults to 0. | No |
+
+   APIs that overflow in the backward pass are not yet supported.
+
+
+For detailed parameter descriptions see "**Ascend model accuracy pre-check tool**".
+
+# FAQ
+
+1. Must jit compilation (jit_compile) be enabled or disabled consistently between dump and run_ut?
+
+   A: Yes.
+
+2. Are the pre-check results meaningful for APIs such as type_as that involve data type conversion?
+
+   A: Because such APIs raise and then lower the precision on the CPU side, their pre-check results are of limited reference value.
+
+3. run_ut fails with: ERROR: Got unsupported ScalarType BFloat16
+
+   A: Please use the latest version of the tool.
+
+4. For the Dropout operator, the CPU and NPU random numbers should differ, so why do the comparison results match?
+
+   A: This is expected. The tool handles this operator specially and only checks that the proportion of zero positions in the output roughly matches the configured p value.
+
+5. Why do the bench dtype and the CPU dtype of floating-point data differ?
+
+   A: For fp16 data the CPU computes at the next higher precision, fp32; this is the precision convention agreed with the operator side, and computing at higher precision on the CPU gives results closer to the true values.
+
+6. With the pre-check tool enabled, slicing operations fail with `IndexError: too many indices for tensor of dimension x` or `TypeError: len() of a 0-d tensor`.
+
+   A: Comment out `- __getitem__` under Tensor: in the tool's api_accuracy_checker/hook_module/support_wrap_ops.yaml; the tool will then skip dumping this API. If the API sits at a critical point that must be dumped, consider instead commenting out the type check that raises the error, based on the traceback.
+
+7. With the pre-check tool enabled, F.gelu raises ValueError errors such as `activation_func must be F.gelu`.
+
+   A: Comment out `- gelu` under functional: in the tool's api_accuracy_checker/hook_module/support_wrap_ops.yaml; the tool will then skip dumping this API. If the API sits at a critical point that must be dumped, consider instead commenting out the type check that raises the error, based on the traceback.
+
+8. With the pre-check tool enabled, errors related to the AsStrided operator or to compilation occur, such as `Failed to compile Op [AsStrided]`.
+
+   A: Comment out `- t` and `- transpose` under Tensor: in the tool's api_accuracy_checker/hook_module/support_wrap_ops.yaml.
+
+9. What operations do the Tensor magic methods correspond to?
+
+   A:
+
+   | Tensor magic method | Operation |
+   | --------------- | ---------------- |
+   | `__add__` | + |
+   | `__and__` | & |
+   | `__bool__` | returns the boolean value of the Tensor |
+   | `__div__` | / |
+   | `__eq__` | == |
+   | `__ge__` | >= |
+   | `__gt__` | > |
+   | `__iadd__` | += |
+   | `__iand__` | &= |
+   | `__idiv__` | /= |
+   | `__ifloordiv__` | //= |
+   | `__ilshift__` | <<= |
+   | `__imod__` | %= |
+   | `__imul__` | *= |
+   | `__ior__` | \|= |
+   | `__irshift__` | >>= |
+   | `__isub__` | -= |
+   | `__ixor__` | ^= |
+   | `__lshift__` | << |
+   | `__matmul__` | matrix multiplication |
+   | `__mod__` | % |
+   | `__mul__` | * |
+   | `__nonzero__` | same as `__bool__` |
+   | `__or__` | \| |
+   | `__radd__` | + (reflected) |
+   | `__rmul__` | * (reflected) |
+   | `__rshift__` | >> |
+   | `__sub__` | - |
+   | `__truediv__` | same as `__div__` |
+   | `__xor__` | ^ |
+
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/.keep b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/config.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd6607a81ec00ce635ffae6e41b4b9d18e090827
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/config.py
@@ -0,0 +1,76 @@
+import os
+import yaml
+from ..common.utils import check_file_or_directory_path
+from ..hook_module.utils import WrapFunctionalOps, WrapTensorOps, WrapTorchOps
+from ...common.file_check import FileOpen
+
+WrapApi = set(WrapFunctionalOps) | set(WrapTensorOps) | set(WrapTorchOps)
+
+
+class Config:
+    def __init__(self, yaml_file):
+        check_file_or_directory_path(yaml_file, False)
+        with FileOpen(yaml_file, 'r') as file:
+            config = yaml.safe_load(file)
+        self.config = {key: self.validate(key, value) for key, value in config.items()}
+
+    def validate(self, key, value):
+        validators = {
+            'dump_path': str,
+            'real_data': bool,
+            'enable_dataloader': bool,
+            'target_iter': list,
+            'white_list': list,
+            'error_data_path': str,
+            'jit_compile': bool,
+            'precision': int
+        }
+        if key not in validators:
+            raise ValueError(f"{key} must be one of {validators.keys()}")
+        if not isinstance(value, validators.get(key)):
+            raise ValueError(f"{key} must be {validators[key].__name__} type")
+        if key == 'target_iter':
+            if not isinstance(value, list):
+                raise ValueError("target_iter must be a list type")
+            if any(isinstance(i, bool) for i in value):
+                raise ValueError("target_iter cannot contain boolean values")
+            if not all(isinstance(i, int) for i in value):
+                raise ValueError("All elements in target_iter must be of int type")
+            if any(i < 0 for i in value):
+                raise ValueError("All elements in target_iter must be greater than or equal to 0")
+        if key == 'precision' and value < 0:
+            raise ValueError("precision must be greater than or equal to 0")
+        if key == 'white_list':
+            if not isinstance(value, list):
+                raise 
ValueError("white_list must be a list type") + if not all(isinstance(i, str) for i in value): + raise ValueError("All elements in white_list must be of str type") + invalid_api = [i for i in value if i not in WrapApi] + if invalid_api: + raise ValueError(f"{', '.join(invalid_api)} is not in support_wrap_ops.yaml, please check the white_list") + return value + + def __getattr__(self, item): + return self.config[item] + + def __str__(self): + return '\n'.join(f"{key}={value}" for key, value in self.config.items()) + + def update_config(self, dump_path=None, real_data=None, target_iter=None, white_list=None, enable_dataloader=None): + args = { + "dump_path": dump_path if dump_path else self.config.get("dump_path", './'), + "real_data": real_data if real_data else self.config.get("real_data", False), + "target_iter": target_iter if target_iter else self.config.get("target_iter", [1]), + "white_list": white_list if white_list else self.config.get("white_list", []), + "enable_dataloader": enable_dataloader if enable_dataloader else self.config.get("enable_dataloader", False) + } + for key, value in args.items(): + if key in self.config: + self.config[key] = self.validate(key, value) + else: + raise ValueError(f"Invalid key '{key}'") + + +cur_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +yaml_path = os.path.join(cur_path, "config.yaml") +msCheckerConfig = Config(yaml_path) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/utils.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..91cfc1e06d5bbac70024b50dae4de6e0d9037330 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/utils.py @@ -0,0 +1,654 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import collections +import json +import os +import random +import re +import stat +import subprocess +import sys +import time +import csv +from datetime import datetime, timezone + +import numpy as np +import torch + +try: + import torch_npu +except ImportError: + IS_GPU = True +else: + IS_GPU = False + +from ...common.file_check import FileCheckConst, FileChecker, FileOpen +from ...common import file_check as file_check_util + +torch_without_guard_version_list = ['2.1'] +for version in torch_without_guard_version_list: + if torch.__version__.startswith(version): + torch_without_guard_version = True + break + else: + torch_without_guard_version = False +if not IS_GPU and not torch_without_guard_version: + from torch_npu.utils.device_guard import torch_device_guard as torch_npu_device_guard + + +class Const: + """ + Class for const + """ + SEP = '.' 
+ DIRECTORY_LENGTH = 4096 + FILE_NAME_LENGTH = 255 + FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$' + MODEL_TYPE = ['.onnx', '.pb', '.om'] + SEMICOLON = ";" + COLON = ":" + EQUAL = "=" + COMMA = "," + DOT = "." + DUMP_RATIO_MAX = 100 + SUMMERY_DATA_NUMS = 256 + ONE_HUNDRED_MB = 100 * 1024 * 1024 + FLOAT_EPSILON = np.finfo(float).eps + SUPPORT_DUMP_MODE = ['api', 'acl'] + ON = 'ON' + OFF = 'OFF' + BACKWARD = 'backward' + FORWARD = 'forward' + FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble, np.float32, np.float16] + BOOL_TYPE = [bool, np.uint8] + INT_TYPE = [np.int32, np.int64] + NPU = 'NPU' + DISTRIBUTED = 'Distributed' + + # dump mode + ALL = "all" + LIST = "list" + RANGE = "range" + STACK = "stack" + ACL = "acl" + API_LIST = "api_list" + API_STACK = "api_stack" + DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] + + WRITE_FLAGS = os.O_WRONLY | os.O_CREAT + WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR + + RAISE_PRECISION = { + torch.float16: torch.float32, + torch.bfloat16: torch.float32, + torch.float32: torch.float64 + } + CONVERT = { + "int32_to_int64": ["torch.int32", "torch.int64"], + } + + CONVERT_API = { + "int32_to_int64": ["cross_entropy"] + } + + +class CompareConst: + """ + Class for compare module const + """ + # compare result column name + NPU_NAME = "NPU Name" + BENCH_NAME = "Bench Name" + NPU_DTYPE = "NPU Tensor Dtype" + BENCH_DTYPE = "Bench Tensor Dtype" + NPU_SHAPE = "NPU Tensor Shape" + BENCH_SHAPE = "Bench Tensor Shape" + NPU_MAX = "NPU max" + NPU_MIN = "NPU min" + NPU_MEAN = "NPU mean" + BENCH_MAX = "Bench max" + BENCH_MIN = "Bench min" + BENCH_MEAN = "Bench mean" + COSINE = "Cosine" + MAX_ABS_ERR = "MaxAbsErr" + ACCURACY = "Accuracy Reached or Not" + STACK = "NPU_Stack_Info" + ERROR_MESSAGE = "Err_message" + + # compare result data + NAN = 'Nan' + SHAPE_UNMATCH = 'shape unmatched' + DTYPE_UNMATCH = 'dtype unmatched' + + # accuracy standards + COS_THRESHOLD = 0.99 + MAX_ABS_ERR_THRESHOLD = 0.001 + COS_MAX_THRESHOLD = 0.9 + MAX_ABS_ERR_MAX_THRESHOLD = 1 + ACCURACY_CHECK_YES = "Yes" + ACCURACY_CHECK_NO = "No" + ACCURACY_CHECK_UNMATCH = "Unmatched" + + # error message + NO_BENCH = "No bench data matched." 
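+
+# Usage sketch (hypothetical example, not an actual call site in this file):
+# before computing the CPU benchmark, a low-precision dtype can be promoted via
+# Const.RAISE_PRECISION, e.g. Const.RAISE_PRECISION.get(torch.float16) -> torch.float32,
+# so that the benchmark result is closer to the true value.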
+
+
+class VersionCheck:
+    """
+    Class for TorchVersion
+    """
+    V1_8 = "1.8"
+    V1_11 = "1.11"
+
+    @staticmethod
+    def check_torch_version(version):
+        torch_version = torch.__version__
+        if torch_version.startswith(version):
+            return True
+        else:
+            return False
+
+
+class CompareException(Exception):
+    """
+    Class for Accuracy Compare Exception
+    """
+    NONE_ERROR = 0
+    INVALID_PATH_ERROR = 1
+    OPEN_FILE_ERROR = 2
+    CLOSE_FILE_ERROR = 3
+    READ_FILE_ERROR = 4
+    WRITE_FILE_ERROR = 5
+    INVALID_FILE_ERROR = 6
+    PERMISSION_ERROR = 7
+    INDEX_OUT_OF_BOUNDS_ERROR = 8
+    NO_DUMP_FILE_ERROR = 9
+    INVALID_DATA_ERROR = 10
+    INVALID_PARAM_ERROR = 11
+    INVALID_DUMP_RATIO = 12
+    INVALID_DUMP_FILE = 13
+    UNKNOWN_ERROR = 14
+    INVALID_DUMP_MODE = 15
+    PARSE_FILE_ERROR = 16
+    INVALID_COMPARE_MODE = 17
+
+    def __init__(self, code, error_info: str = ""):
+        super(CompareException, self).__init__()
+        self.code = code
+        self.error_info = error_info
+
+    def __str__(self):
+        return self.error_info
+
+
+class DumpException(CompareException):
+    pass
+
+
+def read_json(file):
+    with FileOpen(file, 'r') as f:
+        obj = json.load(f)
+    return obj
+
+
+def write_csv(data, filepath):
+    with FileOpen(filepath, 'a', encoding='utf-8-sig') as f:
+        writer = csv.writer(f)
+        writer.writerows(data)
+
+
+def _print_log(level, msg, end='\n'):
+    current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
+    pid = os.getpid()
+    print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg, end=end)
+    sys.stdout.flush()
+
+
+def print_info_log(info_msg, end='\n'):
+    """
+    Function Description:
+        print info log.
+    Parameter:
+        info_msg: the info message.
+    """
+    _print_log("INFO", info_msg, end=end)
+
+
+def print_error_log(error_msg):
+    """
+    Function Description:
+        print error log.
+    Parameter:
+        error_msg: the error message.
+    """
+    _print_log("ERROR", error_msg)
+
+
+def print_warn_log(warn_msg):
+    """
+    Function Description:
+        print warn log.
+    Parameter:
+        warn_msg: the warning message.
+    """
+    _print_log("WARNING", warn_msg)
+
+
+def check_mode_valid(mode):
+    if mode not in Const.DUMP_MODE:
+        msg = "Current mode '%s' is not supported. Please use the field in %s" % \
+              (mode, Const.DUMP_MODE)
+        raise CompareException(CompareException.INVALID_DUMP_MODE, msg)
+
+
+def check_object_type(check_object, allow_type):
+    """
+    Function Description:
+        Check if the object belongs to a certain data type
+    Parameter:
+        check_object: the object to be checked
+        allow_type: legal data type
+    Exception Description:
+        when invalid data throw exception
+    """
+    if not isinstance(check_object, allow_type):
+        print_error_log(f"{check_object} not of {allow_type} type")
+        raise CompareException(CompareException.INVALID_DATA_ERROR)
+
+
+def check_file_or_directory_path(path, isdir=False):
+    """
+    Function Description:
+        check whether the path is valid
+    Parameter:
+        path: the path to check
+        isdir: the path is dir or file
+    Exception Description:
+        when invalid data throw exception
+    """
+    if isdir:
+        if not os.path.exists(path):
+            print_error_log('The path {} does not exist.'.format(path))
+            raise CompareException(CompareException.INVALID_PATH_ERROR)
+
+        if not os.path.isdir(path):
+            print_error_log('The path {} is not a directory.'.format(path))
+            raise CompareException(CompareException.INVALID_PATH_ERROR)
+
+        if not os.access(path, os.W_OK):
+            print_error_log(
+                'The path {} does not have permission to write. 
Please check the path permission'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + else: + if not os.path.isfile(path): + print_error_log('{} is an invalid file or non-exist.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not os.access(path, os.R_OK): + print_error_log( + 'The path {} does not have permission to read. Please check the path permission'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def _check_pkl(pkl_file_handle, file_name): + tensor_line = pkl_file_handle.readline() + if len(tensor_line) == 0: + print_error_log("dump file {} have empty line!".format(file_name)) + raise CompareException(CompareException.INVALID_DUMP_FILE) + pkl_file_handle.seek(0, 0) + + +def check_file_mode(npu_pkl, bench_pkl, stack_mode): + npu_pkl_name = os.path.split(npu_pkl)[-1] + bench_pkl_name = os.path.split(bench_pkl)[-1] + + if not npu_pkl_name.startswith("api_stack") and not bench_pkl_name.startswith("api_stack"): + if stack_mode: + print_error_log("The current file does not contain stack information, please turn off the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + elif npu_pkl_name.startswith("api_stack") and bench_pkl_name.startswith("api_stack"): + if not stack_mode: + print_error_log("The current file contains stack information, please turn on the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + else: + print_error_log("The dump mode of the two files is not same, please check the dump files") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + + +def check_file_size(input_file, max_size): + try: + file_size = os.path.getsize(input_file) + except OSError as os_error: + print_error_log('Failed to open "%s". %s' % (input_file, str(os_error))) + raise CompareException(CompareException.INVALID_FILE_ERROR) from os_error + if file_size > max_size: + print_error_log('The size (%d) of %s exceeds (%d) bytes, tools not support.' + % (file_size, input_file, max_size)) + raise CompareException(CompareException.INVALID_FILE_ERROR) + + +def get_dump_data_path(dump_dir): + """ + Function Description: + traverse directories and obtain the absolute path of dump data + Parameter: + dump_dir: dump data directory + Return Value: + dump data path,file is exist or file is not exist + """ + dump_data_path = None + file_is_exist = False + + check_file_or_directory_path(dump_dir, True) + for dir_path, sub_paths, files in os.walk(dump_dir): + if len(files) != 0: + dump_data_path = dir_path + file_is_exist = True + break + dump_data_path = dir_path + return dump_data_path, file_is_exist + + +def modify_dump_path(dump_path, mode): + if mode == Const.ALL: + return dump_path + file_name = os.path.split(dump_path) + mode_file_name = mode + "_" + file_name[-1] + return os.path.join(file_name[0], mode_file_name) + + +def create_directory(dir_path): + """ + Function Description: + creating a directory with specified permissions in a thread-safe manner + Parameter: + dir_path: directory path + Exception Description: + when invalid data throw exception + """ + try: + os.makedirs(dir_path, mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + except OSError as ex: + print_error_log( + 'Failed to create {}. Please check the path permission or disk space. 
{}'.format(dir_path, str(ex)))
+        raise CompareException(CompareException.INVALID_PATH_ERROR) from ex
+
+
+def execute_command(cmd):
+    """
+    Function Description:
+        run the following command
+    Parameter:
+        cmd: command
+    Exception Description:
+        when invalid command throw exception
+    """
+    print_info_log('Execute command:%s' % cmd)
+    process = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    while process.poll() is None:
+        line = process.stdout.readline()
+        line = line.strip()
+        if line:
+            print(line)
+    if process.returncode != 0:
+        print_error_log('Failed to execute command:%s' % " ".join(cmd))
+        raise CompareException(CompareException.INVALID_DATA_ERROR)
+
+
+def save_numpy_data(file_path, data):
+    """
+    save_numpy_data
+    """
+    if not os.path.exists(os.path.dirname(file_path)):
+        os.makedirs(os.path.dirname(file_path))
+    np.save(file_path, data)
+
+
+def parse_arg_value(values):
+    """
+    parse dynamic arg value of atc cmdline
+    """
+    value_list = []
+    for item in values.split(Const.SEMICOLON):
+        value_list.append(parse_value_by_comma(item))
+    return value_list
+
+
+def parse_value_by_comma(value):
+    """
+    parse value by comma, like '1,2,4,8'
+    """
+    value_list = []
+    value_str_list = value.split(Const.COMMA)
+    for value_str in value_str_list:
+        value_str = value_str.strip()
+        if value_str.isdigit() or value_str == '-1':
+            value_list.append(int(value_str))
+        else:
+            print_error_log("please check your input shape.")
+            raise CompareException(CompareException.INVALID_PARAM_ERROR)
+    return value_list
+
+
+def get_data_len_by_shape(shape):
+    data_len = 1
+    for item in shape:
+        if item == -1:
+            print_error_log("please check your input shape, one dim in shape is -1.")
+            return -1
+        data_len = data_len * item
+    return data_len
+
+
+def add_time_as_suffix(name):
+    return '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time())))
+
+
+def get_time():
+    return datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
+
+
+def format_value(value):
+    return '{:.6f}'.format(value)
+
+
+def torch_device_guard(func):
+    if IS_GPU or torch_without_guard_version:
+        return func
+    # Parse args/kwargs matched torch.device objects
+
+    @torch_npu_device_guard
+    def wrapper(*args, **kwargs):
+        return func(*args, **kwargs)
+    return wrapper
+
+
+def seed_all(seed=1234, mode=False):
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.use_deterministic_algorithms(mode)
+    if IS_GPU:
+        torch.cuda.manual_seed_all(seed)
+        torch.cuda.manual_seed(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.enabled = False
+        torch.backends.cudnn.benchmark = False
+    else:
+        torch_npu.npu.manual_seed_all(seed)
+        torch_npu.npu.manual_seed(seed)
+
+
+def get_process_rank(model):
+    print_info_log("Rank id is not provided. Trying to get the rank id of the model.")
+    try:
+        device = next(model.parameters()).device
+    except StopIteration:
+        print_warn_log('There is no parameter in the model. Fail to get rank id.')
+        return 0, False
+    if device.type == 'cpu':
+        print_warn_log("Warning: the debugger is unable to get the rank id. "
+                       "This may cause the dumped data to be corrupted in the "
+                       "case of distributed training. (You may ignore this if you are using only one card.) 
" + "Transfer the model to npu or gpu before register_hook() to avoid this warning.") + return 0, False + else: + return device.index, True + + +def get_json_contents(file_path): + ops = get_file_content_bytes(file_path) + try: + json_obj = json.loads(ops) + except ValueError as error: + print_error_log('Failed to load "%s". %s' % (file_path, str(error))) + raise CompareException(CompareException.INVALID_FILE_ERROR) from error + if not isinstance(json_obj, dict): + print_error_log('Json file %s, content is not a dictionary!' % file_path) + raise CompareException(CompareException.INVALID_FILE_ERROR) + return json_obj + + +def get_file_content_bytes(file): + with FileOpen(file, 'rb') as file_handle: + return file_handle.read() + + +def islink(path): + path = os.path.abspath(path) + return os.path.islink(path) + + +class SoftlinkCheckException(Exception): + pass + + +MAX_JSON_FILE_SIZE = 10 * 1024 ** 2 +LINUX_FILE_NAME_LENGTH_LIMIT = 200 + + +def check_path_length_valid(path): + path = os.path.realpath(path) + return len(os.path.basename(path)) <= LINUX_FILE_NAME_LENGTH_LIMIT + + +def check_path_pattern_valid(path): + pattern = re.compile(r'(\.|/|:|_|-|\s|[~0-9a-zA-Z])+') + if not pattern.fullmatch(path): + raise ValueError('Only the following characters are allowed in the path: A-Z a-z 0-9 - _ . / :') + + +def check_input_file_valid(input_path, max_file_size=MAX_JSON_FILE_SIZE): + if islink(input_path): + raise SoftlinkCheckException("Input path doesn't support soft link.") + + input_path = os.path.realpath(input_path) + if not os.path.exists(input_path): + raise ValueError('Input file %s does not exist!' % input_path) + + if not os.access(input_path, os.R_OK): + raise PermissionError('Input file %s is not readable!' % input_path) + + if not check_path_length_valid(input_path): + raise ValueError("The real path or file_name of input is too long.") + + check_path_pattern_valid(input_path) + + if os.path.getsize(input_path) > max_file_size: + raise ValueError(f'The file is too large, exceeds {max_file_size // 1024 ** 2}MB') + + +def check_need_convert(api_name): + convert_type = None + for key, value in Const.CONVERT_API.items(): + if api_name not in value: + continue + else: + convert_type = key + return convert_type + + +def api_info_preprocess(api_name, api_info_dict): + """ + Function Description: + Preprocesses the API information. + Parameter: + api_name: Name of the API. + api_info_dict: argument of the API. + Return api_info_dict: + convert_type: Type of conversion. + api_info_dict: Processed argument of the API. + """ + convert_type = check_need_convert(api_name) + if api_name == 'cross_entropy': + api_info_dict = cross_entropy_process(api_info_dict) + return convert_type, api_info_dict + + +def cross_entropy_process(api_info_dict): + """ + Function Description: + Preprocesses the cross_entropy API information. + Parameter: + api_info_dict: argument of the API. + Return api_info_dict: + api_info_dict: Processed argument of the API. 
+ """ + if 'args' in api_info_dict and len(api_info_dict['args']) > 1 and 'Min' in api_info_dict['args'][1]: + if api_info_dict['args'][1]['Min'] <= 0: + # The second argument in cross_entropy should be -100 or not less than 0 + api_info_dict['args'][1]['Min'] = 0 + return api_info_dict + + +def initialize_save_path(save_path, dir_name): + data_path = os.path.join(save_path, dir_name) + if os.path.exists(data_path): + print_warn_log(f"{data_path} already exists, it will be overwritten") + else: + os.mkdir(data_path, mode=FileCheckConst.DATA_DIR_AUTHORITY) + data_path_checker = FileChecker(data_path, FileCheckConst.DIR) + data_path_checker.common_check() + + +def write_pt(file_path, tensor): + if os.path.exists(file_path): + raise ValueError(f"File {file_path} already exists") + torch.save(tensor, file_path) + full_path = os.path.realpath(file_path) + file_check_util.change_mode(full_path, FileCheckConst.DATA_FILE_AUTHORITY) + return full_path + + +def get_real_data_path(file_path): + targets = ['forward_real_data', 'backward_real_data', 'ut_error_data\d+'] + pattern = re.compile(r'({})'.format('|'.join(targets))) + match = pattern.search(file_path) + if match: + target_index = match.start() + target_path = file_path[target_index:] + return target_path + else: + raise DumpException(DumpException.INVALID_PATH_ERROR) + + +def get_full_data_path(data_path, real_data_path): + if not data_path: + return data_path + full_data_path = os.path.join(real_data_path, data_path) + return os.path.realpath(full_data_path) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/api_checker/compare/algorithm.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/algorithm.py similarity index 43% rename from debug/accuracy_tools/api_checker/compare/algorithm.py rename to debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/algorithm.py index 2396001befd73210cdba54f9f4b2832f58335a77..7983709f14bcca72a0cb29c453198396561681b1 100644 --- a/debug/accuracy_tools/api_checker/compare/algorithm.py +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/algorithm.py @@ -1,229 +1,190 @@ -import numpy as np -import torch - -from compare.compare_utils import CompareConst, precision_configs -from common.utils import Const - -FLOAT_EPSILON = np.finfo(float).eps -np.seterr(divide='ignore', invalid='ignore') # ignore `invalid value encountered in true_divide` warning -print("FLOAT_EPSILON:", FLOAT_EPSILON) -NAN = 'NaN' - -#cos -def cosine_sim(bench_output, device_output): - msg = "" - n_value = device_output.reshape(-1) - b_value = bench_output.reshape(-1) - cos = CompareConst.NA - np.seterr(divide="ignore", invalid="ignore") - if n_value.shape != b_value.shape: - msg = f"Shape of device and bench outputs don't match. device: {n_value.shape}, bench: {b_value.shape}." - return -1, False, msg - if len(n_value) == 1: - msg = "All the data in device dump data is scalar. Please refer to other compare algorithms." - return cos, True, msg - n_value_max = np.max(np.abs(n_value)) - b_value_max = np.max(np.abs(b_value)) - if n_value_max <= np.finfo(float).eps and b_value_max <= np.finfo(float).eps: - return cos, True, msg - elif n_value_max <= np.finfo(float).eps: - msg = "All the data is zero in device dump data." 
- return CompareConst.NA, False, msg - elif b_value_max <= np.finfo(float).eps: - msg = "All the data is zero in bench dump data." - return CompareConst.NA, False, msg - else: - n_value = n_value.astype(float) / n_value_max - b_value = b_value.astype(float) / b_value_max - cos = np.dot(n_value, b_value) / (np.linalg.norm(n_value) * np.linalg.norm(b_value)) - if np.isnan(cos): - msg = "Dump data has NaN when comparing with Cosine Similarity." - cos = np.clip(cos, -1, 1) - return cos, cos > 0.99, msg - - -#rmse -def get_rmse(abs_err, inf_nan_mask): - masked_ae = np.where(inf_nan_mask, 0, abs_err) - mse = np.mean(np.square(masked_ae)) - inf_nan_cnt = np.sum(inf_nan_mask) - mse = mse * (abs_err.size / (abs_err.size - inf_nan_cnt + 0.0001) + 0.0001) - rmse = np.sqrt(mse) - return rmse - - -#误差均衡性 -def get_error_balance(bench_data, device_data): - larger_count = np.sum(np.greater(device_data - bench_data.astype(device_data.dtype), 0)) - smaller_count = np.sum(np.less(device_data - bench_data.astype(device_data.dtype), 0)) - total_count = bench_data.size - error_balance = abs(larger_count - smaller_count) / total_count if total_count > 0 else 0 - return error_balance - - -#小值域错误占比 -def get_small_value_err_ratio(small_value_mask, abs_err_greater_mask): - err_mask = np.logical_and(small_value_mask, abs_err_greater_mask) - small_value_err_num = np.sum(err_mask) - small_value_num = np.sum(small_value_mask) - return 0 if small_value_num == 0 else small_value_err_num / small_value_num - - -def get_rel_err(abs_err, abs_bench_with_eps, small_value_mask, inf_nan_mask): - rel_err_tmp = abs_err / abs_bench_with_eps - rel_err_mask = np.logical_or(small_value_mask, inf_nan_mask) - rel_err = np.where(rel_err_mask, -1, rel_err_tmp) - return rel_err - - -def get_abs_err(bench_data, device_data): - abs_err = np.abs(device_data - bench_data) - return abs_err - - -def get_rel_err_origin(abs_err, b_value): - rel_err_origin = np.abs(abs_err / b_value) - return rel_err_origin - - -def get_max_abs_err(abs_err): - max_abs_err = abs_err.max() - bool_result = max_abs_err < 0.001 - return max_abs_err, bool_result - - -#相对误差最大值 -def get_max_rel_err(rel_err): - return np.max(rel_err) - - -#相对误差均值 -def get_mean_rel_err(rel_err): - return np.mean(rel_err) - - -def get_rel_err_ratio(rel_err, thresholding): - if np.size(rel_err) == 0: - ratio = 1 - else: - ratio = np.divide(np.sum(rel_err < thresholding), np.size(rel_err)) - bool_result = ratio > (1 - thresholding) - return ratio, bool_result - - -def get_finite_and_infinite_mask(bench_output, device_output): - device_finite_mask = np.isfinite(device_output) - bench_finite_mask = np.isfinite(bench_output.astype(device_output.dtype)) - both_finite_mask = np.logical_and(device_finite_mask, bench_finite_mask) - inf_nan_mask = np.logical_not(both_finite_mask) - return both_finite_mask, inf_nan_mask - - -def get_small_value_mask(abs_bench, both_finite_mask, small_value_threshold): - small_value_mask = np.less_equal(abs_bench, small_value_threshold) - small_value_mask = np.logical_and(small_value_mask, both_finite_mask) - return small_value_mask - - -def get_msg_and_handle_value(b_value, n_value): - if n_value.dtype in Const.FLOAT_TYPE: - zero_mask = (n_value == 0) - n_value[zero_mask] += np.finfo(n_value.dtype).eps - b_value[zero_mask] += np.finfo(n_value.dtype).eps - else: - b_value, n_value = b_value.astype(float), n_value.astype(float) - zero_mask = (n_value == 0) - n_value[zero_mask] += np.finfo(float).eps - b_value[zero_mask] += np.finfo(float).eps - return b_value, n_value - - 
-def compare_bool_tensor(bench_output, npu_out): - if npu_out.size == 0: - return CompareConst.NAN, CompareConst.ERROR, "There is not npu calculation result." - error_nums = (bench_output!= npu_out).sum() - error_rate = float(error_nums / bench_output.size) - result = CompareConst.PASS if error_rate == 0 else CompareConst.ERROR - return error_rate, result, "" - - -def compare_float_tensor(cpu_output, npu_output, compare_column): - npu_dtype = npu_output.dtype - - message = "" - eps = np.finfo(cpu_output.dtype).eps - abs_bench = np.abs(cpu_output) - abs_bench_with_eps = abs_bench + eps - abs_err = get_abs_err(cpu_output, npu_output) - if npu_dtype in [np.float16, np.float32]: - dtype_config = precision_configs.get(str(npu_dtype)) - both_finite_mask, inf_nan_mask = get_finite_and_infinite_mask(cpu_output, npu_output) - small_value_mask = get_small_value_mask(abs_bench, both_finite_mask, dtype_config['small_value'][0]) - abs_err_greater_mask = np.greater(abs_err, dtype_config['small_value_atol'][0]) - compare_column.small_value_err_ratio = get_small_value_err_ratio(small_value_mask, abs_err_greater_mask) - rel_err = get_rel_err(abs_err, abs_bench_with_eps, small_value_mask, inf_nan_mask) - compare_column.RMSE = get_rmse(abs_err, np.logical_or(inf_nan_mask, small_value_mask)) - compare_column.EB = get_error_balance(cpu_output, npu_output) - compare_column.Max_rel_error = get_max_rel_err(rel_err) - compare_column.Mean_rel_error = get_mean_rel_err(rel_err) - - - cos_res, cos_status, msg = cosine_sim(cpu_output, npu_output) - compare_column.cosine_sim = cos_res - message += msg + "\n" - if not cos_status: - return CompareConst.ERROR, compare_column, message - - max_abs_res, max_abs_status = get_max_abs_err(abs_err) - compare_column.max_abs_err = max_abs_res - if max_abs_status: - return CompareConst.PASS, compare_column, message - - - rel_err_orign = get_rel_err_origin(abs_err, abs_bench_with_eps) - if npu_dtype in [np.float16]: - hundred_res, hundred_status = get_rel_err_ratio(rel_err_orign, 0.01) - compare_column.rel_err_hundredth = hundred_res - if not hundred_status: - return CompareConst.ERROR, compare_column, message - thousand_res, thousand_status = get_rel_err_ratio(rel_err_orign, 0.001) - compare_column.rel_err_thousandth = thousand_res - if npu_dtype in [np.float16]: - if thousand_status: - return CompareConst.PASS, compare_column, message - return CompareConst.WARNING, compare_column, message - ten_thousand_res, ten_thousand_status = get_rel_err_ratio(rel_err_orign, 0.0001) - compare_column.rel_err_ten_thousandth = ten_thousand_res - if npu_dtype in [np.float32, np.float64]: - if not thousand_status: - return CompareConst.ERROR, compare_column, message - if not ten_thousand_status: - return CompareConst.WARNING, compare_column, message - return CompareConst.PASS, compare_column, message - - -class CompareColumn: - def __init__(self): - self.bench_type = CompareConst.NA - self.npu_type = CompareConst.NA - self.shape = CompareConst.NA - self.cosine_sim = CompareConst.NA - self.max_abs_err = CompareConst.NA - self.rel_err_hundredth = CompareConst.NA - self.rel_err_thousandth = CompareConst.NA - self.rel_err_ten_thousandth = CompareConst.NA - self.error_rate = CompareConst.NA - self.EB = CompareConst.NA - self.RMSE = CompareConst.NA - self.small_value_err_ratio = CompareConst.NA - self.Max_rel_error = CompareConst.NA - self.Mean_rel_error = CompareConst.NA - - def to_column_value(self, is_pass, message): - return [self.bench_type, self.npu_type, self.shape, self.cosine_sim, 
self.max_abs_err, self.rel_err_hundredth, - self.rel_err_thousandth, self.rel_err_ten_thousandth, self.error_rate, self.EB, self.RMSE, - self.small_value_err_ratio, self.Max_rel_error, self.Mean_rel_error, is_pass, message] - - - - +# 定义比对算法及比对标准 +import torch +import numpy as np +from .compare_utils import CompareConst + + +#cos +def cosine_sim(bench_output, device_output): + msg = "" + n_value = device_output.reshape(-1) + b_value = bench_output.reshape(-1) + cos = CompareConst.SPACE + np.seterr(divide="ignore", invalid="ignore") + if n_value.shape != b_value.shape: + msg = f"Shape of device and bench outputs don't match. device: {n_value.shape}, bench: {b_value.shape}." + return -1, False, msg + if len(n_value) == 1: + msg = "All the data in device dump data is scalar. Please refer to other compare algorithms." + return cos, True, msg + n_value_max = np.max(np.abs(n_value)) + b_value_max = np.max(np.abs(b_value)) + if n_value_max <= np.finfo(float).eps and b_value_max <= np.finfo(float).eps: + msg = "All the data in device and bench outputs are zero." + return cos, True, msg + elif n_value_max <= np.finfo(float).eps: + msg = "All the data is zero in device dump data." + return CompareConst.SPACE, False, msg + elif b_value_max <= np.finfo(float).eps: + msg = "All the data is zero in bench dump data." + return CompareConst.SPACE, False, msg + else: + n_value = n_value.astype(float) / n_value_max + b_value = b_value.astype(float) / b_value_max + cos = np.dot(n_value, b_value) / (np.linalg.norm(n_value) * np.linalg.norm(b_value)) + if np.isnan(cos): + msg = "Dump data has NaN when comparing with Cosine Similarity." + cos = np.clip(cos, -1, 1) + return cos, cos > 0.99, msg + + +#rmse +def get_rmse(abs_err, inf_nan_mask): + masked_ae = np.where(inf_nan_mask, 0, abs_err) + mse = np.mean(np.square(masked_ae)) + inf_nan_cnt = np.sum(inf_nan_mask) + mse = mse * (abs_err.size / (abs_err.size - inf_nan_cnt + 0.0001) + 0.0001) + rmse = np.sqrt(mse) + return rmse + + +#误差均衡性 +def get_error_balance(bench_data, device_data): + larger_count = np.sum(np.greater(device_data - bench_data.astype(device_data.dtype), 0)) + smaller_count = np.sum(np.less(device_data - bench_data.astype(device_data.dtype), 0)) + total_count = bench_data.size + error_balance = abs(larger_count - smaller_count) / total_count if total_count > 0 else 0 + return error_balance + + +#小值域错误占比 +def get_small_value_err_ratio(small_value_mask, abs_err_greater_mask): + err_mask = np.logical_and(small_value_mask, abs_err_greater_mask) + small_value_err_num = np.sum(err_mask) + small_value_num = np.sum(small_value_mask) + return 0 if small_value_num == 0 else small_value_err_num / small_value_num + + +def get_rel_err(abs_err, abs_bench_with_eps, small_value_mask, inf_nan_mask): + rel_err_tmp = abs_err / abs_bench_with_eps + rel_err_mask = np.logical_or(small_value_mask, inf_nan_mask) + rel_err = np.where(rel_err_mask, -1, rel_err_tmp) + return rel_err + + +def get_abs_err(bench_data, device_data): + abs_err = np.abs(device_data - bench_data) + return abs_err + + +def get_rel_err_origin(abs_err, b_value): + rel_err_origin = np.abs(abs_err / b_value) + return rel_err_origin + + +def get_max_abs_err(abs_err): + max_abs_err = abs_err.max() + bool_result = max_abs_err < 0.001 + return max_abs_err, bool_result + + +#相对误差最大值 +def get_max_rel_err(rel_err): + return np.max(rel_err) if np.max(rel_err) >= 0 else 0 + + +#相对误差均值 +def get_mean_rel_err(rel_err): + non_negative_rel_err = rel_err[rel_err >= 0] + return np.mean(non_negative_rel_err) if 
non_negative_rel_err.size > 0 else 0
+
+
+def get_rel_err_ratio(rel_err, thresholding):
+    if np.size(rel_err) == 0:
+        ratio = 1
+    else:
+        ratio = np.divide(np.sum(rel_err < thresholding), np.size(rel_err))
+    bool_result = ratio > (1 - thresholding)
+    return ratio, bool_result
+
+
+def get_finite_and_infinite_mask(bench_output, device_output):
+    device_finite_mask = np.isfinite(device_output)
+    bench_finite_mask = np.isfinite(bench_output.astype(device_output.dtype))
+    both_finite_mask = np.logical_and(device_finite_mask, bench_finite_mask)
+    inf_nan_mask = np.logical_not(both_finite_mask)
+    return both_finite_mask, inf_nan_mask
+
+
+def get_small_value_mask(abs_bench, both_finite_mask, small_value_threshold):
+    small_value_mask = np.less_equal(abs_bench, small_value_threshold)
+    small_value_mask = np.logical_and(small_value_mask, both_finite_mask)
+    return small_value_mask
+
+
+def get_abs_bench_with_eps(bench, dtype):
+    abs_bench = np.abs(bench)
+    eps = np.finfo(bench.dtype).eps if dtype != torch.bfloat16 else CompareConst.BFLOAT16_EPS
+    abs_bench_with_eps = abs_bench + eps
+    return abs_bench, abs_bench_with_eps
+
+
+def check_inf_nan_value(inf_nan_mask, bench_output, device_output, dtype, rtol):
+    '''
+    Absolute-threshold method of the new accuracy standard: check whether the
+    inf/nan values of the NPU output and the golden output are consistent.
+    Inputs:
+        inf_nan_mask: mask of inf/nan positions in the NPU and golden outputs
+        bench_output: golden output
+        device_output: NPU output
+        dtype: dtype of the NPU output
+        rtol: relative error threshold
+    Output:
+        inf_nan_err_ratio: proportion of inf/nan positions where the NPU and golden outputs disagree
+    '''
+    abs_gpu, abs_gpu_with_eps = get_abs_bench_with_eps(bench_output, dtype)
+    golden_same_dtype = bench_output.astype(device_output.dtype)
+    a_min = np.finfo(device_output.dtype).min if dtype != torch.bfloat16 else CompareConst.BFLOAT16_MIN
+    a_max = np.finfo(device_output.dtype).max if dtype != torch.bfloat16 else CompareConst.BFLOAT16_MAX
+    golden_clip = np.clip(golden_same_dtype, a_min, a_max)
+    npu_clip = np.clip(device_output, a_min, a_max)
+    clipped_abs_ae = np.abs(npu_clip - golden_clip)
+    clipped_re = clipped_abs_ae / abs_gpu_with_eps
+    pass_mask = np.less_equal(clipped_re, rtol)
+    both_nan_mask = np.logical_and(np.isnan(device_output), np.isnan(golden_clip))
+    pass_mask = np.logical_or(pass_mask, both_nan_mask)
+    not_pass_mask = np.logical_not(pass_mask)
+    not_pass_mask = np.logical_and(not_pass_mask, inf_nan_mask)
+
+    inf_nan_err_cnt = np.sum(not_pass_mask)
+    return 0 if np.sum(inf_nan_mask) == 0 else inf_nan_err_cnt / np.sum(inf_nan_mask)
+
+
+def check_small_value(abs_err, small_value_mask, small_value_atol):
+    '''
+    Absolute-threshold method of the new accuracy standard: check whether the
+    absolute error of the small-value outputs exceeds the threshold.
+    Inputs:
+        abs_err: absolute error between the NPU output and the golden output
+        small_value_mask: mask of the small-value domain
+        small_value_atol: absolute error threshold for the small-value domain
+    Output:
+        proportion of small-value elements whose absolute error exceeds the threshold
+    '''
+    greater_mask = np.greater(abs_err, small_value_atol)
+    err_mask = np.logical_and(greater_mask, small_value_mask)
+    err_cnt = np.sum(err_mask)
+    return 0 if np.sum(small_value_mask) == 0 else err_cnt / np.sum(small_value_mask)
+
+
+def check_norm_value(normal_value_mask, rel_err, rtol):
+    '''
+    Absolute-threshold method of the new accuracy standard: check whether the
+    relative error of the normal-value outputs exceeds the threshold.
+    Inputs:
+        normal_value_mask: mask of the normal-value domain in the NPU and golden outputs
+        rel_err: relative error between the NPU output and the golden output
+        rtol: relative error threshold
+    Output:
+        proportion of normal-value elements whose relative error exceeds the threshold
+    '''
+    err_mask = np.greater(rel_err, rtol)
+    err_mask = np.logical_and(err_mask, normal_value_mask)
+    err_cnt = np.sum(err_mask)
+    return 0 if np.sum(normal_value_mask) == 0 else err_cnt / np.sum(normal_value_mask)
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_compare.py 
b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..6a544de21a01321774c21aaa90397e3ea80fe7be --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_compare.py @@ -0,0 +1,387 @@ +import argparse +import os +import sys +import math +from collections import namedtuple +import pandas as pd + +from ..common.utils import print_info_log, print_warn_log, print_error_log, write_csv, \ + CompareException, create_directory +from ..common.config import msCheckerConfig +from ..compare.compare_utils import CompareConst, API_PRECISION_COMPARE_RESULT_FILE_NAME, \ + API_PRECISION_COMPARE_DETAILS_FILE_NAME, BENCHMARK_COMPARE_SUPPORT_LIST, API_PRECISION_COMPARE_UNSUPPORT_LIST, \ + ApiPrecisionCompareColumn, AbsoluteStandardApi, BinaryStandardApi, BINARY_COMPARE_UNSUPPORT_LIST, \ + convert_str_to_float, CompareMessage +from ..compare.compare_column import ApiPrecisionOutputColumn +from ..run_ut.run_ut import get_validated_result_csv_path +from ...common.file_check import FileCheckConst, FileChecker, change_mode, check_path_before_create + +CompareConfig = namedtuple('CompareConfig', ['npu_csv_path', 'gpu_csv_path', 'result_csv_path', 'details_csv_path']) +unsupported_message = 'This data type does not support benchmark compare.' + +benchmark_algorithms_thresholds = { + 'small_value': { + 'error_threshold': 2, + 'warning_threshold': 1 + }, + 'rmse': { + 'error_threshold': 2, + 'warning_threshold': 1 + }, + 'max_rel_err': { + 'error_threshold': 10, + 'warning_threshold': 1 + }, + 'mean_rel_err': { + 'error_threshold': 2, + 'warning_threshold': 1 + }, + 'eb': { + 'error_threshold': 2, + 'warning_threshold': 1 + } +} + +benchmark_message = { + "small_value_err_status": { + CompareConst.ERROR: "ERROR: 小值域错误比值超过阈值\n", + CompareConst.WARNING: "WARNING: 小值域错误比值超过阈值\n" + }, + "rmse_status": { + CompareConst.ERROR: "ERROR: 均方根误差比值超过阈值\n", + CompareConst.WARNING: "WARNING: 均方根误差比值超过阈值\n" + }, + "max_rel_err_status": { + CompareConst.ERROR: "ERROR: 相对误差最大值比值超过阈值\n", + CompareConst.WARNING: "WARNING: 相对误差最大值比值超过阈值\n" + }, + "mean_rel_err_status": { + CompareConst.ERROR: "ERROR: 相对误差平均值比值超过阈值\n", + CompareConst.WARNING: "WARNING: 相对误差平均值比值超过阈值\n" + } +} + + +class BenchmarkStandard: + def __init__(self, api_name, npu_precision, gpu_precision): + self.api_name = api_name + self.npu_precision = npu_precision + self.gpu_precision = gpu_precision + self.small_value_err_ratio = 1 + self.rmse_ratio = 1 + self.max_rel_err_ratio = 1 + self.mean_rel_err_ratio = 1 + self.eb_ratio = 1 + self.small_value_err_status = CompareConst.PASS + self.rmse_status = CompareConst.PASS + self.max_rel_err_status = CompareConst.PASS + self.mean_rel_err_status = CompareConst.PASS + self.eb_status = CompareConst.PASS + self.check_result_list = [] + self.final_result = CompareConst.PASS + + def __str__(self): + return "%s" % (self.api_name) + + def get_result(self): + self._compare_ratio() + self.small_value_err_status = self._get_status(self.small_value_err_ratio, 'small_value') + self.check_result_list.append(self.small_value_err_status) + self.rmse_status = self._get_status(self.rmse_ratio, 'rmse') + self.check_result_list.append(self.rmse_status) + self.max_rel_err_status = self._get_status(self.max_rel_err_ratio, 'max_rel_err') + self.check_result_list.append(self.max_rel_err_status) + self.mean_rel_err_status = self._get_status(self.mean_rel_err_ratio, 'mean_rel_err') + 
self.check_result_list.append(self.mean_rel_err_status)
+        self.eb_status = self._get_status(self.eb_ratio, 'eb')
+        if CompareConst.ERROR in self.check_result_list:
+            self.final_result = CompareConst.ERROR
+        elif CompareConst.WARNING in self.check_result_list:
+            self.final_result = CompareConst.WARNING
+
+    def _compare_ratio(self):
+        self.small_value_err_ratio = self._calc_ratio(
+            self.npu_precision.get(ApiPrecisionCompareColumn.SMALL_VALUE_ERROR_RATE),
+            self.gpu_precision.get(ApiPrecisionCompareColumn.SMALL_VALUE_ERROR_RATE), 10000.0)
+        self.rmse_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.RMSE),
+                                           self.gpu_precision.get(ApiPrecisionCompareColumn.RMSE), 10000.0)
+        self.max_rel_err_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.MAX_REL_ERR),
+                                                  self.gpu_precision.get(ApiPrecisionCompareColumn.MAX_REL_ERR),
+                                                  10000.0)
+        self.mean_rel_err_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.MEAN_REL_ERR),
+                                                   self.gpu_precision.get(ApiPrecisionCompareColumn.MEAN_REL_ERR),
+                                                   10000.0)
+        self.eb_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.EB),
+                                         self.gpu_precision.get(ApiPrecisionCompareColumn.EB), 10000.0)
+
+    def to_column_value(self):
+        return [self.small_value_err_ratio, self.small_value_err_status, self.rmse_ratio,
+                self.rmse_status, self.max_rel_err_ratio, self.max_rel_err_status, self.mean_rel_err_ratio,
+                self.mean_rel_err_status, self.eb_ratio, self.eb_status]
+
+    @staticmethod
+    def _get_status(ratio, algorithm):
+        error_threshold = benchmark_algorithms_thresholds.get(algorithm).get('error_threshold')
+        warning_threshold = benchmark_algorithms_thresholds.get(algorithm).get('warning_threshold')
+        if ratio > error_threshold:
+            return CompareConst.ERROR
+        elif ratio > warning_threshold:
+            return CompareConst.WARNING
+        return CompareConst.PASS
+
+    @staticmethod
+    def _calc_ratio(x, y, default_value=1.0):
+        x, y = convert_str_to_float(x), convert_str_to_float(y)
+        if math.isclose(y, 0.0):
+            return 1.0 if math.isclose(x, 0.0) else default_value
+        else:
+            return abs(x / y)
+
+
+def write_detail_csv(content, save_path):
+    rows = []
+    content = ["{:.{}f}".format(item, msCheckerConfig.precision) \
+        if isinstance(item, float) else item for item in content]
+    rows.append(content)
+    write_csv(rows, save_path)
+
+
+def api_precision_compare(config):
+    print_info_log("Start compare task")
+    print_info_log(f"Compare task result will be saved in {config.result_csv_path}")
+    print_info_log(f"Compare task detail will be saved in {config.details_csv_path}")
+    try:
+        npu_data = pd.read_csv(config.npu_csv_path)
+    except Exception as err:
+        print_error_log('Open npu csv Error: %s' % str(err))
+        raise CompareException(CompareException.READ_FILE_ERROR) from err
+    check_csv_columns(npu_data.columns, "npu_csv")
+    try:
+        gpu_data = pd.read_csv(config.gpu_csv_path)
+    except Exception as err:
+        print_error_log('Open gpu csv Error: %s' % str(err))
+        raise CompareException(CompareException.READ_FILE_ERROR) from err
+    check_csv_columns(gpu_data.columns, "gpu_csv")
+    detail_csv_title = [ApiPrecisionCompareColumn.get_detail_csv_title()]
+    result_csv_title = [ApiPrecisionCompareColumn.get_result_csv_title()]
+    write_csv(result_csv_title, config.result_csv_path)
+    write_csv(detail_csv_title, config.details_csv_path)
+    try:
+        analyse_csv(npu_data, gpu_data, config)
+    except Exception as err:
+        print_error_log('Analyse csv Error: %s' % str(err))
+    change_mode(config.result_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
+    change_mode(config.details_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
+
+
+def analyse_csv(npu_data, gpu_data, config):
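+    # Flow: iterate over the NPU detail rows, look up the matching API row in the
+    # GPU details, pick a comparison algorithm (binary consistency / absolute
+    # threshold / benchmark comparison) to get a per-row status, then aggregate
+    # the forward/backward statuses per API and write them to the result csv.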
forward_status, backward_status = [], [] + last_api_name, last_api_dtype = None, None + for _, row_npu in npu_data.iterrows(): + message = '' + compare_column = ApiPrecisionOutputColumn() + full_api_name_with_direction_status = row_npu[ApiPrecisionCompareColumn.API_NAME] + row_gpu = gpu_data[gpu_data[ApiPrecisionCompareColumn.API_NAME] == full_api_name_with_direction_status] + _, api_name, _, direction_status, _, _ = full_api_name_with_direction_status.split(".") + if row_gpu.empty: + print_warn_log(f'This API : {full_api_name_with_direction_status} does not exist in the GPU data.') + continue + if len(row_gpu) > 1: + msg = f'This API : {full_api_name_with_direction_status} has multiple records in the GPU data.' + raise CompareException(CompareException.INVALID_DATA_ERROR, msg) + row_gpu = row_gpu.iloc[0] + # 当前API的输出为空(例如反向过程中requires_grad=False),跳过比对 + if row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE].isspace(): + continue + new_status = CompareConst.SPACE + compare_column.api_name = full_api_name_with_direction_status + if row_npu[ + ApiPrecisionCompareColumn.DEVICE_DTYPE] not in BINARY_COMPARE_UNSUPPORT_LIST or api_name in BinaryStandardApi: + new_status = record_binary_consistency_result(api_name, compare_column, row_npu) + elif api_name in AbsoluteStandardApi: + new_status = record_absolute_threshold_result(compare_column, row_npu) + elif row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE] in BENCHMARK_COMPARE_SUPPORT_LIST: + bs = BenchmarkStandard(full_api_name_with_direction_status, row_npu, row_gpu) + new_status = record_benchmark_compare_result(compare_column, bs) + write_detail_csv(compare_column.to_column_value(), config.details_csv_path) + + if last_api_name is not None and api_name != last_api_name: + if last_api_dtype in API_PRECISION_COMPARE_UNSUPPORT_LIST: + message = unsupported_message + write_csv([[last_api_name, "skip", "skip", message]], config.result_csv_path) + forward_status, backward_status = [], [] + message = '' + else: + forward_result = get_api_checker_result(forward_status) + backward_result = get_api_checker_result(backward_status) + message += CompareMessage.get(last_api_name, "") if forward_result == CompareConst.ERROR else "" + write_csv([[last_api_name, forward_result, backward_result, message]], config.result_csv_path) + forward_status, backward_status = [], [] + message = '' + + is_supported = row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE] not in API_PRECISION_COMPARE_UNSUPPORT_LIST + last_api_name = api_name + + last_api_dtype = row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE] + if not is_supported: + continue + + if direction_status == 'forward': + forward_status.append(new_status) + elif direction_status == 'backward': + backward_status.append(new_status) + else: + print_error_log(f"Invalid direction status: {direction_status}") + + if last_api_name is not None: + if last_api_dtype in API_PRECISION_COMPARE_UNSUPPORT_LIST: + message = unsupported_message + write_csv([[last_api_name, "skip", "skip", message]], config.result_csv_path) + else: + forward_result = get_api_checker_result(forward_status) + backward_result = get_api_checker_result(backward_status) + message += CompareMessage.get(last_api_name, "") if forward_result == CompareConst.ERROR else "" + write_csv([[last_api_name, forward_result, backward_result, message]], config.result_csv_path) + + +def check_error_rate(npu_error_rate): + return CompareConst.PASS if convert_str_to_float(npu_error_rate) == 0 else CompareConst.ERROR + + +def get_absolute_threshold_result(row_npu): + 
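+    # Absolute-threshold method: the inf/nan error rate, the relative error rate
+    # (normal values) and the absolute error rate (small values) must all be 0
+    # for the API to pass; any non-zero rate yields error.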
inf_nan_error_ratio = convert_str_to_float(row_npu[ApiPrecisionCompareColumn.INF_NAN_ERROR_RATIO]) + rel_err_ratio = convert_str_to_float(row_npu[ApiPrecisionCompareColumn.REL_ERR_RATIO]) + abs_err_ratio = convert_str_to_float(row_npu[ApiPrecisionCompareColumn.ABS_ERR_RATIO]) + + inf_nan_result = CompareConst.PASS if inf_nan_error_ratio == 0 else CompareConst.ERROR + rel_err_result = CompareConst.PASS if rel_err_ratio == 0 else CompareConst.ERROR + abs_err_result = CompareConst.PASS if abs_err_ratio == 0 else CompareConst.ERROR + + if CompareConst.ERROR in [inf_nan_result, rel_err_result, abs_err_result]: + absolute_threshold_result = CompareConst.ERROR + else: + absolute_threshold_result = CompareConst.PASS + + return { + "inf_nan_error_ratio": inf_nan_error_ratio, + "inf_nan_result": inf_nan_result, + "rel_err_ratio": rel_err_ratio, + "rel_err_result": rel_err_result, + "abs_err_ratio": abs_err_ratio, + "abs_err_result": abs_err_result, + "absolute_threshold_result": absolute_threshold_result, + } + + +def get_api_checker_result(status): + if not status: + return CompareConst.SPACE + for const in (CompareConst.ERROR, CompareConst.WARNING): + if const in status: + return const + return CompareConst.PASS + + +def check_csv_columns(columns, csv_type): + required_columns = ApiPrecisionCompareColumn.to_required_columns() + missing_columns = [column for column in required_columns if column not in columns] + if missing_columns: + msg = f"The following columns {','.join(missing_columns)} are missing in {csv_type}" + raise CompareException(CompareException.INVALID_DATA_ERROR, msg) + + +def record_binary_consistency_result(api_name, compare_column, row_npu): + new_status = check_error_rate(row_npu[ApiPrecisionCompareColumn.ERROR_RATE]) + compare_column.error_rate = row_npu[ApiPrecisionCompareColumn.ERROR_RATE] + compare_column.error_rate_status = new_status + compare_column.compare_result = new_status + compare_column.compare_algorithm = "二进制一致法" + message = '' + if compare_column.error_rate_status == CompareConst.ERROR: + message += "ERROR: 二进制一致错误率超过阈值\n" + message += CompareMessage.get(api_name, "") + compare_column.compare_message = message + return new_status + + +def record_absolute_threshold_result(compare_column, row_npu): + absolute_threshold_result = get_absolute_threshold_result(row_npu) + compare_column.inf_nan_error_ratio = absolute_threshold_result.get("inf_nan_error_ratio") + compare_column.inf_nan_error_ratio_status = absolute_threshold_result.get("inf_nan_result") + compare_column.rel_err_ratio = absolute_threshold_result.get("rel_err_ratio") + compare_column.rel_err_ratio_status = absolute_threshold_result.get("rel_err_result") + compare_column.abs_err_ratio = absolute_threshold_result.get("abs_err_ratio") + compare_column.abs_err_ratio_status = absolute_threshold_result.get("abs_err_result") + compare_column.compare_result = absolute_threshold_result.get("absolute_threshold_result") + compare_column.compare_algorithm = "绝对阈值法" + message = '' + if compare_column.inf_nan_error_ratio_status == CompareConst.ERROR: + message += "ERROR: inf/nan错误率超过阈值\n" + if compare_column.rel_err_ratio_status == CompareConst.ERROR: + message += "ERROR: 相对误差错误率超过阈值\n" + if compare_column.abs_err_ratio_status == CompareConst.ERROR: + message += "ERROR: 绝对误差错误率超过阈值\n" + compare_column.compare_message = message + return compare_column.compare_result + + +def record_benchmark_compare_result(compare_column, bs): + bs.get_result() + compare_column.small_value_err_ratio = bs.small_value_err_ratio +
compare_column.small_value_err_status = bs.small_value_err_status + compare_column.rmse_ratio = bs.rmse_ratio + compare_column.rmse_status = bs.rmse_status + compare_column.max_rel_err_ratio = bs.max_rel_err_ratio + compare_column.max_rel_err_status = bs.max_rel_err_status + compare_column.mean_rel_err_ratio = bs.mean_rel_err_ratio + compare_column.mean_rel_err_status = bs.mean_rel_err_status + compare_column.eb_ratio = bs.eb_ratio + compare_column.eb_status = bs.eb_status + compare_column.compare_result = bs.final_result + compare_column.compare_algorithm = "标杆比对法" + message = '' + for status_attr, messages in benchmark_message.items(): + status_value = getattr(compare_column, status_attr) + if status_value in messages: + message += messages[status_value] + compare_column.compare_message = message + return compare_column.compare_result + + +def _api_precision_compare(parser=None): + if not parser: + parser = argparse.ArgumentParser() + _api_precision_compare_parser(parser) + args = parser.parse_args(sys.argv[1:]) + _api_precision_compare_command(args) + + +def _api_precision_compare_command(args): + npu_csv_path = get_validated_result_csv_path(args.npu_csv_path, 'detail') + gpu_csv_path = get_validated_result_csv_path(args.gpu_csv_path, 'detail') + out_path = os.path.realpath(args.out_path) if args.out_path else "./" + check_path_before_create(out_path) + create_directory(out_path) + out_path_checker = FileChecker(out_path, FileCheckConst.DIR, ability=FileCheckConst.WRITE_ABLE) + out_path = out_path_checker.common_check() + result_csv_path = os.path.join(out_path, API_PRECISION_COMPARE_RESULT_FILE_NAME) + details_csv_path = os.path.join(out_path, API_PRECISION_COMPARE_DETAILS_FILE_NAME) + compare_config = CompareConfig(npu_csv_path, gpu_csv_path, result_csv_path, details_csv_path) + api_precision_compare(compare_config) + + +def _api_precision_compare_parser(parser): + parser.add_argument("-npu", "--npu_csv_path", dest="npu_csv_path", default="", type=str, + help="Accuracy_checking_details.csv generated on the NPU by using the " + "api_accuracy_checker tool.", + required=True) + parser.add_argument("-gpu", "--gpu_csv_path", dest="gpu_csv_path", default="", type=str, + help="Accuracy_checking_details.csv generated on the GPU by using the " + "api_accuracy_checker tool.", + required=False) + parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str, + help="The api precision compare task result out path.", + required=False) + + +if __name__ == '__main__': + _api_precision_compare() + print_info_log("Compare task completed.") diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..efba9c5c02bbcc094b75ce2497d830789744b143 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_standard.yaml @@ -0,0 +1,108 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +AbsoluteThreshStandard: + - mul + - mul_ + - __mul__ + - __imul__ + - __rmul__ + - add + - add_ + - __add__ + - __iadd__ + - __radd__ + - div + - div_ + - __div__ + - __idiv__ + - divide + - divide_ + - leaky_relu + - leaky_relu_ + - prelu + - reciprocal + - reciprocal_ + - rsqrt + - rsqrt_ + - square + - square_ + - sub + - sub_ + - rsub + - __isub__ + - __sub__ + +BinaryCompareStandard: + - abs + - abs_ + - absolute + - absolute_ + - argmin + - bitwise_and + - bitwise_and_ + - broadcast_to + - ceil + - ceil_ + - equal + - fill_ + - flatten + - floor + - floor_ + - gather + - greater + - greater_ + - greater_equal + - greater_equal_ + - isfinite + - isnan + - less + - less_ + - less_equal + - less_equal_ + - logical_and + - logical_and_ + - logical_not + - logical_not_ + - logical_or + - logical_or_ + - masked_fill + - masked_fill_ + - max_pool3d + - maximum + - minimum + - neg + - neg_ + - nonzero + - not_equal + - not_equal_ + - one_hot + - pad + - relu + - reshape + - round + - round_ + - select + - sign + - sign_ + - sort + - tile + - topk + - transpose + - transpose_ + - tril + - tril_ + - triu + - triu_ diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0684bd8e9129653b6b69afcf43ab19207006801f --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml @@ -0,0 +1,390 @@ +mul: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +mul_: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +__mul__: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +__imul__: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +__rmul__: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +add: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +add_: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +__add__: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: 
+ rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +__iadd__: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +__radd__: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +div: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +div_: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +__div__: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +__idiv__: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +divide: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +divide_: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +leaky_relu: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +leaky_relu_: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +prelu: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +reciprocal: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +reciprocal_: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +rsqrt: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + 
torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +rsqrt_: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +square: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +square_: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +sub: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +sub_: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +rsub: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +__isub__: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 +__sub__: + torch.float32: + rtol: 0.000001 + small_value: 0.000001 + small_value_atol: 0.000001 + torch.float16: + rtol: 0.001 + small_value: 0.001 + small_value_atol: 0.001 + torch.bfloat16: + rtol: 0.004 + small_value: 0.001 + small_value_atol: 0.001 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare.py new file mode 100644 index 0000000000000000000000000000000000000000..67aa69e209b7bdbdfb7a5db937bf8e5af8d1b8c8 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare.py @@ -0,0 +1,372 @@ +# 进行比对及结果展示 +import os +import csv +import torch +import numpy as np +from rich.table import Table +from rich.console import Console +from ..common.utils import get_json_contents, write_csv, print_warn_log, Const +from ..compare.compare_utils import CompareConst, check_dtype_comparable, DETAIL_TEST_ROWS, \ + precision_configs, BENCHMARK_COMPARE_SUPPORT_LIST, AbsoluteStandardApi, BinaryStandardApi, apis_threshold +from ..compare.compare_column import CompareColumn +from ..compare.algorithm import get_rmse, get_error_balance, get_max_rel_err, get_mean_rel_err, \ + get_rel_err, get_abs_err, get_max_abs_err, get_rel_err_ratio, cosine_sim, get_rel_err_origin, \ + get_small_value_err_ratio, get_finite_and_infinite_mask, get_small_value_mask, check_inf_nan_value, \ + check_small_value, check_norm_value, get_abs_bench_with_eps +from ..common.config import msCheckerConfig +from ...common.file_check import FileOpen + + +class Comparator: + # consts for result csv + COLUMN_API_NAME = "API 
name" + COLUMN_FORWARD_SUCCESS = "Forward Test Success" + COLUMN_BACKWARD_SUCCESS = "Backward Test Success" + COLUMN_STACK_INFO = "Traceback callstack info" + + def __init__(self, result_csv_path, details_csv_path, is_continue_run_ut, stack_info_json_path=None): + self.save_path = result_csv_path + self.detail_save_path = details_csv_path + if not is_continue_run_ut and not os.path.exists(self.save_path) and not os.path.exists(self.detail_save_path): + self.write_csv_title() + if stack_info_json_path: + self.stack_info = get_json_contents(stack_info_json_path) + else: + self.stack_info = None + + self.test_result_cnt = { + "forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, "success_num": 0, + "total_num": 0, "forward_or_backward_fail_num": 0 + } + + def print_pretest_result(self): + self.get_statistics_from_result_csv() + total_tests = self.test_result_cnt.get("total_num", 0) + if total_tests != 0: + passing_rate = "{:.2%}".format(self.test_result_cnt.get("success_num", 0) / total_tests) + else: + passing_rate = "0%" + + print_warn_log("The follwing tables will be deprecated in the future." + "The following results are for reference only.") + console = Console() + table_total = Table( + show_header=True, title="Overall Statistics", show_lines=True, width=75 + ) + table_total.add_column("Result") + table_total.add_column("Statistics") + table_total.add_row("[green]Pass[/green]", str(self.test_result_cnt.get("success_num", 0))) + table_total.add_row("[yellow]Warning[/yellow]", str(self.test_result_cnt.get("warning_num", 0))) + table_total.add_row("[red]Error[/red]", str(self.test_result_cnt.get("error_num", 0))) + table_total.add_row("Passing Rate", passing_rate) + table_total.add_row("Skip Tests", str(self.test_result_cnt.get("total_skip_num", 0))) + + table_detail = Table( + show_header=True, title="Detail Statistics", show_lines=True, width=75 + ) + table_detail.add_column("Result") + table_detail.add_column("Statistics") + table_detail.add_row("Forward Error", str(self.test_result_cnt.get("forward_fail_num", 0))) + table_detail.add_row("Backward Error", str(self.test_result_cnt.get("backward_fail_num", 0))) + table_detail.add_row("Both Forward & Backward Error", str(self.test_result_cnt.get("forward_and_backward_fail_num", 0))) + + console.print(table_total) + console.print(table_detail) + + def get_statistics_from_result_csv(self): + checklist = [CompareConst.PASS, CompareConst.ERROR, CompareConst.WARNING, CompareConst.SPACE, CompareConst.SKIP, "skip"] + self.test_result_cnt = { + "success_num": 0, "warning_num": 0, "error_num": 0, + "forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, + "total_num": 0, "total_skip_num": 0 + } + with FileOpen(self.save_path, 'r') as file: + reader = csv.reader(file) + result_csv_rows = [row for row in reader] + result_csv_name = os.path.basename(self.save_path) + for item in result_csv_rows[1:]: + if not isinstance(item, list) or len(item) < 3: + raise ValueError("The number of columns in %s is incorrect" % result_csv_name) + if not all(item[i] and item[i] in checklist for i in (1, 2)): + raise ValueError( + "The value in the 2nd or 3rd column of %s is wrong, it must be pass, error, warning, skip, or SPACE" + % result_csv_name) + column1 = item[1] + column2 = item[2] + if column1.upper() == CompareConst.SKIP: + self.test_result_cnt["total_skip_num"] += 1 + continue + self.test_result_cnt["total_num"] += 1 + if column1 == CompareConst.PASS and column2 in [CompareConst.PASS, 
CompareConst.SPACE]: + self.test_result_cnt['success_num'] += 1 + elif column1 == CompareConst.ERROR and column2 == CompareConst.ERROR: + self.test_result_cnt['forward_and_backward_fail_num'] += 1 + self.test_result_cnt['error_num'] += 1 + elif column1 == CompareConst.ERROR: + self.test_result_cnt['forward_fail_num'] += 1 + self.test_result_cnt['error_num'] += 1 + elif column2 == CompareConst.ERROR: + self.test_result_cnt['backward_fail_num'] += 1 + self.test_result_cnt['error_num'] += 1 + elif column1 == CompareConst.WARNING or column2 == CompareConst.WARNING: + self.test_result_cnt['warning_num'] += 1 + + def write_csv_title(self): + summary_test_rows = [[self.COLUMN_API_NAME, self.COLUMN_FORWARD_SUCCESS, + self.COLUMN_BACKWARD_SUCCESS, "Message"]] + if not os.path.exists(self.save_path): + write_csv(summary_test_rows, self.save_path) + if not os.path.exists(self.detail_save_path): + write_csv(DETAIL_TEST_ROWS, self.detail_save_path) + + def write_summary_csv(self, test_result): + test_rows = [] + name = test_result[0] + df_row = list(test_result[:3]) + if test_result[1] == "SKIP" or test_result[2] == "SKIP": + df_row.append(test_result[3]) + if self.stack_info: + # stack info is appended to the data row itself; the csv title is written once in write_csv_title + stack_info = "\n".join(self.stack_info[name]) + df_row.append(stack_info) + test_rows.append(df_row) + write_csv(test_rows, self.save_path) + + def write_detail_csv(self, test_result): + test_rows = [] + + subject_prefix = test_result[0] + fwd_result = test_result[3] + bwd_result = test_result[4] + if isinstance(fwd_result, list): + for i, test_subject in enumerate(fwd_result): + subject = subject_prefix + ".forward.output." + str(i) + test_subject = ["{:.{}f}".format(item, msCheckerConfig.precision) + if isinstance(item, float) else item for item in test_subject] + test_rows.append([subject] + list(test_subject)) + if isinstance(bwd_result, list): + for i, test_subject in enumerate(bwd_result): + subject = subject_prefix + ".backward.output."
+ str(i) + test_subject = ["{:.{}f}".format(item, msCheckerConfig.precision) + if isinstance(item, float) else item for item in test_subject] + test_rows.append([subject] + list(test_subject)) + + write_csv(test_rows, self.detail_save_path) + + def record_results(self, *args): + self.write_summary_csv(args) + self.write_detail_csv(args) + + def compare_output(self, full_api_name, bench_output, device_output, bench_grad=None, npu_grad=None): + _, api_name, _ = full_api_name.split(Const.SEP) + compare_func = self._compare_dropout if "dropout" in full_api_name else self._compare_core_wrapper + fwd_success_status, fwd_compare_alg_results = compare_func(api_name, bench_output, device_output) + if not (bench_grad and npu_grad): + bwd_success_status, bwd_compare_alg_results = (CompareConst.SPACE, []) + else: + if "dropout" in full_api_name: + bwd_success_status, bwd_compare_alg_results = compare_func(api_name, bench_grad[0], npu_grad[0]) + else: + bwd_success_status, bwd_compare_alg_results = compare_func(api_name, bench_grad, npu_grad) + self.record_results(full_api_name, fwd_success_status, bwd_success_status if bwd_compare_alg_results is not None else CompareConst.SPACE, fwd_compare_alg_results, bwd_compare_alg_results) + return fwd_success_status == CompareConst.PASS, bwd_success_status == CompareConst.PASS \ + or bwd_success_status == CompareConst.SPACE + + def _compare_core_wrapper(self, api_name, bench_output, device_output): + detailed_result_total = [] + test_final_success = CompareConst.PASS + if isinstance(bench_output, (list, tuple)): + status, compare_result, message = [], [], [] + if len(bench_output) != len(device_output): + status = [CompareConst.ERROR] + message = ["bench and npu output structure is different."] + else: + for b_out_i, n_out_i in zip(bench_output, device_output): + status_i, compare_result_i, message_i = self._compare_core(api_name, b_out_i, n_out_i) + status.append(status_i) + compare_result.append(compare_result_i) + message.append(message_i) + else: + status, compare_result, message = self._compare_core(api_name, bench_output, device_output) + if not isinstance(status, list): + detailed_result_total.append(compare_result.to_column_value(status, message)) + if status == CompareConst.ERROR: + test_final_success = CompareConst.ERROR + elif status == CompareConst.WARNING: + test_final_success = CompareConst.WARNING + else: + for item, item_status in enumerate(status): + detailed_result_total.append(compare_result[item].to_column_value(item_status, message[item])) + if item_status == CompareConst.ERROR: + test_final_success = CompareConst.ERROR + elif item_status == CompareConst.WARNING: + test_final_success = CompareConst.WARNING + return test_final_success, detailed_result_total + + def _compare_core(self, api_name, bench_output, device_output): + compare_column = CompareColumn() + if not isinstance(bench_output, type(device_output)): + return CompareConst.ERROR, compare_column, "bench and npu output type is different." + elif isinstance(bench_output, dict): + b_keys, n_keys = set(bench_output.keys()), set(device_output.keys()) + if b_keys != n_keys: + return CompareConst.ERROR, compare_column, "bench and npu output dict keys are different." 
+ else: + status, compare_result, message = self._compare_core(api_name, list(bench_output.values()), + list(device_output.values())) + elif isinstance(bench_output, torch.Tensor): + copy_bench_out = bench_output.detach().clone() + copy_device_output = device_output.detach().clone() + compare_column.bench_type = str(copy_bench_out.dtype) + compare_column.npu_type = str(copy_device_output.dtype) + compare_column.shape = tuple(device_output.shape) + status, compare_result, message = self._compare_torch_tensor(api_name, copy_bench_out, copy_device_output, + compare_column) + elif isinstance(bench_output, (bool, int, float, str)): + compare_column.bench_type = str(type(bench_output)) + compare_column.npu_type = str(type(device_output)) + status, compare_result, message = self._compare_builtin_type(bench_output, device_output, compare_column) + elif bench_output is None: + return CompareConst.SKIP, compare_column, "Bench output is None, skip this test." + else: + return CompareConst.PASS, compare_column, \ + "Unexpected output type in compare_core: {}".format(type(bench_output)) + + return status, compare_result, message + + def _compare_torch_tensor(self, api_name, bench_output, device_output, compare_column): + cpu_shape = bench_output.shape + npu_shape = device_output.shape + npu_dtype = device_output.dtype + if npu_dtype == torch.bfloat16: + bench_output = bench_output.to(torch.float32) + device_output = device_output.to(torch.float32) + bench_output = bench_output.numpy() + device_output = device_output.cpu().numpy() + if cpu_shape != npu_shape: + return CompareConst.ERROR, compare_column, f"The shape of bench{str(cpu_shape)} " \ + f"and npu{str(npu_shape)} not equal." + if not check_dtype_comparable(bench_output, device_output): + return CompareConst.ERROR, compare_column, f"Bench out dtype is {bench_output.dtype} but " \ + f"npu output dtype is {device_output.dtype}, cannot compare." + message = "" + if bench_output.dtype in [bool, np.uint8, np.int8, np.int16, np.uint16, np.uint32, np.int32, + np.int64, np.uint64]: + message += f"Compare algorithm is not supported for {bench_output.dtype} data. " \ + f"Only judged by Error Rate."
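+            # _compare_bool_tensor returns (error_rate, status, message); for integer/bool data the element-wise error rate is the only metric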
+ err_rate, status, msg = self._compare_bool_tensor(bench_output, device_output) + message += msg + "\n" + compare_column.error_rate = err_rate + return status, compare_column, message + else: + status, compare_column, message = self._compare_float_tensor(api_name, bench_output, device_output, + compare_column, npu_dtype) + return status, compare_column, message + + def _compare_float_tensor(self, api_name, bench_output, device_output, compare_column, dtype): + message = "" + abs_bench, abs_bench_with_eps = get_abs_bench_with_eps(bench_output, dtype) + abs_err = get_abs_err(bench_output, device_output) + if str(dtype) in BENCHMARK_COMPARE_SUPPORT_LIST: + both_finite_mask, inf_nan_mask = get_finite_and_infinite_mask(bench_output, device_output) + if api_name in BinaryStandardApi: + err_rate, _, _ = self._compare_bool_tensor(bench_output, device_output) + compare_column.error_rate = err_rate + elif api_name in AbsoluteStandardApi: + small_value_threshold, small_value_atol, rtol = self._get_absolute_threshold_attribute( + api_name, str(dtype)) + rel_err = abs_err / abs_bench_with_eps + small_value_mask = get_small_value_mask(abs_bench, both_finite_mask, small_value_threshold) + normal_value_mask = np.logical_and(both_finite_mask, np.logical_not(small_value_mask)) + compare_column.inf_nan_error_ratio = check_inf_nan_value(inf_nan_mask, bench_output, device_output, dtype, rtol) + compare_column.rel_err_ratio = check_norm_value(normal_value_mask, rel_err, rtol) + compare_column.abs_err_ratio = check_small_value(abs_err, small_value_mask, small_value_atol) + else: + dtype_config = precision_configs.get(dtype) + small_value_mask = get_small_value_mask(abs_bench, both_finite_mask, dtype_config['small_value'][0]) + abs_err_greater_mask = np.greater(abs_err, dtype_config['small_value_atol'][0]) + compare_column.small_value_err_ratio = get_small_value_err_ratio(small_value_mask, abs_err_greater_mask) + rel_err = get_rel_err(abs_err, abs_bench_with_eps, small_value_mask, inf_nan_mask) + compare_column.RMSE = get_rmse(abs_err, np.logical_or(inf_nan_mask, small_value_mask)) + compare_column.EB = get_error_balance(bench_output, device_output) + compare_column.Max_rel_error = get_max_rel_err(rel_err) + compare_column.Mean_rel_error = get_mean_rel_err(rel_err) + + cos_res, cos_status, msg = cosine_sim(bench_output, device_output) + compare_column.cosine_sim = cos_res + message += msg + "\n" + if not cos_status: + message += "Cosine similarity is less than 0.99, consider as error, skip other check and set to SPACE.\n" + return CompareConst.ERROR, compare_column, message + + max_abs_res, max_abs_status = get_max_abs_err(abs_err) + compare_column.max_abs_err = max_abs_res + if max_abs_status: + message += "Max abs error is less than 0.001, consider as pass, skip other check and set to SPACE.\n" + return CompareConst.PASS, compare_column, message + + rel_err_orign = get_rel_err_origin(abs_err, abs_bench_with_eps) + if dtype in [torch.float16, torch.bfloat16]: + hundred_res, hundred_status = get_rel_err_ratio(rel_err_orign, 0.01) + compare_column.rel_err_hundredth = hundred_res + if not hundred_status: + message += "Relative error is greater than 0.01, consider as error, skip other check and set to SPACE.\n" + return CompareConst.ERROR, compare_column, message + thousand_res, thousand_status = get_rel_err_ratio(rel_err_orign, 0.001) + compare_column.rel_err_thousandth = thousand_res + if dtype in [torch.float16, torch.bfloat16]: + if thousand_status: + message += "Relative error is less than 0.001, consider as 
pass, skip other check and set to SPACE.\n" + return CompareConst.PASS, compare_column, message + message += "Relative error is greater than 0.001, consider as warning, skip other check and set to SPACE.\n" + return CompareConst.WARNING, compare_column, message + ten_thousand_res, ten_thousand_status = get_rel_err_ratio(rel_err_orign, 0.0001) + compare_column.rel_err_ten_thousandth = ten_thousand_res + if dtype in [torch.float32, torch.float64]: + if not thousand_status: + message += "Relative error is greater than 0.001, consider as error, skip other check and set to SPACE.\n" + return CompareConst.ERROR, compare_column, message + if not ten_thousand_status: + message += "Relative error is greater than 0.0001, consider as warning, skip other check and set to SPACE.\n" + return CompareConst.WARNING, compare_column, message + message += "Relative error is less than 0.0001, consider as pass.\n" + return CompareConst.PASS, compare_column, message + + @staticmethod + def _compare_dropout(api_name, bench_output, device_output): + tensor_num = bench_output.numel() + if tensor_num >= 100: + if abs((bench_output == 0).sum() - (device_output == 0).cpu().sum()) / tensor_num < 0.1: + return CompareConst.PASS, 1 + else: + return CompareConst.ERROR, 0 + else: + return CompareConst.PASS, 1 + + @staticmethod + def _compare_builtin_type(bench_output, device_output, compare_column): + if not isinstance(bench_output, (bool, int, float, str)): + return CompareConst.PASS, compare_column, "" + if bench_output != device_output: + return CompareConst.ERROR, compare_column, "" + compare_column.error_rate = 0 + return CompareConst.PASS, compare_column, "" + + + @staticmethod + def _compare_bool_tensor(bench_output, device_output): + error_nums = (bench_output != device_output).sum() + if bench_output.size == 0: + return CompareConst.NAN, CompareConst.ERROR, "There is no bench calculation result."
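+        # error rate = fraction of elements whose bench and device values differ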
+ error_rate = float(error_nums / bench_output.size) + result = CompareConst.PASS if error_rate == 0 else CompareConst.ERROR + return error_rate, result, "" + + @staticmethod + def _get_absolute_threshold_attribute(api_name, dtype): + small_value_threshold = apis_threshold.get(api_name).get(dtype).get('small_value') + small_value_atol = apis_threshold.get(api_name).get(dtype).get('small_value_atol') + rtol = apis_threshold.get(api_name).get(dtype).get('rtol') + return small_value_threshold, small_value_atol, rtol diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_column.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_column.py new file mode 100644 index 0000000000000000000000000000000000000000..97cf8226bd1ea6c9a668abd91719fd2662b5183b --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_column.py @@ -0,0 +1,63 @@ +from .compare_utils import CompareConst + + +class CompareColumn: + def __init__(self): + self.bench_type = CompareConst.SPACE + self.npu_type = CompareConst.SPACE + self.shape = CompareConst.SPACE + self.cosine_sim = CompareConst.SPACE + self.max_abs_err = CompareConst.SPACE + self.rel_err_hundredth = CompareConst.SPACE + self.rel_err_thousandth = CompareConst.SPACE + self.rel_err_ten_thousandth = CompareConst.SPACE + self.error_rate = CompareConst.SPACE + self.EB = CompareConst.SPACE + self.RMSE = CompareConst.SPACE + self.small_value_err_ratio = CompareConst.SPACE + self.Max_rel_error = CompareConst.SPACE + self.Mean_rel_error = CompareConst.SPACE + self.inf_nan_error_ratio = CompareConst.SPACE + self.rel_err_ratio = CompareConst.SPACE + self.abs_err_ratio = CompareConst.SPACE + + def to_column_value(self, is_pass, message): + return [self.bench_type, self.npu_type, self.shape, self.cosine_sim, self.max_abs_err, self.rel_err_hundredth, + self.rel_err_thousandth, self.rel_err_ten_thousandth, self.error_rate, self.EB, self.RMSE, + self.small_value_err_ratio, self.Max_rel_error, self.Mean_rel_error, self.inf_nan_error_ratio, + self.rel_err_ratio, self.abs_err_ratio, is_pass, message] + + +class ApiPrecisionOutputColumn: + def __init__(self): + self.api_name = CompareConst.SPACE + self.small_value_err_ratio = CompareConst.SPACE + self.small_value_err_status = CompareConst.SPACE + self.rmse_ratio = CompareConst.SPACE + self.rmse_status = CompareConst.SPACE + self.max_rel_err_ratio = CompareConst.SPACE + self.max_rel_err_status = CompareConst.SPACE + self.mean_rel_err_ratio = CompareConst.SPACE + self.mean_rel_err_status = CompareConst.SPACE + self.eb_ratio = CompareConst.SPACE + self.eb_status = CompareConst.SPACE + self.inf_nan_error_ratio = CompareConst.SPACE + self.inf_nan_error_ratio_status = CompareConst.SPACE + self.rel_err_ratio = CompareConst.SPACE + self.rel_err_ratio_status = CompareConst.SPACE + self.abs_err_ratio = CompareConst.SPACE + self.abs_err_ratio_status = CompareConst.SPACE + self.error_rate = CompareConst.SPACE + self.error_rate_status = CompareConst.SPACE + self.compare_result = CompareConst.SPACE + self.compare_algorithm = CompareConst.SPACE + self.compare_message = CompareConst.SPACE + + def to_column_value(self): + return [self.api_name, self.small_value_err_ratio, self.small_value_err_status, self.rmse_ratio, + self.rmse_status, self.max_rel_err_ratio, self.max_rel_err_status, self.mean_rel_err_ratio, + self.mean_rel_err_status, self.eb_ratio, self.eb_status, self.inf_nan_error_ratio, + self.inf_nan_error_ratio_status, self.rel_err_ratio, 
self.rel_err_ratio_status, self.abs_err_ratio, + self.abs_err_ratio_status, self.error_rate, self.error_rate_status, self.compare_result, + self.compare_algorithm, self.compare_message] + \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_utils.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5511da724446187e2dd886448bf6b26ea7b7b369 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_utils.py @@ -0,0 +1,190 @@ +import time +import os +import numpy as np +import torch +import yaml +from ..common.utils import Const, print_warn_log, CompareException +from ...common.file_check import FileOpen + + +current_time = time.strftime("%Y%m%d%H%M%S") +API_PRECISION_COMPARE_RESULT_FILE_NAME = "api_precision_compare_result_" + current_time + ".csv" +API_PRECISION_COMPARE_DETAILS_FILE_NAME = "api_precision_compare_details_" + current_time + ".csv" +BENCHMARK_COMPARE_SUPPORT_LIST = ['torch.float16', 'torch.bfloat16', 'torch.float32'] +API_PRECISION_COMPARE_UNSUPPORT_LIST = ['torch.float64', 'torch.complex64', 'torch.complex128'] +BINARY_COMPARE_UNSUPPORT_LIST = BENCHMARK_COMPARE_SUPPORT_LIST + API_PRECISION_COMPARE_UNSUPPORT_LIST + + +cur_path = os.path.dirname(os.path.realpath(__file__)) +standard_yaml_path = os.path.join(cur_path, "api_precision_standard.yaml") +with FileOpen(standard_yaml_path, 'r') as f: + Apis = yaml.safe_load(f) + AbsoluteStandardApi = Apis.get('AbsoluteThreshStandard') + BinaryStandardApi = Apis.get('BinaryCompareStandard') + + +threshold_yaml_path = os.path.join(cur_path, "api_precision_threshold.yaml") +with FileOpen(threshold_yaml_path, 'r') as f: + apis_threshold = yaml.safe_load(f) + + +DETAIL_TEST_ROWS = [[ + "API Name", "Bench Dtype", "DEVICE Dtype", "Shape", + "余弦相似度", + "最大绝对误差", + "双百指标", + "双千指标", + "双万指标", + "二进制一致错误率", + "误差均衡性", + "均方根误差", + "小值域错误占比", + "相对误差最大值", + "相对误差平均值", + "inf/nan错误率", + "相对误差错误率", + "绝对误差错误率", + "Status", + "Message" + ]] + + +precision_configs = { + torch.float16 : { + 'small_value' : [ + 1e-3 + ], + 'small_value_atol' : [ + 1e-5 + ] + }, + torch.bfloat16: { + 'small_value' : [ + 1e-3 + ], + 'small_value_atol' : [ + 1e-5 + ] + }, + torch.float32:{ + 'small_value' : [ + 1e-6 + ], + 'small_value_atol' : [ + 1e-9 + ] + } +} + + +class CompareConst: + NAN = np.nan + NA = "N/A" + PASS = 'pass' + WARNING = 'warning' + ERROR = 'error' + SKIP = 'SKIP' + TRUE = 'TRUE' + FALSE = 'FALSE' + BFLOAT16_MIN = -3.3895313892515355e+38 + BFLOAT16_MAX = 3.3895313892515355e+38 + BFLOAT16_EPS = 2 ** -8 + SPACE = " " + + +class ApiPrecisionCompareColumn: + API_NAME = 'API Name' + DEVICE_DTYPE = 'DEVICE Dtype' + SMALL_VALUE_ERROR_RATE = '小值域错误占比' + RMSE = '均方根误差' + MAX_REL_ERR = '相对误差最大值' + MEAN_REL_ERR = '相对误差平均值' + EB = '误差均衡性' + SMALL_VALUE_ERROR_RATIO = '小值域错误比值' + SMALL_VALUE_ERROR_STATUS = '小值域判定结果' + RMSE_RATIO = '均方根误差比值' + RMSE_STATUS = '均方根误差判定结果' + MAX_REL_ERR_RATIO = '相对误差最大值比值' + MAX_REL_ERR_STATUS = '相对误差最大值判定结果' + MEAN_REL_ERR_RATIO = '相对误差平均值比值' + MEAN_REL_ERR_STATUS = '相对误差平均值判定结果' + EB_RATIO = '误差均衡性比值' + EB_STATUS = '误差均衡性判定结果' + ERROR_RATE = '二进制一致错误率' + ERROR_RATE_STATUS = '二进制一致错误率判定结果' + INF_NAN_ERROR_RATIO = 'inf/nan错误率' + INF_NAN_ERROR_RATIO_STATUS = 'inf/nan判定结果' + REL_ERR_RATIO = '相对误差错误率' + REL_ERR_RATIO_STATUS = '相对误差判定结果' + ABS_ERR_RATIO = '绝对误差错误率' + ABS_ERR_RATIO_STATUS = '绝对误差判定结果' + FINAL_RESULT = '比对结果' + ALGORITHM = '比对算法' + 
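# column titles used for the per-API summary result csv +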
FORWWARD_STATUS = 'Forward Test Success' + BACKWARD_STATUS = 'Backward Test Success' + MESSAGE = 'Message' + + @staticmethod + def to_required_columns(): + return [ApiPrecisionCompareColumn.API_NAME, ApiPrecisionCompareColumn.DEVICE_DTYPE, + ApiPrecisionCompareColumn.SMALL_VALUE_ERROR_RATE, ApiPrecisionCompareColumn.RMSE, + ApiPrecisionCompareColumn.MAX_REL_ERR, ApiPrecisionCompareColumn.MEAN_REL_ERR, ApiPrecisionCompareColumn.EB, + ApiPrecisionCompareColumn.ERROR_RATE, ApiPrecisionCompareColumn.INF_NAN_ERROR_RATIO, + ApiPrecisionCompareColumn.REL_ERR_RATIO, ApiPrecisionCompareColumn.ABS_ERR_RATIO] + + @staticmethod + def get_detail_csv_title(): + return [ApiPrecisionCompareColumn.API_NAME, + ApiPrecisionCompareColumn.SMALL_VALUE_ERROR_RATIO, ApiPrecisionCompareColumn.SMALL_VALUE_ERROR_STATUS, + ApiPrecisionCompareColumn.RMSE_RATIO, ApiPrecisionCompareColumn.RMSE_STATUS, + ApiPrecisionCompareColumn.MAX_REL_ERR_RATIO, ApiPrecisionCompareColumn.MAX_REL_ERR_STATUS, + ApiPrecisionCompareColumn.MEAN_REL_ERR_RATIO, ApiPrecisionCompareColumn.MEAN_REL_ERR_STATUS, + ApiPrecisionCompareColumn.EB_RATIO, ApiPrecisionCompareColumn.EB_STATUS, + ApiPrecisionCompareColumn.INF_NAN_ERROR_RATIO, ApiPrecisionCompareColumn.INF_NAN_ERROR_RATIO_STATUS, + ApiPrecisionCompareColumn.REL_ERR_RATIO, ApiPrecisionCompareColumn.REL_ERR_RATIO_STATUS, + ApiPrecisionCompareColumn.ABS_ERR_RATIO, ApiPrecisionCompareColumn.ABS_ERR_RATIO_STATUS, + ApiPrecisionCompareColumn.ERROR_RATE, ApiPrecisionCompareColumn.ERROR_RATE_STATUS, + ApiPrecisionCompareColumn.FINAL_RESULT, ApiPrecisionCompareColumn.ALGORITHM, ApiPrecisionCompareColumn.MESSAGE] + + @staticmethod + def get_result_csv_title(): + return [ApiPrecisionCompareColumn.API_NAME, ApiPrecisionCompareColumn.FORWWARD_STATUS, + ApiPrecisionCompareColumn.BACKWARD_STATUS, ApiPrecisionCompareColumn.MESSAGE] + + +CompareMessage = { + "topk" : "在npu上,topk的入参sorted=False时不生效,会返回有序tensor,而cpu上会返回无序tensor。 如果topk精度不达标,请检查是否是该原因导致的。" +} + + +def check_dtype_comparable(x, y): + if x.dtype in Const.FLOAT_TYPE: + if y.dtype in Const.FLOAT_TYPE: + return True + return False + if x.dtype in Const.BOOL_TYPE: + if y.dtype in Const.BOOL_TYPE: + return True + return False + if x.dtype in Const.INT_TYPE: + if y.dtype in Const.INT_TYPE: + return True + return False + print_warn_log(f"Compare: Unexpected dtype {x.dtype}, {y.dtype}") + return False + + +def convert_str_to_float(input_data): + if isinstance(input_data, str) and input_data.strip() == "": + msg = 'ERROR: Input data is an empty string' + raise CompareException(CompareException.INVALID_DATA_ERROR, msg) + try: + float_data = float(input_data) + if str(float_data) in ('inf', '-inf', 'nan'): + msg = 'ERROR: Input data is either "inf", "-inf", "nan"' + raise CompareException(CompareException.INVALID_DATA_ERROR, msg) + return float_data + except ValueError as e: + msg = 'ERROR: Input data cannot be converted to float' + raise CompareException(CompareException.INVALID_DATA_ERROR, msg) from e + \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2582c4539c9408102d3496242651cedeeefeb22 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml @@ -0,0 +1,9 @@ +dump_path: './' +real_data: False +enable_dataloader: False +target_iter: [1] +white_list: [] +error_data_path: './' +jit_compile: True +precision: 14 + \ No newline at 
end of file diff --git "a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/doc/API Accuracy Checker\351\242\204\346\243\200\345\267\245\345\205\267\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" "b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/doc/API Accuracy Checker\351\242\204\346\243\200\345\267\245\345\205\267\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" new file mode 100644 index 0000000000000000000000000000000000000000..740f72589a034476586c342d9709b05ea44a93d3 --- /dev/null +++ "b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/doc/API Accuracy Checker\351\242\204\346\243\200\345\267\245\345\205\267\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" @@ -0,0 +1,64 @@ +# API Accuracy Checker预检工具标准性能基线报告 + +## 环境信息 + +NPU:Atlas A2 训练系列产品 + +CPU: + +![输入图片说明](https://foruda.gitee.com/images/1707274376423595920/8d725bef_10012209.png) + +Torch:2.1.0 + +CANN:8.0.T2 + +除上述环境信息影响性能外,API的数量、种类以及Shape都会对性能产生影响,因此本次选取指定网络进行测试。 + +## 多进程使用说明 + +1. 因预检工具run ut会在NPU和CPU上分别运行每个API的计算,开启多进程后会将指定总进程数平均分配给指定的NPU处理。经测试多进程数量需控制在每张卡不超过8个进程,8卡总计不超过63个进程。建议大模型场景下使用8卡56个进程。 +2. 进程数过多可能会造成环境的内存占用过高导致环境崩溃或NPU上out of memory,若发生此类情况请减少总进程数。 +3. 因子进程拉起需要额外耗时,小模型场景下不建议开过多进程,过多进程性能提升可能并不明显。 +4. 若发生上述情况导致运行中断,可以使用断点续训功能减少进程数后重新运行。 + +## 模型信息和性能基线 + +以下场景的性能基线测试数据均为多次测试后取平均值,因此实际运行时性能数据可能会根据环境状态稍有浮动。 + +### YOLOV5 + +API:442个,主要数据类型:FLOAT32 + +单进程run_ut耗时:3m55s + +单卡8进程耗时:2m11s + +当API数量较少时,多进程计算性能提升不明显,因为拉起子进程需要额外耗时,此场景下不建议开过多进程。 + +### GPT-3 + +NUM_LAYER:1,API:170个, 主要数据类型:FLOAT16 + +单进程run_ut耗时:10m22s + +单卡8进程耗时:3m50s + +4卡16进程耗时:1m50s + +### GPT-3 + +NUM_LAYER:8,API:16782个,主要数据类型:FLOAT16 + +单进程run_ut耗时:大于2天(未跑完) + +8卡56个进程耗时:1h33m + +当API数量很多时多进程下性能提升明显,可以将天级的运行时长缩短至小时级。 + +### GLM + +API:6035个,主要数据类型:FLOAT16 + +单进程run_ut耗时:大于2天(未跑完) + +8卡56个进程耗时:2h40m diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/.keep b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c9602292b85f753fd132634b98c74c76460997b0 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/__init__.py @@ -0,0 +1 @@ +__all__ = ['set_dump_switch'] diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/api_info.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/api_info.py new file mode 100644 index 0000000000000000000000000000000000000000..7452cec74e80c812902341ef2af13d3f29c5f10c --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/api_info.py @@ -0,0 +1,237 @@ +# 定义API INFO,保存基本信息,用于后续结构体的落盘,注意考虑random场景及真实数据场景 +import os +import inspect +import torch +import numpy as np +from ..common.config import msCheckerConfig +from ..common.utils import print_error_log, write_pt, create_directory, DumpException, \ + get_real_data_path +from ...common.file_check import check_path_before_create + + +def get_tensor_extremum(data, operator): + if data.dtype is torch.bool: + if data.numel() == 0: + return False, False + if operator == 'max': + return True in data, True in data + elif operator == 'min': + return False not in data, False
not in data + data_clone = data.float().clone().detach() + if operator == 'max': + max_result = torch._C._VariableFunctionsClass.max(data_clone).item() + if np.isinf(max_result) or np.isnan(max_result): + return handle_tensor_extremum_nan_inf(data_clone, operator), max_result + else: + return max_result, max_result + else: + min_result = torch._C._VariableFunctionsClass.min(data_clone).item() + if np.isinf(min_result) or np.isnan(min_result): + return handle_tensor_extremum_nan_inf(data_clone, operator), min_result + else: + return min_result, min_result + + +def handle_tensor_extremum_nan_inf(data_clone, operator): + data_nan = torch._C._VariableFunctionsClass.isnan(data_clone) + if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel(): + return float('nan') + finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone) + if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0: + finite_values = data_clone[finite_mask] + return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \ + torch._C._VariableFunctionsClass.min(finite_values).item() + else: + data_no_nan = data_clone[~data_nan] + return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \ + torch._C._VariableFunctionsClass.min(data_no_nan).item() + + +def get_type_name(name): + left = name.index("'") + right = name.rindex("'") + return name[left + 1: right] + + +def transfer_types(data, dtype): + if 'int' in dtype or 'bool' in dtype: + return int(data) + else: + return float(data) + + +def is_builtin_class(element): + return element is None or isinstance(element, (bool, int, float, str, slice)) + + +def analyze_device_in_kwargs(element): + single_arg = {} + single_arg.update({'type': 'torch.device'}) + if not isinstance(element, str): + if hasattr(element, "index"): + device_value = element.type + ":" + str(element.index) + else: + device_value = element.type + single_arg.update({'value': device_value}) + else: + single_arg.update({'value': element}) + return single_arg + + +def analyze_dtype_in_kwargs(element): + single_arg = {} + single_arg.update({'type': 'torch.dtype'}) + single_arg.update({'value': str(element)}) + return single_arg + + +class APIInfo: + def __init__(self, api_name, save_path, is_save_data=False): + self.api_name = api_name + self.torch_object_key = {'device': analyze_device_in_kwargs, 'dtype': analyze_dtype_in_kwargs} + self.rank = os.getpid() + self.is_save_data = is_save_data + self.save_path = save_path + self.args_num = 0 + + @staticmethod + def get_full_save_path(save_path, dir_name, contain_step=False): + if contain_step: + from ..dump.dump import DumpUtil + step_dir = "step" + str(DumpUtil.call_num - 1 if msCheckerConfig.enable_dataloader else DumpUtil.call_num) + rank_dir = f"rank{os.getpid()}" + return os.path.join(save_path, step_dir, dir_name, rank_dir) + else: + return os.path.join(save_path, dir_name) + + def analyze_element(self, element): + if isinstance(element, (list, tuple)): + out = [] + for item in element: + out.append(self.analyze_element(item)) + return out + + if isinstance(element, dict): + out_dict = {} + for key, value in element.items(): + if key in self.torch_object_key.keys(): + fun = self.torch_object_key[key] + out_dict[key] = fun(value) + else: + out_dict[key] = self.analyze_element(value) + return out_dict + + converted_numpy, numpy_type = self._convert_numpy_to_builtin(element) + if converted_numpy is not element: + return
self._analyze_numpy(converted_numpy, numpy_type) + + if isinstance(element, torch.Tensor): + return self._analyze_tensor(element) + + if is_builtin_class(element): + return self._analyze_builtin(element) + + msg = f"Type {type(element)} is unsupported at analyze_element" + print_error_log(msg) + raise DumpException(DumpException.INVALID_DATA_ERROR) + + def _analyze_tensor(self, arg): + single_arg = {} + if not self.is_save_data: + single_arg.update({'type': 'torch.Tensor'}) + single_arg.update({'dtype': str(arg.dtype)}) + single_arg.update({'shape': arg.shape}) + max_handle, max_origin = get_tensor_extremum(arg, 'max') + single_arg.update({'Max': transfer_types(max_handle, str(arg.dtype))}) + single_arg.update({'Max_origin': transfer_types(max_origin, str(arg.dtype))}) + min_handle, min_origin = get_tensor_extremum(arg, 'min') + single_arg.update({'Min': transfer_types(min_handle, str(arg.dtype))}) + single_arg.update({'Min_origin': transfer_types(min_origin, str(arg.dtype))}) + single_arg.update({'requires_grad': arg.requires_grad}) + else: + api_args = self.api_name + '.' + str(self.args_num) + check_path_before_create(self.save_path) + create_directory(self.save_path) + file_path = os.path.join(self.save_path, f'{api_args}.pt') + pt_path = write_pt(file_path, arg.contiguous().cpu().detach()) + self.args_num += 1 + real_data_path = get_real_data_path(pt_path) + single_arg.update({'type': 'torch.Tensor'}) + single_arg.update({'datapath': real_data_path}) + single_arg.update({'requires_grad': arg.requires_grad}) + return single_arg + + def _analyze_builtin(self, arg): + single_arg = {} + if self.is_save_data: + self.args_num += 1 + if isinstance(arg, slice): + single_arg.update({'type': "slice"}) + single_arg.update({'value': [arg.start, arg.stop, arg.step]}) + else: + single_arg.update({'type': get_type_name(str(type(arg)))}) + single_arg.update({'value': arg}) + return single_arg + + def _analyze_numpy(self, value, numpy_type): + single_arg = {} + if self.is_save_data: + self.args_num += 1 + single_arg.update({'type': numpy_type}) + single_arg.update({'value': value}) + return single_arg + + def _convert_numpy_to_builtin(self, arg): + type_mapping = { + np.integer: int, + np.floating: float, + np.bool_: bool, + np.complexfloating: complex, + np.str_: str, + np.bytes_: bytes, + np.unicode_: str + } + for numpy_type, builtin_type in type_mapping.items(): + if isinstance(arg, numpy_type): + return builtin_type(arg), get_type_name(str(type(arg))) + return arg, '' + + +class ForwardAPIInfo(APIInfo): + def __init__(self, name, args, kwargs): + super().__init__(name, + self.get_full_save_path(msCheckerConfig.dump_path, 'forward_real_data', contain_step=True), + is_save_data=msCheckerConfig.real_data) + self.api_info_struct = {} + self.stack_info_struct = {} + self.analyze_api_input(args, kwargs) + self.analyze_api_call_stack() + + def analyze_api_input(self, args, kwargs): + args_info_list = self.analyze_element(args) + kwargs_info_dict = self.analyze_element(kwargs) + self.api_info_struct = {self.api_name: {"args": args_info_list, "kwargs": kwargs_info_dict}} + + def analyze_api_call_stack(self): + stack_str = [] + for (_, path, line, func, code, _) in inspect.stack()[3:]: + if not code: + continue + stack_line = " ".join([ + "File", ", ".join([path, " ".join(["line", str(line)]), " ".join(["in", func]), + " ".join(["\n", code[0].strip()])])]) + stack_str.append(stack_line) + self.stack_info_struct = {self.api_name: stack_str} + + +class BackwardAPIInfo(APIInfo): + def __init__(self, name, 
grads): + super().__init__(name, + self.get_full_save_path(msCheckerConfig.dump_path, 'backward_real_data', contain_step=True), + is_save_data=msCheckerConfig.real_data) + self.grad_info_struct = {} + self.analyze_api_input(grads) + + def analyze_api_input(self, grads): + grads_info_list = self.analyze_element(grads) + self.grad_info_struct = {self.api_name: grads_info_list} diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump.py new file mode 100644 index 0000000000000000000000000000000000000000..b20378fd45d322e1e2e4a61031c8c1fa240ca5a0 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .api_info import ForwardAPIInfo, BackwardAPIInfo +from .info_dump import write_api_info_json, initialize_output_json +from ..common.utils import print_error_log, CompareException, print_info_log +from ..hook_module.register_hook import initialize_hook +from ..common.config import msCheckerConfig + + +def set_dump_switch(switch): + if switch not in ["ON", "OFF"]: + print_error_log("Please set switch with 'ON' or 'OFF'.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + if switch == "ON": + initialize_hook(pretest_hook) + initialize_output_json() + DumpUtil.set_dump_switch(switch) + + +def check_dataloader_status(): + if msCheckerConfig.enable_dataloader: + error_info = ("If you want to use this function, set enable_dataloader " + "in the accuracy_tools/api_accuracy_check/config.yaml " + "to False first") + raise CompareException(CompareException.INVALID_PARAM_ERROR, error_info) + + +def start(): + check_dataloader_status() + if not DumpUtil.get_dump_switch(): + DumpUtil.incr_iter_num_maybe_exit() + + +def stop(): + check_dataloader_status() + DumpUtil.set_dump_switch("OFF") + + +def step(): + check_dataloader_status() + DumpUtil.call_num += 1 + + +class DumpUtil(object): + dump_switch = None + call_num = 0 + + @staticmethod + def set_dump_switch(switch): + DumpUtil.dump_switch = switch + + @staticmethod + def get_dump_switch(): + return DumpUtil.dump_switch == "ON" + + @staticmethod + def incr_iter_num_maybe_exit(): + if DumpUtil.call_num in msCheckerConfig.target_iter: + set_dump_switch("ON") + elif DumpUtil.call_num > max(msCheckerConfig.target_iter): + raise Exception("Model pretest: exit after iteration {}".format(DumpUtil.call_num - 1)) + else: + set_dump_switch("OFF") + + +class DumpConst: + delimiter = '*' + forward = 'forward' + backward = 'backward' + + +def pretest_info_dump(name, out_feat, module, phase): + if not DumpUtil.get_dump_switch(): + return + if phase == DumpConst.forward: + api_info = ForwardAPIInfo(name, module.input_args, module.input_kwargs) + elif phase == DumpConst.backward: + api_info = BackwardAPIInfo(name, out_feat) + else: + msg = 
"Unexpected training phase {}.".format(phase) + print_error_log(msg) + raise NotImplementedError(msg) + print_info_log(f"tools is dumping api: {name}" + " " * 10, end='\r') + write_api_info_json(api_info) + + +def pretest_hook(name, phase): + def pretest_info_dump_hook(module, in_feat, out_feat): + pretest_info_dump(name, out_feat, module, phase) + if hasattr(module, "input_args"): + del module.input_args + if hasattr(module, "input_kwargs"): + del module.input_kwargs + return pretest_info_dump_hook diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump_scope.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump_scope.py new file mode 100644 index 0000000000000000000000000000000000000000..ac78fa8ccae9f5935d919b62ec72ed588b290a9f --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump_scope.py @@ -0,0 +1,22 @@ +# dump范围控制 +import torch +from torch.utils.data.dataloader import _BaseDataLoaderIter +from ..dump.dump import DumpUtil +from ..common.config import msCheckerConfig + + +def iter_tracer(original_next): + def func_wrapper(*args, **kwargs): + if msCheckerConfig.enable_dataloader: + DumpUtil.dump_switch = "OFF" + result = original_next(*args, **kwargs) + DumpUtil.incr_iter_num_maybe_exit() + DumpUtil.call_num += 1 + return result + else: + return original_next(*args, **kwargs) + return func_wrapper + +original_next_method = _BaseDataLoaderIter.__next__ + +_BaseDataLoaderIter.__next__ = iter_tracer(original_next_method) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/info_dump.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/info_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..31165077165c724f0e10ad0e279f5a59593cfd48 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/info_dump.py @@ -0,0 +1,72 @@ +import fcntl +import json +import os +import threading +import multiprocessing + +from ..dump.api_info import ForwardAPIInfo, BackwardAPIInfo +from ..common.utils import check_file_or_directory_path, create_directory +from ...common.file_check import check_path_before_create +from ...common.file_check import FileOpen, FileCheckConst, FileChecker, change_mode +from ..common.config import msCheckerConfig + + +lock = threading.Lock() +proc_lock = multiprocessing.Lock() + + +def write_api_info_json(api_info): + from ..dump.dump import DumpUtil + dump_path = msCheckerConfig.dump_path + dump_path = os.path.join(msCheckerConfig.dump_path, "step" + str((DumpUtil.call_num - 1) if msCheckerConfig.enable_dataloader else DumpUtil.call_num)) + check_path_before_create(dump_path) + create_directory(dump_path) + rank = api_info.rank + if isinstance(api_info, ForwardAPIInfo): + file_path = os.path.join(dump_path, f'forward_info_{rank}.json') + stack_file_path = os.path.join(dump_path, f'stack_info_{rank}.json') + write_json(file_path, api_info.api_info_struct) + write_json(stack_file_path, api_info.stack_info_struct, indent=4) + + elif isinstance(api_info, BackwardAPIInfo): + file_path = os.path.join(dump_path, f'backward_info_{rank}.json') + write_json(file_path, api_info.grad_info_struct) + else: + raise ValueError(f"Invalid api_info type {type(api_info)}") + + +def write_json(file_path, data, indent=None): + check_file_or_directory_path(os.path.dirname(file_path), True) + with proc_lock, lock, FileOpen(file_path, 'a+') as f: + fcntl.flock(f, fcntl.LOCK_EX) + try: + f.seek(0, os.SEEK_END) + current_position = 
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump_scope.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump_scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac78fa8ccae9f5935d919b62ec72ed588b290a9f
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump_scope.py
@@ -0,0 +1,22 @@
+# Dump scope control
+import torch
+from torch.utils.data.dataloader import _BaseDataLoaderIter
+from ..dump.dump import DumpUtil
+from ..common.config import msCheckerConfig
+
+
+def iter_tracer(original_next):
+    def func_wrapper(*args, **kwargs):
+        if msCheckerConfig.enable_dataloader:
+            DumpUtil.dump_switch = "OFF"
+            result = original_next(*args, **kwargs)
+            DumpUtil.incr_iter_num_maybe_exit()
+            DumpUtil.call_num += 1
+            return result
+        else:
+            return original_next(*args, **kwargs)
+    return func_wrapper
+
+original_next_method = _BaseDataLoaderIter.__next__
+
+_BaseDataLoaderIter.__next__ = iter_tracer(original_next_method)
\ No newline at end of file
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/info_dump.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/info_dump.py
new file mode 100644
index 0000000000000000000000000000000000000000..31165077165c724f0e10ad0e279f5a59593cfd48
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/info_dump.py
@@ -0,0 +1,72 @@
+import fcntl
+import json
+import os
+import threading
+import multiprocessing
+
+from ..dump.api_info import ForwardAPIInfo, BackwardAPIInfo
+from ..common.utils import check_file_or_directory_path, create_directory
+from ...common.file_check import check_path_before_create
+from ...common.file_check import FileOpen, FileCheckConst, FileChecker, change_mode
+from ..common.config import msCheckerConfig
+
+
+lock = threading.Lock()
+proc_lock = multiprocessing.Lock()
+
+
+def write_api_info_json(api_info):
+    from ..dump.dump import DumpUtil
+    dump_path = os.path.join(msCheckerConfig.dump_path,
+                             "step" + str((DumpUtil.call_num - 1) if msCheckerConfig.enable_dataloader else DumpUtil.call_num))
+    check_path_before_create(dump_path)
+    create_directory(dump_path)
+    rank = api_info.rank
+    if isinstance(api_info, ForwardAPIInfo):
+        file_path = os.path.join(dump_path, f'forward_info_{rank}.json')
+        stack_file_path = os.path.join(dump_path, f'stack_info_{rank}.json')
+        write_json(file_path, api_info.api_info_struct)
+        write_json(stack_file_path, api_info.stack_info_struct, indent=4)
+    elif isinstance(api_info, BackwardAPIInfo):
+        file_path = os.path.join(dump_path, f'backward_info_{rank}.json')
+        write_json(file_path, api_info.grad_info_struct)
+    else:
+        raise ValueError(f"Invalid api_info type {type(api_info)}")
+
+
+def write_json(file_path, data, indent=None):
+    check_file_or_directory_path(os.path.dirname(file_path), True)
+    with proc_lock, lock, FileOpen(file_path, 'a+') as f:
+        fcntl.flock(f, fcntl.LOCK_EX)
+        try:
+            f.seek(0, os.SEEK_END)
+            current_position = f.tell()
+            if current_position > 0:
+                # non-empty file: strip the trailing '}' so the new entry can be spliced in
+                f.seek(current_position - 1, os.SEEK_SET)
+                f.truncate()
+                if f.tell() > 3:
+                    f.seek(f.tell() - 1, os.SEEK_SET)
+                    f.truncate()
+                    f.write(',\n')
+                f.write(json.dumps(data, indent=indent)[1:-1] + '\n}')
+            else:
+                change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
+                f.write('{\n' + json.dumps(data, indent=indent)[1:] + '\n')
+        except Exception as e:
+            raise ValueError(f"Json save failed:{e}") from e
+        finally:
+            fcntl.flock(f, fcntl.LOCK_UN)
+
+
+def initialize_output_json():
+    dump_path = msCheckerConfig.dump_path
+    check_path_before_create(dump_path)
+    create_directory(dump_path)
+    dump_path_checker = FileChecker(dump_path, FileCheckConst.DIR)
+    dump_path = dump_path_checker.common_check()
+    files = ['forward_info.json', 'backward_info.json', 'stack_info.json']
+    for file in files:
+        file_path = os.path.join(dump_path, file)
+        if os.path.exists(file_path):
+            raise ValueError(f"file {file_path} already exists, please remove it first or use a new dump path")
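For reference, `write_json` above keeps each file a single valid JSON object across appends by truncating the closing brace before writing. A standalone re-enactment of the same splice strategy (file name and entries are illustrative, not the module's API):

    import json
    import os

    def append_entry(path, entry):
        with open(path, 'a+') as f:
            f.seek(0, os.SEEK_END)
            if f.tell() == 0:
                f.write('{\n' + json.dumps(entry)[1:] + '\n')   # first entry keeps its closing '}'
            else:
                f.seek(f.tell() - 2)                            # step back over '}\n'
                f.truncate()
                f.write(',\n' + json.dumps(entry)[1:-1] + '\n}')

    append_entry('demo.json', {"Torch.add.0.forward": {"dtype": "torch.float32"}})
    append_entry('demo.json', {"Torch.mul.0.forward": {"dtype": "torch.float32"}})
    # demo.json now holds one valid JSON object containing both keys.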
+""" + + +import functools + +import torch +import torch.nn as nn +import torch.utils.hooks as full_hooks + +module_count = {} +g_stop_hook = False + + +class HOOKModule(nn.Module): + + def __init__(self, hook) -> None: + super(HOOKModule, self).__init__() + self.has_overflow = False + self.input_args = tuple() + self.input_kwargs = dict() + self._enable_hook = True + prefix = "" + if hasattr(self, "prefix_op_name_"): + prefix = self.prefix_op_name_ + + if prefix not in module_count: + module_count[prefix] = 1 + prefix += '0' + else: + module_count[prefix] += 1 + prefix = prefix + str(module_count[prefix] - 1) + + self.register_forward_hook(hook(prefix, "forward")) + self.register_backward_hook(hook(prefix, "backward")) + + def __call__(self, *inputs, **kwargs): + changed = False + global g_stop_hook + if g_stop_hook: + self._enable_hook = False + else: + g_stop_hook = True + changed = True + result = self._call_func(*inputs, **kwargs) + if changed: + g_stop_hook = False + return result + + def _call_func(self, *inputs, **kwargs): + if self._enable_hook: + full_backward_hooks, non_full_backward_hooks = [], [] + if len(self._backward_hooks) > 0: + full_backward_hooks, non_full_backward_hooks = self._get_backward_hooks() + for hook in self._forward_pre_hooks.values(): + result = hook(self, inputs) + if result is not None: + if not isinstance(result, tuple): + result = (result,) + inputs = result + bw_hook = None + if len(full_backward_hooks) > 0: + bw_hook = full_hooks.BackwardHook(self, full_backward_hooks) + inputs = bw_hook.setup_input_hook(inputs) + self.input_args = inputs + self.input_kwargs = kwargs + if torch._C._get_tracing_state(): + result = self._slow_forward(*inputs, **kwargs) + else: + result = self.forward(*inputs, **kwargs) + for hook in self._forward_hooks.values(): + hook_result = hook(self, inputs, result) + if hook_result is not None: + result = hook_result + if bw_hook: + result = bw_hook.setup_output_hook(result) + if len(non_full_backward_hooks) > 0: + var = result + while not isinstance(var, torch.Tensor): + if isinstance(var, dict): + var = next((v for v in var.values() if isinstance(v, torch.Tensor))) + elif isinstance(var, (list, tuple)): + if var: + var = var[0] + else: + return result + else: + return result + grad_fn = var.grad_fn + if grad_fn is not None: + for hook in non_full_backward_hooks: + wrapper = functools.partial(hook, self) + functools.update_wrapper(wrapper, hook) + grad_fn.register_hook(wrapper) + self._maybe_warn_non_full_backward_hook(inputs, result, grad_fn) + return result + else: + forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.forward) + return forward_call(*inputs, **kwargs) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/register_hook.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/register_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..eee0d6c5d665470fbeaf49938cbbed1693c5f623 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/register_hook.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/register_hook.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/register_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..eee0d6c5d665470fbeaf49938cbbed1693c5f623
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/register_hook.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+import torch
+
+from . import wrap_torch, wrap_functional, wrap_tensor
+
+
+def initialize_hook(hook):
+    wrap_tensor.wrap_tensor_ops_and_bind(hook)
+    for attr_name in dir(wrap_tensor.HOOKTensor):
+        if attr_name.startswith("wrap_"):
+            setattr(torch.Tensor, attr_name[5:], getattr(wrap_tensor.HOOKTensor, attr_name))
+
+    wrap_torch.wrap_torch_ops_and_bind(hook)
+    for attr_name in dir(wrap_torch.HOOKTorchOP):
+        if attr_name.startswith("wrap_"):
+            setattr(torch, attr_name[5:], getattr(wrap_torch.HOOKTorchOP, attr_name))
+
+    wrap_functional.wrap_functional_ops_and_bind(hook)
+    for attr_name in dir(wrap_functional.HOOKFunctionalOP):
+        if attr_name.startswith("wrap_"):
+            setattr(torch.nn.functional, attr_name[5:], getattr(wrap_functional.HOOKFunctionalOP, attr_name))
+
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/support_wrap_ops.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..acd4cc0e6e658dd4278f6a67c4f0e8fc288efde6
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/support_wrap_ops.yaml
@@ -0,0 +1,999 @@
+# Copyright (c) 2023 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +# List of ops that register hooks + +functional: + - conv1d + - conv2d + - conv3d + - conv_transpose1d + - conv_transpose2d + - conv_transpose3d + - conv_tbc + - avg_pool1d + - avg_pool2d + - avg_pool3d + - fractional_max_pool2d_with_indices + - fractional_max_pool2d + - fractional_max_pool3d_with_indices + - fractional_max_pool3d + - max_pool1d_with_indices + - max_pool1d + - max_pool2d_with_indices + - max_pool2d + - max_pool3d_with_indices + - max_pool3d + - max_unpool1d + - max_unpool2d + - max_unpool3d + - lp_pool2d + - lp_pool1d + - adaptive_max_pool1d_with_indices + - adaptive_max_pool1d + - adaptive_max_pool2d_with_indices + - adaptive_max_pool2d + - adaptive_max_pool3d_with_indices + - adaptive_max_pool3d + - adaptive_avg_pool1d + - adaptive_avg_pool2d + - adaptive_avg_pool3d + - dropout + - alpha_dropout + - dropout2d + - dropout3d + - feature_alpha_dropout + - threshold + - threshold_ + - relu + - relu_ + - glu + - hardtanh + - hardtanh_ + - relu6 + - elu + - elu_ + - selu + - selu_ + - celu + - celu_ + - leaky_relu + - leaky_relu_ + - prelu + - rrelu + - rrelu_ + - logsigmoid + - gelu + - hardshrink + - tanhshrink + - softsign + - softplus + - softmin + - softmax + - gumbel_softmax + - log_softmax + - softshrink + - tanh + - sigmoid + - hardsigmoid + - linear + - bilinear + - silu + - hardswish + - embedding + - embedding_bag + - batch_norm + - instance_norm + - layer_norm + - group_norm + - local_response_norm + - ctc_loss + - nll_loss + - poisson_nll_loss + - gaussian_nll_loss + - kl_div + - cross_entropy + - binary_cross_entropy + - binary_cross_entropy_with_logits + - smooth_l1_loss + - l1_loss + - mse_loss + - margin_ranking_loss + - hinge_embedding_loss + - multilabel_margin_loss + - soft_margin_loss + - multilabel_soft_margin_loss + - cosine_embedding_loss + - multi_margin_loss + - pixel_shuffle + - pixel_unshuffle + - channel_shuffle + - upsample + - interpolate + - upsample_nearest + - upsample_bilinear + - grid_sample + - affine_grid + - pad + - pairwise_distance + - pdist + - cosine_similarity + - one_hot + - triplet_margin_loss + - triplet_margin_with_distance_loss + - normalize + - unfold + - fold + - multi_head_attention_forward + +tensor: + - __add__ + - __and__ + - __bool__ + - __div__ + - __eq__ + - __ge__ + - __gt__ + - __iadd__ + - __iand__ + - __idiv__ + - __ifloordiv__ + - __ilshift__ + - __imod__ + - __imul__ + - __ior__ + - __irshift__ + - __isub__ + - __ixor__ + - __lshift__ + - __matmul__ + - __mod__ + - __mul__ + - __nonzero__ + - __or__ + - __radd__ + - __rmul__ + - __rshift__ + - __sub__ + - __truediv__ + - __xor__ + - abs + - abs_ + - absolute + - absolute_ + - acos + - acos_ + - acosh + - acosh_ + - add + - add_ + - addbmm + - addbmm_ + - addcdiv + - addcdiv_ + - addcmul + - addcmul_ + - addmm + - addmm_ + - addmv + - addmv_ + - addr + - addr_ + - align_as + - align_to + - all + - allclose + - amax + - amin + - angle + - any + - arccos + - arccos_ + - arccosh + - arccosh_ + - arcsin + - arcsin_ + - arcsinh + - arcsinh_ + - arctan + - arctan_ + - arctanh + - arctanh_ + - argmax + - argmin + - argsort + - asin + - asin_ + - asinh + - asinh_ + - atan + - atan2 + - atan2_ + - atan_ + - atanh + - atanh_ + - baddbmm + - baddbmm_ + - bernoulli + - bernoulli_ + - bincount + - bitwise_and + - bitwise_and_ + - bitwise_not + - bitwise_not_ + - bitwise_or + - bitwise_or_ + - bitwise_xor + - bitwise_xor_ + - bmm + - broadcast_to + - cauchy_ + - ceil + - ceil_ + - cholesky + - chunk + - clamp + - cholesky_solve + - cholesky_inverse + - clamp_ + - clamp_max + 
- clamp_max_ + - clip + - clamp_min + - clamp_min_ + - clip_ + - copysign + - copysign_ + - cos + - cos_ + - cosh + - cosh_ + - count_nonzero + - cummax + - cummin + - cumprod + - cumprod_ + - cumsum + - cumsum_ + - deg2rad + - deg2rad_ + - det + - diag + - diag_embed + - diagflat + - diagonal + - diff + - dist + - digamma + - digamma_ + - div + - div_ + - divide + - divide_ + - dot + - eig + - eq + - eq_ + - erf + - equal + - erf_ + - erfc + - erfc_ + - erfinv + - erfinv_ + - exp + - exp2 + - exp2_ + - expm1 + - exp_ + - expm1_ + - exponential_ + - fill_ + - fix + - fill_diagonal_ + - fix_ + - flip + - fliplr + - flatten + - flipud + - float_power + - float_power_ + - floor + - floor_ + - floor_divide + - floor_divide_ + - fmax + - fmin + - fmod + - fmod_ + - frac + - frac_ + - gather + - gcd + - gcd_ + - ge + - ge_ + - geometric_ + - geqrf + - ger + - greater + - greater_ + - gt + - gt_ + - greater_equal + - greater_equal_ + - hardshrink + - heaviside + - heaviside_ + - histc + - hypot + - hypot_ + - igamma + - igamma_ + - igammac + - igammac_ + - index_add + - index_add_ + - inverse + - index_copy + - index_copy_ + - index_fill + - index_fill_ + - index_put + - index_put_ + - inner + - index_select + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - isreal + - kron + - kthvalue + - lcm + - lcm_ + - ldexp + - ldexp_ + - le + - le_ + - lerp + - lerp_ + - where + - less + - less_ + - less_equal + - less_equal_ + - lgamma + - lgamma_ + - log + - log10 + - log10_ + - log1p + - log1p_ + - log2 + - log2_ + - log_ + - log_normal_ + - log_softmax + - logcumsumexp + - logdet + - logaddexp + - logaddexp2 + - logical_and + - logical_and_ + - logical_not + - logit + - logical_not_ + - logical_or + - logical_or_ + - logical_xor + - logical_xor_ + - logit_ + - logsumexp + - lstsq + - lt + - lt_ + - lu_solve + - map2_ + - map_ + - masked_fill + - matmul + - masked_fill_ + - masked_scatter + - masked_scatter_ + - masked_select + - matrix_exp + - max + - maximum + - mean + - matrix_power + - median + - min + - minimum + - mm + - mode + - msort + - mul + - mul_ + - multinomial + - multiply + - multiply_ + - mv + - mvlgamma + - mvlgamma_ + - nansum + - narrow + - narrow_copy + - ne + - ne_ + - neg + - neg_ + - negative + - negative_ + - nonzero + - normal_ + - not_equal + - not_equal_ + - permute + - pinverse + - polygamma + - pow + - pow_ + - polygamma_ + - prelu + - prod + - put_ + - rad2deg + - rad2deg_ + - ravel + - real + - reciprocal + - reciprocal_ + - relu + - relu_ + - remainder + - repeat_interleave + - reshape + - remainder_ + - renorm + - renorm_ + - repeat + - reshape_as + - resize_ + - resize_as_ + - roll + - rot90 + - round + - round_ + - rsqrt + - rsqrt_ + - scatter + - scatter_ + - scatter_add + - scatter_add_ + - select + - sgn + - sgn_ + - sigmoid + - sigmoid_ + - sign + - sign_ + - signbit + - sin + - sin_ + - sinc + - sinc_ + - sinh + - sinh_ + - slogdet + - smm + - softmax + - solve + - sort + - split_with_sizes + - sqrt + - sqrt_ + - square + - square_ + - squeeze + - squeeze_ + - sspaddmm + - std + - sub + - sub_ + - sum + - sum_to_size + - svd + - symeig + - t + - t_ + - take + - tan + - tan_ + - tanh + - tanh_ + - tensor_split + - tile + - topk + - transpose + - transpose_ + - triangular_solve + - tril + - tril_ + - triu + - true_divide + - triu_ + - true_divide_ + - trunc + - trunc_ + - type_as + - unbind + - unflatten + - unfold + - unsafe_chunk + - unsqueeze + - unsafe_split + - unsafe_split_with_sizes + - var + - vdot + - unsqueeze_ + - view_as + - xlogy 
+ - xlogy_ + +torch: + - _adaptive_avg_pool2d + - _add_relu + - _add_relu_ + - _aminmax + - _batch_norm_impl_index + - _convolution + - abs + - abs_ + - absolute + - acos + - acos_ + - acosh + - acosh_ + - adaptive_avg_pool1d + - adaptive_max_pool1d + - add + - addbmm + - addcdiv + - addcmul + - addmm + - addmv + - addmv_ + - addr + - amax + - affine_grid_generator + - align_tensors + - all + - alpha_dropout + - amin + - alpha_dropout_ + - angle + - any + - arange + - arccos + - arccos_ + - arccosh + - arccosh_ + - arcsin + - arcsin_ + - arcsinh + - arcsinh_ + - arctan + - arctan_ + - arctanh + - arctanh_ + - argmax + - argmin + - argsort + - asin + - asin_ + - asinh + - asinh_ + - atan + - atan2 + - atan_ + - atanh + - atanh_ + - atleast_1d + - atleast_2d + - atleast_3d + - avg_pool1d + - baddbmm + - bartlett_window + - batch_norm_backward_elemt + - batch_norm_backward_reduce + - batch_norm_elemt + - batch_norm_gather_stats + - batch_norm_gather_stats_with_counts + - bernoulli + - batch_norm_stats + - batch_norm_update_stats + - bilinear + - bincount + - binomial + - binary_cross_entropy_with_logits + - bitwise_and + - bitwise_not + - bitwise_or + - bitwise_xor + - blackman_window + - block_diag + - bmm + - broadcast_tensors + - broadcast_to + - cartesian_prod + - cat + - cdist + - ceil + - ceil_ + - celu + - celu_ + - chain_matmul + - channel_shuffle + - cholesky + - cholesky_inverse + - cholesky_solve + - choose_qparams_optimized + - chunk + - clamp + - clamp_ + - clamp_max + - clamp_max_ + - clamp_min + - clamp_min_ + - clip + - clip_ + - clone + - column_stack + - combinations + - constant_pad_nd + - conv1d + - conv2d + - conv3d + - conv_tbc + - conv_transpose1d + - conv_transpose2d + - conv_transpose3d + - cos + - convolution + - copysign + - cos_ + - cosh + - cosh_ + - cosine_embedding_loss + - cosine_similarity + - count_nonzero + - cross + - ctc_loss + - cummax + - cummin + - cumprod + - cumsum + - deg2rad + - deg2rad_ + - det + - diag + - diag_embed + - diff + - diagflat + - diagonal + - digamma + - dist + - div + - divide + - dot + - dropout + - dropout_ + - dsmm + - dstack + - eig + - einsum + - embedding + - embedding_bag + - embedding_renorm_ + - eq + - equal + - erf + - erf_ + - erfc + - erfc_ + - erfinv + - exp + - exp2 + - exp2_ + - exp_ + - expm1 + - expm1_ + - eye + - feature_dropout + - feature_alpha_dropout + - feature_alpha_dropout_ + - feature_dropout_ + - fix + - fill_ + - fix_ + - flatten + - flip + - fliplr + - flipud + - float_power + - floor + - floor_ + - floor_divide + - fmax + - fmin + - fmod + - frac + - frac_ + - full + - frobenius_norm + - full_like + - gather + - gcd + - gcd_ + - ge + - geqrf + - ger + - greater + - greater_equal + - grid_sampler + - grid_sampler_2d + - group_norm + - grid_sampler_3d + - gru + - gru_cell + - gt + - hamming_window + - hann_window + - hardshrink + - heaviside + - hinge_embedding_loss + - histc + - hsmm + - hspmm + - hstack + - hypot + - igamma + - igammac + - index_add + - index_copy + - inner + - index_fill + - index_put + - index_put_ + - index_select + - instance_norm + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - istft + - kaiser_window + - kl_div + - kron + - kthvalue + - layer_norm + - lcm + - lcm_ + - ldexp + - ldexp_ + - le + - lerp + - less + - less_equal + - lgamma + - linspace + - log + - log10 + - log10_ + - log1p + - log1p_ + - log2 + - log2_ + - log_softmax + - log_ + - logaddexp + - logaddexp2 + - logcumsumexp + - logdet + - logical_and + - logical_not + - logical_or + - 
logical_xor + - logit + - logit_ + - logspace + - logsumexp + - lstm + - lstm_cell + - lstsq + - lt + - lu_solve + - masked_fill + - margin_ranking_loss + - masked_scatter + - masked_select + - matrix_exp + - matmul + - matrix_power + - matrix_rank + - max + - max_pool1d + - max_pool2d + - max_pool1d_with_indices + - max_pool3d + - maximum + - mean + - median + - min + - minimum + - mm + - mode + - moveaxis + - movedim + - msort + - mul + - multinomial + - multiply + - mv + - mvlgamma + - nan_to_num + - nan_to_num_ + - nanmedian + - nansum + - narrow + - native_batch_norm + - native_group_norm + - narrow_copy + - native_layer_norm + - native_norm + - ne + - neg + - negative + - neg_ + - negative_ + - nextafter + - nonzero + - norm_except_dim + - normal + - not_equal + - nuclear_norm + - pairwise_distance + - pdist + - pinverse + - pixel_shuffle + - pixel_unshuffle + - poisson + - poisson_nll_loss + - polar + - polygamma + - pow + - prelu + - prod + - rad2deg + - promote_types + - rad2deg_ + - range + - ravel + - real + - reciprocal + - relu + - reciprocal_ + - relu_ + - remainder + - renorm + - repeat_interleave + - reshape + - resize_as_ + - roll + - rot90 + - round + - round_ + - rrelu + - rrelu_ + - rsqrt + - row_stack + - rsqrt_ + - rsub + - saddmm + - scalar_tensor + - scatter + - select + - scatter_add + - searchsorted + - selu + - selu_ + - sgn + - sigmoid + - sigmoid_ + - sign + - signbit + - sin + - sin_ + - sinc + - sinc_ + - sinh + - sinh_ + - slogdet + - smm + - softmax + - solve + - sort + - sparse_coo_tensor + - square + - split_with_sizes + - spmm + - sqrt + - sqrt_ + - square_ + - squeeze + - sspaddmm + - stack + - std + - std_mean + - sub + - subtract + - sum + - svd + - swapaxes + - swapdims + - symeig + - t + - take + - tan + - tan_ + - tanh + - tanh_ + - tensordot + - tensor_split + - threshold + - threshold_ + - tile + - topk + - transpose + - trapz + - triangular_solve + - tril + - tril_indices + - triplet_margin_loss + - triu + - triu_indices + - true_divide + - trunc + - trunc_ + - unique_consecutive + - xlogy + - unbind + - unique_dim + - unsafe_chunk + - unsafe_split + - vander + - var + - vdot + - unsafe_split_with_sizes + - unsqueeze + - var_mean + - vstack + - where + - xlogy_ diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/utils.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6641807f929babeed3af30cf14b043d1e4f7913c --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/utils.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import os +import yaml + +from ...common.file_check import FileOpen + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + Ops = yaml.safe_load(f) + WrapFunctionalOps = Ops.get('functional') + WrapTensorOps = Ops.get('tensor') + WrapTorchOps = Ops.get('torch') \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_functional.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..056c1d047eb592f0006e3632eaa5597eba5630da --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_functional.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import torch + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard +from ..common.config import msCheckerConfig + +for f in dir(torch.nn.functional): + locals().update({f: getattr(torch.nn.functional, f)}) + + +def get_functional_ops(): + global WrapFunctionalOps + _all_functional_ops = dir(torch.nn.functional) + if msCheckerConfig.white_list: + return set(WrapFunctionalOps) & set(_all_functional_ops) & set(msCheckerConfig.white_list) + else: + return set(WrapFunctionalOps) & set(_all_functional_ops) + + +class HOOKFunctionalOP(object): + pass + + +class FunctionalOPTemplate(HOOKModule): + def __init__(self, op_name, hook, need_hook=True): + self.op_name_ = op_name + self.prefix_op_name_ = "Functional*" + str(op_name) + "*" + if need_hook: + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return eval(self.op_name_)(*args, **kwargs) + + +def wrap_functional_op(op_name, hook): + def functional_op_template(*args, **kwargs): + return FunctionalOPTemplate(op_name, hook)(*args, **kwargs) + + return functional_op_template + + +def wrap_functional_ops_and_bind(hook): + _functional_ops = get_functional_ops() + for op_name in _functional_ops: + setattr(HOOKFunctionalOP, "wrap_" + op_name, wrap_functional_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_tensor.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..f7791cdc9ac8e2084fc63d76e3819e137f4ea9d7 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_tensor.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_tensor.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7791cdc9ac8e2084fc63d76e3819e137f4ea9d7
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_tensor.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import torch
+
+from .hook_module import HOOKModule
+from .utils import WrapTensorOps
+from ..common.utils import torch_device_guard
+from ..common.config import msCheckerConfig
+from ...common.utils import parameter_adapter
+
+
+def get_tensor_ops():
+    global WrapTensorOps
+    _tensor_ops = dir(torch._C._TensorBase)
+    if msCheckerConfig.white_list:
+        return set(WrapTensorOps) & set(_tensor_ops) & set(msCheckerConfig.white_list)
+    else:
+        return set(WrapTensorOps) & set(_tensor_ops)
+
+
+class HOOKTensor(object):
+    pass
+
+
+class TensorOPTemplate(HOOKModule):
+
+    def __init__(self, op_name, hook, need_hook=True):
+        self.op_name_ = op_name
+        self.prefix_op_name_ = "Tensor*" + str(op_name) + "*"
+        if need_hook:
+            super().__init__(hook)
+
+    @torch_device_guard
+    @parameter_adapter
+    def forward(self, *args, **kwargs):
+        return getattr(torch._C._TensorBase, str(self.op_name_))(*args, **kwargs)
+
+
+def wrap_tensor_op(op_name, hook):
+
+    def tensor_op_template(*args, **kwargs):
+        return TensorOPTemplate(op_name, hook)(*args, **kwargs)
+
+    return tensor_op_template
+
+
+def wrap_tensor_ops_and_bind(hook):
+    _tensor_ops = get_tensor_ops()
+    for op_name in _tensor_ops:
+        setattr(HOOKTensor, "wrap_" + str(op_name), wrap_tensor_op(op_name, hook))
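The same pattern covers method-style calls; a sketch, again assuming `initialize_hook(demo_hook)` has run:

    x, y = torch.ones(2), torch.ones(2)
    z = x.add(y)   # torch.Tensor.add was rebound to tensor_op_template, so this call is
                   # observed as "Tensor*add*<n>", while forward() still invokes the
                   # untouched torch._C._TensorBase.add and returns identical results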
+""" + +import torch + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard +from ..common.config import msCheckerConfig + + +def get_torch_ops(): + global WrapTorchOps + _torch_ops = dir(torch._C._VariableFunctionsClass) + if msCheckerConfig.white_list: + return set(WrapTorchOps) & set(_torch_ops) & set(msCheckerConfig.white_list) + else: + return set(WrapTorchOps) & set(_torch_ops) + + +class HOOKTorchOP(object): + pass + + +class TorchOPTemplate(HOOKModule): + + def __init__(self, op_name, hook, need_hook=True): + self.op_name_ = op_name + self.prefix_op_name_ = "Torch*" + str(op_name) + "*" + if need_hook: + super().__init__(hook) + + def input_param_need_adapt(self): + special_op_list = ["broadcast_tensors", "block_diag"] + for item in special_op_list: + if item in self.op_name_: + return True + return False + + def einsum_adapt(self, *args): + if len(args) < 2: + raise ValueError('einsum(): must specify the equation string and at least one operand, ' + 'or at least one operand and its subscripts list') + equation = None + operands = None + if isinstance(args[0], torch.Tensor): + def parse_subscript(n: int) -> str: + if n == Ellipsis: + return '...' + if n >= 0 and n < 26: + return chr(ord('A') + n) + if n >= 26 and n < 52: + return chr(ord('a') + n - 26) + raise ValueError('einsum(): subscript in subscript list is not within the valid range [0, 52]') + equation = ','.join(''.join(parse_subscript(script) for script in arg) for arg in args[1::2]) + + if len(args) % 2 == 1: + equation += '->' + ''.join(parse_subscript(script) for script in args[-1]) + operands = args[:-1:2] + else: + operands = args[::2] + else: + equation = args[0] + operands = args[1:] + + if len(operands) == 1 and isinstance(operands[0], (list, tuple)): + _operands = operands[0] + return self.einsum_adapt(equation, *_operands) + return equation, operands + + @torch_device_guard + def forward(self, *args, **kwargs): + if self.input_param_need_adapt(): + return getattr(torch._C._VariableFunctionsClass, str(self.op_name_))(args, **kwargs) + else: + if self.op_name_ == 'einsum': + args = self.einsum_adapt(*args) + return getattr(torch._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) + + +def wrap_torch_op(op_name, hook): + + def torch_op_template(*args, **kwargs): + return TorchOPTemplate(op_name, hook)(*args, **kwargs) + + return torch_op_template + + +def wrap_torch_ops_and_bind(hook): + _torch_ops = get_torch_ops() + for op_name in _torch_ops: + setattr(HOOKTorchOP, "wrap_" + op_name, wrap_torch_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_details.png b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_details.png new file mode 100644 index 0000000000000000000000000000000000000000..ddc4fb348ee55197459c7303b0817853e201ace4 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_details.png differ diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_result.png b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_result.png new file mode 100644 index 0000000000000000000000000000000000000000..aa0b29d8d057ff806d5f5e82a35c5ce085dee1f3 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_result.png differ diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_details.png 
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_details.png b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_details.png
new file mode 100644
index 0000000000000000000000000000000000000000..ddc4fb348ee55197459c7303b0817853e201ace4
Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_details.png differ
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_result.png b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa0b29d8d057ff806d5f5e82a35c5ce085dee1f3
Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_result.png differ
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_details.png b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_details.png
new file mode 100644
index 0000000000000000000000000000000000000000..c3fd909a8d187fd6a725c7f3cc6798989d3fa0cf
Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_details.png differ
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_result.png b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..2b95897031441408f6a88185e3cda36e4fea8049
Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_result.png differ
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/.keep b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/data_generate.py
new file mode 100644
index 0000000000000000000000000000000000000000..723fb8ec6680ab65770007c5ab90b5f8428db2ac
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/data_generate.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import os
+import math
+import torch
+import numpy
+
+from ..common.utils import Const, check_file_or_directory_path, check_object_type, print_warn_log, \
+    print_error_log, get_full_data_path, CompareException
+
+TORCH_TYPE = ["torch.device", "torch.dtype"]
+TENSOR_DATA_LIST = ["torch.Tensor", "torch.nn.parameter.Parameter"]
+FLOAT_TYPE = ['torch.float32', 'torch.float', 'torch.float64', 'torch.double', 'torch.float16',
+              'torch.half', 'torch.bfloat16']
+NUMPY_TYPE = ["numpy.int8", "numpy.int16", "numpy.int32", "numpy.int64", "numpy.uint8", "numpy.uint16", "numpy.uint32",
+              "numpy.uint64", "numpy.float16", "numpy.float32", "numpy.float64", "numpy.float128", "numpy.complex64",
+              "numpy.complex128", "numpy.complex256", "numpy.bool_", "numpy.string_", "numpy.bytes_", "numpy.unicode_"]
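# For orientation, a sketch of the descriptor dict the generators below consume,
# as dumped into forward_info_*.json (field values here are illustrative):
#
#   arg_info = {"type": "torch.Tensor", "dtype": "torch.float32",
#               "shape": [8, 16], "Max": 3.2, "Min": -1.5, "requires_grad": True}
#
# gen_data(arg_info, need_grad=True, convert_type=None) then builds a random
# tensor within [Min, Max]; if a 'datapath' entry is present, the saved
# .pt/.npy file is loaded instead of generating random data.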
+ """ + check_object_type(info, dict) + data_type = info.get('type') + data_path = info.get('datapath', info.get('data_name')) + data_path = get_full_data_path(data_path, real_data_path) + if data_type in TENSOR_DATA_LIST: + if data_path: + data = gen_real_tensor(data_path, convert_type) + else: + data = gen_random_tensor(info, convert_type) + if info.get('requires_grad') and need_grad: + data.requires_grad_(True) + temp_data = data * 1 + data = temp_data.type_as(data) + data.retain_grad() + elif data_type.startswith("numpy"): + if data_type not in NUMPY_TYPE: + raise Exception("{} is not supported now".format(data_type)) + data = info.get("value") + try: + data = eval(data_type)(data) + except Exception as err: + print_error_log("Failed to convert the type to numpy: %s" % str(err)) + elif data_type == "torch.Size": + data = torch.Size(info.get("value")) + else: + data = info.get('value') + if info.get("type") == "slice": + data = slice(*data) + return data + + +def gen_real_tensor(data_path, convert_type): + """ + Function Description: + Based on API data path, generate input parameters real data + Parameter: + data_path: API data path + convert_type: convert ori_type to dist_type flag. + """ + data_path = os.path.realpath(data_path) + check_file_or_directory_path(data_path) + if not data_path.endswith('.pt') and not data_path.endswith('.npy'): + error_info = f"The file: {data_path} is not a pt or numpy file." + raise CompareException(CompareException.INVALID_FILE_ERROR, error_info) + if data_path.endswith('.pt'): + data = torch.load(data_path).cpu() + else: + data_np = numpy.load(data_path) + data = torch.from_numpy(data_np) + if convert_type: + ori_dtype = Const.CONVERT.get(convert_type)[0] + dist_dtype = Const.CONVERT.get(convert_type)[1] + if str(data.dtype) == ori_dtype: + data = data.type(eval(dist_dtype)) + return data + + +def gen_random_tensor(info, convert_type): + """ + Function Description: + Based on API MAX and MIN, generate input parameters random data + Parameter: + info: API data info + convert_type: convert ori_type to dist_type flag. + """ + check_object_type(info, dict) + low, high = info.get('Min'), info.get('Max') + low_origin, high_origin = info.get('Min_origin'), info.get('Max_origin') + low_info = [low, low_origin] + high_info = [high, high_origin] + data_dtype = info.get('dtype') + shape = tuple(info.get('shape')) + if not isinstance(low, (int, float)) or not isinstance(high, (int, float)): + error_info = f'Data info Min: {low} , Max: {high}, info type must be int or float.' + raise CompareException(CompareException.INVALID_PARAM_ERROR, error_info) + if data_dtype == "torch.bool": + data = gen_bool_tensor(low, high, shape) + else: + data = gen_common_tensor(low_info, high_info, shape, data_dtype, convert_type) + return data + + +def gen_common_tensor(low_info, high_info, shape, data_dtype, convert_type): + """ + Function Description: + Based on API basic information, generate int or float tensor + Parameter: + low_info: [low, low_origin], low is the minimum value in the tensor removed inf and nan, + low_origin is the original minimum value in the tensor + high_info: [high, high_origin], high is the maximum value in the tensor removed inf and nan, + high_origin is the original maximum value in the tensor + shape:The shape of Tensor + data_dtype: The data type of Tensor + convert_type: convert ori_type to dist_type flag. 
+ """ + if convert_type: + ori_dtype = Const.CONVERT.get(convert_type)[0] + if ori_dtype == data_dtype: + data_dtype = Const.CONVERT.get(convert_type)[1] + low, low_origin = low_info[0], low_info[1] + high, high_origin = high_info[0], high_info[1] + if data_dtype in FLOAT_TYPE: + if math.isnan(high): + tensor = torch._C._VariableFunctionsClass.full(shape, float('nan'), dtype=eval(data_dtype)) + return tensor + #high_origin为新版json中的属性,只有当high_origin不为None,且high为inf或-inf时,原tensor全为inf或-inf + if high_origin and high in [float('inf'), float('-inf')]: + tensor = torch._C._VariableFunctionsClass.full(shape, high, dtype=eval(data_dtype)) + tensor[-1] = low + return tensor + low_scale, high_scale = low, high + dtype_finfo = torch.finfo(eval(data_dtype)) + #适配老版json high和low为inf或-inf的情况,取dtype的最大值或最小值进行放缩 + if high == float('inf'): + high_scale = dtype_finfo.max + elif high == float('-inf'): + high_scale = dtype_finfo.min + if low == float('inf'): + low_scale = dtype_finfo.max + elif low == float('-inf'): + low_scale = dtype_finfo.min + + scale = high_scale - low_scale + rand01 = torch.rand(shape, dtype=eval(data_dtype)) + tensor = rand01 * scale + low_scale + elif 'int' in data_dtype or 'long' in data_dtype: + low, high = int(low), int(high) + tensor = torch.randint(low, high + 1, shape, dtype=eval(data_dtype)) + else: + print_error_log('Dtype is not supported: ' + data_dtype) + raise NotImplementedError() + if tensor.nelement() == 0: + return tensor + tmp_tensor = tensor.reshape(-1) + if high_origin and math.isnan(high_origin): + if tmp_tensor.numel() <= 2: + tmp_tensor[0] = float('nan') + tmp_tensor[-1] = high + else: + tmp_tensor[0] = low + tmp_tensor[1] = float('nan') + tmp_tensor[-1] = high + else: + tmp_tensor[0] = low + tmp_tensor[-1] = high + if high_origin in [float('inf'), float('-inf')]: + tmp_tensor[-1] = high_origin + if low_origin in [float('inf'), float('-inf')]: + tmp_tensor[0] = low_origin + data = tmp_tensor.reshape(shape) + return data + + +def gen_bool_tensor(low, high, shape): + """ + Function Description: + Based on API basic information, generate bool tensor + Parameter: + low: The minimum value in Tensor + high: The max value in Tensor + shape:The shape of Tensor + """ + low, high = int(low), int(high) + if low > high: + low, high = high, low + tensor = torch.randint(low, high + 1, shape) + data = torch.gt(tensor, 0) + return data + + +def gen_args(args_info, need_grad=True, convert_type=None, real_data_path=None): + """ + Function Description: + Based on API basic information, generate input parameters: args, for API forward running + Parameter: + api_info: API basic information. List + need_grad: set Tensor grad for backward + convert_type: convert ori_type to dist_type flag. + real_data_path: the root directory for storing real data. + """ + check_object_type(args_info, list) + args_result = [] + for arg in args_info: + if isinstance(arg, (list, tuple)): + data = gen_args(arg, need_grad, convert_type, real_data_path) + elif isinstance(arg, dict): + data = gen_data(arg, need_grad, convert_type, real_data_path) + elif arg is None: + data = None + else: + print_warn_log(f'Warning: {arg} is not supported') + raise NotImplementedError() + args_result.append(data) + return args_result + + +def gen_kwargs(api_info, convert_type=None, real_data_path=None): + """ + Function Description: + Based on API basic information, generate input parameters: kwargs, for API forward running + Parameter: + api_info: API basic information. 
Dict + convert_type: convert ori_type to dist_type flag. + real_data_path: the root directory for storing real data. + """ + check_object_type(api_info, dict) + kwargs_params = api_info.get("input_kwargs") + for key, value in kwargs_params.items(): + if isinstance(value, (list, tuple)): + kwargs_params[key] = gen_list_kwargs(value, convert_type, real_data_path) + elif value is None: + kwargs_params[key] = None + elif value.get('type') in TENSOR_DATA_LIST or value.get('type').startswith("numpy"): + kwargs_params[key] = gen_data(value, True, convert_type, real_data_path) + elif value.get('type') in TORCH_TYPE: + gen_torch_kwargs(kwargs_params, key, value) + else: + kwargs_params[key] = value.get('value') + return kwargs_params + + +def gen_torch_kwargs(kwargs_params, key, value): + if value.get('type') != "torch.device": + kwargs_params[key] = eval(value.get('value')) + + +def gen_list_kwargs(kwargs_item_value, convert_type, real_data_path=None): + """ + Function Description: + When kwargs value is list, generate the list of kwargs result + Parameter: + kwargs_item_value: kwargs value before to generate. List + convert_type: convert ori_type to dist_type flag. + """ + kwargs_item_result = [] + for item in kwargs_item_value: + if item.get('type') in TENSOR_DATA_LIST: + item_value = gen_data(item, False, convert_type, real_data_path) + elif item.get('type') == "torch.Size": + item_value = torch.Size(item.get('value')) + else: + item_value = item.get('value') + kwargs_item_result.append(item_value) + return kwargs_item_result + + +def gen_api_params(api_info, need_grad=True, convert_type=None, real_data_path=None): + """ + Function Description: + Based on API basic information, generate input parameters: args, kwargs, for API forward running + Parameter: + api_info: API basic information. Dict + need_grad: set grad for backward + convert_type: convert ori_type to dist_type flag. + """ + check_object_type(api_info, dict) + if convert_type and convert_type not in Const.CONVERT: + error_info = f"convert_type params not support {convert_type}." 
+        raise CompareException(CompareException.INVALID_PARAM_ERROR, error_info)
+    kwargs_params = gen_kwargs(api_info, convert_type, real_data_path)
+    if api_info.get("input_args"):
+        args_params = gen_args(api_info.get("input_args"), need_grad, convert_type, real_data_path)
+    else:
+        print_warn_log(f'Warning: No args in {api_info} ')
+        args_params = []
+    return args_params, kwargs_params
diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbb7c6f5ef2c75832c0b7a6e22e9d1ccf0624d8b
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py
@@ -0,0 +1,201 @@
+import subprocess
+import json
+import os
+import sys
+import argparse
+import time
+import signal
+import threading
+from collections import namedtuple
+from itertools import cycle
+from tqdm import tqdm
+from ...common import parse_json_info_forward_backward
+from ...common.file_check import FileCheckConst, FileChecker, check_file_suffix, check_link, FileOpen
+from ..compare.compare import Comparator
+from .run_ut import _run_ut_parser, get_validated_result_csv_path, get_validated_details_csv_path, preprocess_forward_content
+from ..common.utils import print_error_log, print_warn_log, print_info_log, create_directory
+from ...common.file_check import check_path_before_create
+
+
+def split_json_file(input_file, num_splits, filter_api):
+    forward_data, backward_data, real_data_path = parse_json_info_forward_backward(input_file)
+    if filter_api:
+        forward_data = preprocess_forward_content(forward_data)
+    for data_name in list(forward_data.keys()):
+        forward_data[f"{data_name}.forward"] = forward_data.pop(data_name)
+    for data_name in list(backward_data.keys()):
+        backward_data[f"{data_name}.backward"] = backward_data.pop(data_name)
+
+    with FileOpen(input_file, 'r') as file:
+        input_data = json.load(file)
+        input_data.pop("data")
+
+    items = list(forward_data.items())
+    total_items = len(items)
+    chunk_size = total_items // num_splits
+    split_files = []
+
+    for i in range(num_splits):
+        start = i * chunk_size
+        end = (i + 1) * chunk_size if i < num_splits - 1 else total_items
+
+        split_forward_data = dict(items[start:end])
+        temp_data = {
+            **input_data,
+            "data": {
+                **split_forward_data,
+                **backward_data
+            }
+        }
+        split_filename = f"temp_part{i}.json"
+        with FileOpen(split_filename, 'w') as split_file:
+            json.dump(temp_data, split_file)
+        split_files.append(split_filename)
+
+    return split_files, total_items
+
+
+def signal_handler(signum, frame):
+    print_warn_log(f'Signal handler called with signal {signum}')
+    raise KeyboardInterrupt()
+
+
+signal.signal(signal.SIGINT, signal_handler)
+signal.signal(signal.SIGTERM, signal_handler)
+
+
+ParallelUTConfig = namedtuple('ParallelUTConfig', ['api_files', 'out_path', 'num_splits',
+                                                   'save_error_data_flag', 'jit_compile_flag', 'device_id',
+                                                   'result_csv_path', 'total_items', 'real_data_path'])
+
+
+def run_parallel_ut(config):
+    processes = []
+    device_id_cycle = cycle(config.device_id)
+    if config.save_error_data_flag:
+        print_info_log("UT task error data will be saved")
+    print_info_log(f"Starting parallel UT with {config.num_splits} processes")
+    progress_bar = tqdm(total=config.total_items, desc="Total items", unit="items")
+
+    def create_cmd(api_info, dev_id):
+        dirname, filename = os.path.split(os.path.abspath(__file__))
+        run_ut_path = os.path.join(dirname, 
"run_ut.py") + cmd = [ + sys.executable, run_ut_path, + '-api_info', api_info, + *(['-o', config.out_path] if config.out_path else []), + '-d', str(dev_id), + *(['-j'] if config.jit_compile_flag else []), + *(['-save_error_data'] if config.save_error_data_flag else []), + '-csv_path', config.result_csv_path, + *(['-real_data_path', config.real_data_path] if config.real_data_path else []) + ] + return cmd + + def read_process_output(process): + try: + while True: + if process.poll() is not None: + break + output = process.stdout.readline() + if output == '': + break + if '[ERROR]' in output: + print(output, end='') + sys.stdout.flush() + except ValueError as e: + print_warn_log(f"An error occurred while reading subprocess output: {e}") + + def update_progress_bar(progress_bar, result_csv_path): + while any(process.poll() is None for process in processes): + try: + with open(result_csv_path, 'r') as result_file: + completed_items = len(result_file.readlines()) - 1 + progress_bar.update(completed_items - progress_bar.n) + except FileNotFoundError: + print_warn_log(f"Result CSV file not found: {result_csv_path}.") + except Exception as e: + print_error_log(f"An unexpected error occurred while reading result CSV: {e}") + time.sleep(1) + + for api_info in config.api_files: + cmd = create_cmd(api_info, next(device_id_cycle)) + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1) + processes.append(process) + threading.Thread(target=read_process_output, args=(process,), daemon=True).start() + + progress_bar_thread = threading.Thread(target=update_progress_bar, args=(progress_bar, config.result_csv_path)) + progress_bar_thread.start() + + def clean_up(): + progress_bar.close() + for process in processes: + try: + process.terminate() + process.wait(timeout=1) + except subprocess.TimeoutExpired: + process.kill() + for file in config.api_files: + check_link(file) + try: + os.remove(file) + except FileNotFoundError: + print_warn_log(f"File not found and could not be deleted: {file}") + + try: + for process in processes: + process.communicate(timeout=None) + except KeyboardInterrupt: + print_warn_log("Interrupted by user, terminating processes and cleaning up...") + except Exception as e: + print_error_log(f"An unexpected error occurred: {e}") + finally: + if progress_bar.n < config.total_items: + print_warn_log("The UT task has not been completed. 
The parameter '-csv_path' along with the path to the result CSV file will be utilized to resume the UT task.") + clean_up() + progress_bar_thread.join() + try: + comparator = Comparator(config.result_csv_path, config.result_csv_path, False) + comparator.print_pretest_result() + except FileNotFoundError as e: + print_error_log(f"Error: {e}") + except Exception as e: + print_error_log(f"An unexpected error occurred: {e}") + + +def prepare_config(args): + check_link(args.api_info_file) + api_info = os.path.realpath(args.api_info_file) + check_file_suffix(api_info, FileCheckConst.JSON_SUFFIX) + out_path = os.path.realpath(args.out_path) if args.out_path else "./" + check_path_before_create(out_path) + create_directory(out_path) + out_path_checker = FileChecker(out_path, FileCheckConst.DIR, ability=FileCheckConst.WRITE_ABLE) + out_path = out_path_checker.common_check() + split_files, total_items = split_json_file(api_info, args.num_splits, args.filter_api) + + result_csv_path = args.result_csv_path or os.path.join(out_path, f"accuracy_checking_result_{time.strftime('%Y%m%d%H%M%S')}.csv") + if not args.result_csv_path: + details_csv_path = os.path.join(out_path, f"accuracy_checking_details_{time.strftime('%Y%m%d%H%M%S')}.csv") + comparator = Comparator(result_csv_path, details_csv_path, False) + else: + result_csv_path = get_validated_result_csv_path(args.result_csv_path, 'result') + details_csv_path = get_validated_details_csv_path(result_csv_path) + print_info_log(f"UT task result will be saved in {result_csv_path}") + print_info_log(f"UT task details will be saved in {details_csv_path}") + return ParallelUTConfig(split_files, out_path, args.num_splits, args.save_error_data, + args.jit_compile, args.device_id, result_csv_path, + total_items, args.real_data_path) + + +def main(): + parser = argparse.ArgumentParser(description='Run UT in parallel') + _run_ut_parser(parser) + parser.add_argument('-n', '--num_splits', type=int, choices=range(1, 65), default=8, help='Number of splits for parallel processing. 
Range: 1-64') + args = parser.parse_args() + config = prepare_config(args) + run_parallel_ut(config) + + +if __name__ == '__main__': + main() diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py new file mode 100644 index 0000000000000000000000000000000000000000..bea882f75076655998227baa6e4d3b4708074f08 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_overflow_check.py @@ -0,0 +1,121 @@ +import argparse +import os +import sys +import torch_npu +import torch +from tqdm import tqdm +from ..run_ut.run_ut import exec_api, generate_device_params, get_api_info +from ..common.utils import print_info_log, print_warn_log, get_json_contents, print_error_log +from ...common.file_check import check_link + + +def check_tensor_overflow(x): + if isinstance(x, torch.Tensor) and x.numel() != 0 and x.dtype != torch.bool: + if len(x.shape) == 0: + tensor_max = x.cpu().detach().float().numpy().tolist() + tensor_min = tensor_max + else: + tensor_max = torch._C._VariableFunctionsClass.max(x).cpu().detach().float().numpy().tolist() + tensor_min = torch._C._VariableFunctionsClass.min(x).cpu().detach().float().numpy().tolist() + # inf + if tensor_max == float('inf') or tensor_min == float('-inf'): + return True + # nan + elif tensor_max != tensor_max or tensor_min != tensor_min: + return True + else: + return False + elif isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): + if x == float('inf') or x == float('-inf') or x != x: + return True + else: + return False + else: + return False + + +def check_data_overflow(x): + if isinstance(x, (tuple, list)) and x: + for i, item in enumerate(x): + if check_data_overflow(item): + return True + return False + else: + return check_tensor_overflow(x) + + +def run_overflow_check(forward_file): + print_info_log("start UT test") + forward_content = get_json_contents(forward_file) + for api_full_name, api_info_dict in tqdm(forward_content.items()): + try: + run_torch_api(api_full_name, api_info_dict) + except Exception as err: + api_name = api_full_name.split("_", 1)[1].rsplit("_", 2)[0] + if "not implemented for 'Half'" in str(err): + print_warn_log(f"API {api_name} not support half tensor in CPU, please add {api_name} to CONVERT_API " + f"'fp16_to_fp32' list in accuracy_tools/api_accuracy_check/common/utils.py file.") + elif "expected scalar type Long" in str(err): + print_warn_log(f"API {api_name} not support int32 tensor in CPU, please add {api_name} to CONVERT_API " + f"'int32_to_int64' list in accuracy_tools/api_accuracy_check/common/utils.py file.") + else: + print_error_log(f"Run {api_full_name} UT Error: %s" % str(err)) + + +def run_torch_api(api_full_name, api_info_dict): + torch.npu.clear_npu_overflow_flag() + api_type = api_full_name.split(".")[0] + api_name = api_full_name.split(".", 1)[1].rsplit(".", 2)[0] + args, kwargs, need_grad = get_api_info(api_info_dict, api_name, real_data_path='') + if not need_grad: + print_warn_log("%s function with out=... arguments don't support automatic differentiation, skip backward." 
+ % api_full_name) + npu_args, npu_kwargs = generate_device_params(args, kwargs, False, api_name) + if kwargs.get("device"): + del kwargs["device"] + out = exec_api(api_type, api_name, args, kwargs) + npu_out = exec_api(api_type, api_name, npu_args, npu_kwargs) + cpu_overflow = check_data_overflow(out) + npu_overflow = torch_npu.npu.utils.npu_check_overflow(npu_out) + if cpu_overflow == npu_overflow: + print_warn_log("The %s overflow is a normal overflow." % api_full_name) + else: + print_warn_log("The %s overflow is an abnormal overflow." % api_full_name) + return + + +def _run_overflow_check_parser(parser): + parser.add_argument("-api_info", "--api_info_file", dest="api_info_file", default="", + help=" The api param tool result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-j", "--jit_compile", dest="jit_compile", help=" whether to turn on jit compile", + default=False, required=False) + parser.add_argument("-d", "--device", dest="device_id", type=int, help=" set NPU device id to run ut", + default=0, required=False) + + +def _run_overflow_check(parser=None): + if not parser: + parser = argparse.ArgumentParser() + _run_overflow_check_parser(parser) + args = parser.parse_args(sys.argv[1:]) + _run_overflow_check_command(args) + + +def _run_overflow_check_command(args): + torch.npu.set_compile_mode(jit_compile=args.jit_compile) + npu_device = "npu:" + str(args.device_id) + check_link(args.api_info_file) + api_info = os.path.realpath(args.api_info_file) + try: + torch.npu.set_device(npu_device) + except Exception as error: + print_error_log(f"Set NPU device id failed. device id is: {args.device_id}") + raise NotImplementedError from error + run_overflow_check(api_info) + + +if __name__ == '__main__': + _run_overflow_check() + print_info_log("UT task completed.") diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut.py new file mode 100644 index 0000000000000000000000000000000000000000..3186913e9482de5427f8b41685255e0e4cc1f140 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut.py @@ -0,0 +1,459 @@ +import argparse +import os +import csv +import re +import sys +import time +import gc +from collections import namedtuple +try: + import torch_npu +except ImportError: + is_gpu = True + current_device = "cuda" +else: + is_gpu = False + current_device = "npu" +import torch +from tqdm import tqdm + +from atat.pytorch.api_accuracy_checker.run_ut.data_generate import gen_api_params, gen_args +from atat.pytorch.api_accuracy_checker.common.utils import print_info_log, print_warn_log, get_json_contents, \ + api_info_preprocess, print_error_log, initialize_save_path, Const, create_directory +from atat.pytorch.api_accuracy_checker.compare.compare import Comparator +from atat.pytorch.api_accuracy_checker.hook_module.wrap_tensor import TensorOPTemplate +from atat.pytorch.api_accuracy_checker.hook_module.wrap_functional import FunctionalOPTemplate +from atat.pytorch.api_accuracy_checker.hook_module.wrap_torch import TorchOPTemplate +from atat.pytorch.api_accuracy_checker.common.config import msCheckerConfig +from atat.pytorch.api_accuracy_checker.dump.api_info import APIInfo +from atat.pytorch.common.parse_json import parse_json_info_forward_backward +from atat.pytorch.common.file_check import check_path_before_create +from atat.pytorch.common.file_check import FileOpen, FileCheckConst, FileChecker, \ + change_mode, 
check_file_suffix, check_link
+
+current_time = time.strftime("%Y%m%d%H%M%S")
+UT_ERROR_DATA_DIR = 'ut_error_data' + current_time
+RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + ".csv"
+DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + ".csv"
+RunUTConfig = namedtuple('RunUTConfig', ['forward_content', 'backward_content', 'result_csv_path', 'details_csv_path',
+                                         'save_error_data', 'is_continue_run_ut', 'real_data_path'])
+not_backward_list = ['repeat_interleave']
+not_detach_set = {'resize_', 'resize_as_', 'set_', 'transpose_', 't_', 'squeeze_', 'unsqueeze_'}
+
+tqdm_params = {
+    'smoothing': 0,  # Smoothing factor for the ETA estimate, in the range 0 to 1
+    'desc': 'Processing',  # Text displayed in front of the progress bar
+    'leave': True,  # Keep the progress bar on screen after iteration finishes
+    'ncols': 75,  # Fixed width of the progress bar
+    'mininterval': 0.1,  # Minimum interval, in seconds, between bar updates
+    'maxinterval': 1.0,  # Maximum interval, in seconds, between bar updates
+    'miniters': 1,  # Minimum number of iterations between bar updates
+    'ascii': None,  # Automatically use ASCII or Unicode characters depending on the environment
+    'unit': 'it',  # Unit of iteration
+    'unit_scale': True,  # Scale the counter automatically according to the unit
+    'dynamic_ncols': True,  # Dynamically adapt the bar width to the console
+    'bar_format': '{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]'  # Custom bar output format
+}
+
+
+def exec_api(api_type, api_name, args, kwargs):
+    if api_type == "Functional":
+        functional_api = FunctionalOPTemplate(api_name, str, False)
+        out = functional_api.forward(*args, **kwargs)
+    if api_type == "Tensor":
+        tensor_api = TensorOPTemplate(api_name, str, False)
+        out = tensor_api.forward(*args, **kwargs)
+    if api_type == "Torch":
+        torch_api = TorchOPTemplate(api_name, str, False)
+        out = torch_api.forward(*args, **kwargs)
+    return out
+
+
+def deal_detach(arg, to_detach=True):
+    return arg.detach() if to_detach else arg
+
+
+def deal_dtype(arg, raise_dtype=None):
+    if raise_dtype is None or arg.dtype not in Const.RAISE_PRECISION or raise_dtype == arg.dtype:
+        return arg
+    return arg.type(raise_dtype)
+
+
+def generate_device_params(input_args, input_kwargs, need_backward, api_name):
+    def recursive_arg_to_device(arg_in, to_detach):
+        if isinstance(arg_in, (list, tuple)):
+            return type(arg_in)(recursive_arg_to_device(arg, to_detach) for arg in arg_in)
+        elif isinstance(arg_in, torch.Tensor):
+            if need_backward and arg_in.requires_grad:
+                arg_in = deal_detach(arg_in.clone(), to_detach).to(current_device).requires_grad_()
+                temp_arg_in = arg_in * 1
+                arg_in = temp_arg_in.type_as(arg_in)
+                arg_in.retain_grad()
+                return arg_in
+            else:
+                return deal_detach(arg_in.clone(), to_detach).to(current_device)
+        else:
+            return arg_in
+
+    is_detach = api_name not in not_detach_set
+    device_args = recursive_arg_to_device(input_args, is_detach)
+    device_kwargs = \
+        {key: recursive_arg_to_device(value, key != "out" and is_detach) for key, value in input_kwargs.items()}
+    return device_args, device_kwargs
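+
+
+# Hypothetical usage sketch (tensor and API name are illustrative, not taken from the
+# tool's dumps): for an input that requires grad, generate_device_params() returns a
+# detached device copy with requires_grad re-enabled and retain_grad() set, so its
+# .grad can be read back after the device-side backward pass:
+#   x = torch.randn(2, 3, requires_grad=True)
+#   device_args, device_kwargs = generate_device_params([x], {}, True, 'mul')
+def generate_cpu_params(input_args, input_kwargs, need_backward, api_name):
+    def recursive_arg_to_cpu(arg_in, to_detach, raise_dtype=None):
+        if isinstance(arg_in, (list, tuple)):
+            return type(arg_in)(recursive_arg_to_cpu(arg, to_detach, raise_dtype=raise_dtype) for arg in arg_in)
+        elif isinstance(arg_in, torch.Tensor):
+            if need_backward and arg_in.requires_grad:
+                arg_in = deal_detach(deal_dtype(arg_in.clone(), raise_dtype), to_detach).requires_grad_()
+                temp_arg_in = arg_in * 1
+                arg_in = temp_arg_in.type_as(arg_in)
+                arg_in.retain_grad()
+                return arg_in
+            else:
+                return deal_detach(deal_dtype(arg_in.clone(), raise_dtype=raise_dtype), to_detach)
+        else:
+            return arg_in
+
+    def is_tensor_with_raise_precision(arg_in, check_kwargs=False):
+        if arg_in.dtype in Const.RAISE_PRECISION:
+            return True
+        if 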
check_kwargs and arg_in.dtype in [torch.half, torch.bfloat16]: + return True + return False + + def recursive_find_dtypes(arg_in, kwargs=None, check_kwargs=False): + if isinstance(arg_in, (list, tuple)): + return set().union(*tuple(recursive_find_dtypes(arg, kwargs, check_kwargs=check_kwargs) for arg in arg_in)) + elif isinstance(arg_in, torch.Tensor) and is_tensor_with_raise_precision(arg_in, check_kwargs): + return set([arg_in.dtype]) + elif isinstance(arg_in, dict) and check_kwargs: + return set().union(*tuple(recursive_find_dtypes(v, kwargs, check_kwargs=True) for v in arg_in.values())) + return set() + + raise_dtype = None + need_raise_dtypes = recursive_find_dtypes(input_args) + need_raise_dtypes.update(recursive_find_dtypes(input_kwargs, check_kwargs=True)) + if len(need_raise_dtypes) == 1: + raise_dtype = Const.RAISE_PRECISION.get(need_raise_dtypes.pop(), torch.float32) + elif len(need_raise_dtypes) >= 2: + raise_dtype = torch.float32 + + is_detach = api_name not in not_detach_set + cpu_args = recursive_arg_to_cpu(input_args, is_detach, raise_dtype=raise_dtype) + cpu_kwargs = {key: recursive_arg_to_cpu(value, key != "out" and is_detach, raise_dtype=raise_dtype) for key, value in input_kwargs.items()} + return cpu_args, cpu_kwargs + + +def run_ut(config): + print_info_log("start UT test") + print_info_log(f"UT task result will be saved in {config.result_csv_path}") + print_info_log(f"UT task details will be saved in {config.details_csv_path}") + if config.save_error_data: + error_data_path = os.path.abspath(os.path.join(msCheckerConfig.error_data_path, UT_ERROR_DATA_DIR)) + print_info_log(f"UT task error_datas will be saved in {error_data_path}") + compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut) + with FileOpen(config.result_csv_path, 'r') as file: + csv_reader = csv.reader(file) + next(csv_reader) + api_name_set = {row[0] for row in csv_reader} + for i, (api_full_name, api_info_dict) in enumerate(tqdm(config.forward_content.items(), **tqdm_params)): + if api_full_name in api_name_set: + continue + if is_unsupported_api(api_full_name): # TODO run_ut does not support to the npu fusion api and distributed api + continue + try: + if msCheckerConfig.white_list: + [_, api_name, _] = api_full_name.split(Const.SEP) + if api_name not in set(msCheckerConfig.white_list): + continue + data_info = run_torch_api(api_full_name, config.real_data_path, config.backward_content, api_info_dict) + is_fwd_success, is_bwd_success = compare.compare_output(api_full_name, + data_info.bench_out, + data_info.device_out, + data_info.bench_grad_out, + data_info.device_grad_out) + if config.save_error_data: + do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) + except Exception as err: + [_, api_name, _] = api_full_name.split(Const.SEP) + if "expected scalar type Long" in str(err): + print_warn_log(f"API {api_name} not support int32 tensor in CPU, please add {api_name} to CONVERT_API " + f"'int32_to_int64' list in accuracy_tools/api_accuracy_check/common/utils.py file.") + else: + print_error_log(f"Run {api_full_name} UT Error: %s" % str(err)) + compare.write_summary_csv((api_full_name, "SKIP", "SKIP", str(err))) + finally: + if is_gpu: + torch.cuda.empty_cache() + else: + torch.npu.empty_cache() + gc.collect() + change_mode(compare.save_path, FileCheckConst.DATA_FILE_AUTHORITY) + change_mode(compare.detail_save_path, FileCheckConst.DATA_FILE_AUTHORITY) + compare.print_pretest_result() + + +def is_unsupported_api(api_name): + split_name = 
api_name.split(Const.SEP)[0] + flag = split_name in [Const.NPU, Const.DISTRIBUTED] + if flag: + print_info_log(f"{split_name} api is not supported for run ut. SKIP.") + return flag + + +def do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success): + if not is_fwd_success or not is_bwd_success: + for element in data_info.in_fwd_data_list: + UtAPIInfo(api_full_name + '.forward.input', element) + UtAPIInfo(api_full_name + '.forward.output.bench', data_info.bench_out) + UtAPIInfo(api_full_name + '.forward.output.device', data_info.device_out) + UtAPIInfo(api_full_name + '.backward.input', data_info.grad_in) + UtAPIInfo(api_full_name + '.backward.output.bench', data_info.bench_grad_out) + UtAPIInfo(api_full_name + '.backward.output.device', data_info.device_grad_out) + + +def run_torch_api(api_full_name, real_data_path, backward_content, api_info_dict): + in_fwd_data_list = [] + [api_type, api_name, _] = api_full_name.split(Const.SEP) + args, kwargs, need_grad = get_api_info(api_info_dict, api_name, real_data_path) + in_fwd_data_list.append(args) + in_fwd_data_list.append(kwargs) + need_backward = api_full_name in backward_content + if not need_grad: + print_warn_log("%s function with out=... arguments don't support automatic differentiation, skip backward." + % api_full_name) + if api_name in not_backward_list: + need_grad = False + print_warn_log( + "%s function backward result is None, skip backward." % api_full_name) + need_backward = need_backward and need_grad + if kwargs.get("device"): + del kwargs["device"] + cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, need_backward, api_name) + device_args, device_kwargs = generate_device_params(args, kwargs, need_backward, api_name) + bench_grad_out, device_grad_out = None, None + out = exec_api(api_type, api_name, cpu_args, cpu_kwargs) + device_out = exec_api(api_type, api_name, device_args, device_kwargs) + current_path = os.path.dirname(os.path.realpath(__file__)) + ut_setting_path = os.path.join(current_path, "torch_ut_setting.json") + api_setting_dict = get_json_contents(ut_setting_path) + grad_input_index = api_setting_dict.get(api_name) + grad_index = None + grad, bench_grad = None, None + if grad_input_index is not None: + grad_index = grad_input_index.get('grad_index') + + if need_backward: + backward_args = backward_content[api_full_name].get("grad_output") + grad = gen_args(backward_args, real_data_path=real_data_path)[0] + bench_grad, _ = generate_cpu_params(grad, {}, False, api_name) + bench_grad_out = run_backward(cpu_args, bench_grad, grad_index, out) + device_grad = grad.clone().detach().to(current_device) + device_grad_out = run_backward(device_args, device_grad, grad_index, device_out) + + if grad_index is not None: + return UtDataInfo(bench_grad_out, device_grad_out, device_out[grad_index], out[grad_index], bench_grad, + in_fwd_data_list) + return UtDataInfo(bench_grad_out, device_grad_out, device_out, out, bench_grad, in_fwd_data_list) + + +def get_api_info(api_info_dict, api_name, real_data_path): + convert_type, api_info_dict = api_info_preprocess(api_name, api_info_dict) + need_grad = True + if api_info_dict.get("input_kwargs") and "out" in api_info_dict.get("input_kwargs"): + need_grad = False + args, kwargs = gen_api_params(api_info_dict, need_grad, convert_type, real_data_path) + return args, kwargs, need_grad + + +def run_backward(args, grad, grad_index, out): + if grad_index is not None: + out[grad_index].backward(grad) + elif isinstance(out, (list, tuple)): + raise 
NotImplementedError("Multiple backward is not supported.") + else: + out.backward(grad) + args_grad = [] + for arg in args: + if isinstance(arg, torch.Tensor): + args_grad.append(arg.grad) + grad_out = args_grad + + return grad_out + + +def initialize_save_error_data(): + error_data_path = msCheckerConfig.error_data_path + check_path_before_create(error_data_path) + create_directory(error_data_path) + error_data_path_checker = FileChecker(msCheckerConfig.error_data_path, FileCheckConst.DIR, + ability=FileCheckConst.WRITE_ABLE) + error_data_path = error_data_path_checker.common_check() + initialize_save_path(error_data_path, UT_ERROR_DATA_DIR) + + +def get_validated_result_csv_path(result_csv_path, mode): + if mode not in ['result', 'detail']: + raise ValueError("The csv mode must be result or detail") + result_csv_path_checker = FileChecker(result_csv_path, FileCheckConst.FILE, ability=FileCheckConst.READ_WRITE_ABLE, + file_type=FileCheckConst.CSV_SUFFIX) + validated_result_csv_path = result_csv_path_checker.common_check() + if mode == 'result': + result_csv_name = os.path.basename(validated_result_csv_path) + pattern = r"^accuracy_checking_result_\d{14}\.csv$" + if not re.match(pattern, result_csv_name): + raise ValueError("When continue run ut, please do not modify the result csv name.") + return validated_result_csv_path + + +def get_validated_details_csv_path(validated_result_csv_path): + result_csv_name = os.path.basename(validated_result_csv_path) + details_csv_name = result_csv_name.replace('result', 'details') + details_csv_path = os.path.join(os.path.dirname(validated_result_csv_path), details_csv_name) + details_csv_path_checker = FileChecker(details_csv_path, FileCheckConst.FILE, + ability=FileCheckConst.READ_WRITE_ABLE, file_type=FileCheckConst.CSV_SUFFIX) + validated_details_csv_path = details_csv_path_checker.common_check() + return validated_details_csv_path + + +def _run_ut_parser(parser): + parser.add_argument("-api_info", "--api_info_file", dest="api_info_file", default="", type=str, + help=" The api param tool result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str, + help=" The ut task result out path.", + required=False) + parser.add_argument('-save_error_data', dest="save_error_data", action="store_true", + help=" Save compare failed api output.", required=False) + parser.add_argument("-j", "--jit_compile", dest="jit_compile", action="store_true", + help=" whether to turn on jit compile", required=False) + + class UniqueDeviceAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + unique_values = set(values) + if len(values) != len(unique_values): + parser.error("device id must be unique") + for device_id in values: + if not 0 <= device_id: + parser.error("device id must be greater than or equal to 0") + setattr(namespace, self.dest, values) + + parser.add_argument("-d", "--device", dest="device_id", nargs='+', type=int, + help=" set device id to run ut, must be unique and in range 0-7", + default=[0], required=False, action=UniqueDeviceAction) + parser.add_argument("-csv_path", "--result_csv_path", dest="result_csv_path", default="", type=str, + help=" The path of accuracy_checking_result_{timestamp}.csv, " + "when run ut is interrupted, enter the file path to continue run ut.", + required=False) + parser.add_argument("-real_data_path", dest="real_data_path", nargs="?", const="", default="", type=str, + help=" In real data 
mode, the root directory for storing real data "
+                             "must be configured.",
+                        required=False)
+    parser.add_argument("-f", "--filter_api", dest="filter_api", action="store_true",
+                        help=" Whether to filter the api in the api_info_file.", required=False)
+
+
+def preprocess_forward_content(forward_content):
+    processed_content = {}
+    base_keys_variants = {}
+    for key, value in forward_content.items():
+        base_key = key.rsplit(Const.SEP, 1)[0]
+        new_args = value['args']
+        new_kwargs = value['kwargs']
+        filtered_new_args = [{k: v for k, v in arg.items() if k not in ['Max', 'Min']}
+                             for arg in new_args if isinstance(arg, dict)]
+        if base_key in base_keys_variants:
+            is_duplicate = False
+            for variant in base_keys_variants.get(base_key, []):
+                try:
+                    existing_args = processed_content[variant].get('args', [])
+                    existing_kwargs = processed_content[variant].get('kwargs', {})
+                    filtered_existing_args = [{k: v for k, v in arg.items() if k not in ['Max', 'Min']}
+                                              for arg in existing_args if isinstance(arg, dict)]
+                except KeyError as e:
+                    print_error_log(f"KeyError: {e} when processing {key}")
+                    continue  # this variant has no usable entry, so it cannot be compared
+                if filtered_existing_args == filtered_new_args and existing_kwargs == new_kwargs:
+                    is_duplicate = True
+                    break
+            if not is_duplicate:
+                processed_content[key] = value
+                base_keys_variants[base_key].append(key)
+        else:
+            processed_content[key] = value
+            base_keys_variants[base_key] = [key]
+    return processed_content
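+
+
+# A minimal sketch of the deduplication above (keys and statistics are illustrative):
+# two dumped calls of the same API whose args differ only in their 'Max'/'Min'
+# statistics collapse into one entry, because those keys are stripped before comparison:
+#   {"Tensor*add*0": {"args": [{"type": "torch.Tensor", "Max": 1.0, "Min": 0.0}], "kwargs": {}},
+#    "Tensor*add*1": {"args": [{"type": "torch.Tensor", "Max": 2.0, "Min": -1.0}], "kwargs": {}}}
+# is reduced to just "Tensor*add*0".
+def _run_ut(parser=None):
+    if not parser:
+        parser = argparse.ArgumentParser()
+    _run_ut_parser(parser)
+    args = parser.parse_args(sys.argv[1:])
+    run_ut_command(args)
+
+
+def run_ut_command(args):
+    if not is_gpu:
+        torch.npu.set_compile_mode(jit_compile=args.jit_compile)
+    used_device = current_device + ":" + str(args.device_id[0])
+    try:
+        if is_gpu:
+            torch.cuda.set_device(used_device)
+        else:
+            torch.npu.set_device(used_device)
+    except Exception as error:
+        print_error_log(f"Set device id failed. 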
device id is: {args.device_id}") + raise NotImplementedError from error + check_link(args.api_info_file) + api_info = os.path.realpath(args.api_info_file) + check_file_suffix(api_info, FileCheckConst.JSON_SUFFIX) + out_path = os.path.realpath(args.out_path) if args.out_path else "./" + check_path_before_create(out_path) + create_directory(out_path) + out_path_checker = FileChecker(out_path, FileCheckConst.DIR, ability=FileCheckConst.WRITE_ABLE) + out_path = out_path_checker.common_check() + save_error_data = args.save_error_data + forward_content, backward_content, real_data_path = parse_json_info_forward_backward(api_info) + if args.filter_api: + forward_content = preprocess_forward_content(forward_content) + + result_csv_path = os.path.join(out_path, RESULT_FILE_NAME) + details_csv_path = os.path.join(out_path, DETAILS_FILE_NAME) + if args.result_csv_path: + result_csv_path = get_validated_result_csv_path(args.result_csv_path, 'result') + details_csv_path = get_validated_details_csv_path(result_csv_path) + if save_error_data: + if args.result_csv_path: + time_info = result_csv_path.split('.')[0].split('_')[-1] + global UT_ERROR_DATA_DIR + UT_ERROR_DATA_DIR = 'ut_error_data' + time_info + initialize_save_error_data() + run_ut_config = RunUTConfig(forward_content, backward_content, result_csv_path, details_csv_path, save_error_data, + args.result_csv_path, real_data_path) + run_ut(run_ut_config) + + +class UtDataInfo: + def __init__(self, bench_grad_out, device_grad_out, device_out, bench_out, grad_in, in_fwd_data_list): + self.bench_grad_out = bench_grad_out + self.device_grad_out = device_grad_out + self.device_out = device_out + self.bench_out = bench_out + self.grad_in = grad_in + self.in_fwd_data_list = in_fwd_data_list + + +class UtAPIInfo(APIInfo): + def __init__(self, api_name, element): + super().__init__(api_name, + save_path=self.get_full_save_path(msCheckerConfig.error_data_path, UT_ERROR_DATA_DIR), + is_save_data=True) + self.analyze_element(element) + + +if __name__ == '__main__': + _run_ut() + print_info_log("UT task completed.") diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json new file mode 100644 index 0000000000000000000000000000000000000000..d8df6098b1bab44d197b8e1a2b3e652456224e3f --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json @@ -0,0 +1,5 @@ +{ + "topk": { + "grad_index": 0 + } +} \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/resources/forward.json b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/resources/forward.json new file mode 100644 index 0000000000000000000000000000000000000000..f938f352460a87222bdb5346873904cb420996cc --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/resources/forward.json @@ -0,0 +1,3 @@ +{ + "Functional*silu*0": {"args": [{"type": "torch.Tensor", "dtype": "torch.float32", "shape": [2, 2560, 24, 24], "Max": 5.7421875, "Max_origin": 5.7421875, "Min": -5.125, "Min_origin": -5.125, "requires_grad": true}], "kwargs" :{"inplace": {"type": "bool", "value": false}}} +} \ No newline at end of file diff 
--git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/run_test.sh b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/run_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..fdd00c6021c9827a68e005616b1b4d916e63e995 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/run_test.sh @@ -0,0 +1,31 @@ +#!/bin/bash +CUR_DIR=$(dirname $(readlink -f $0)) +TOP_DIR=${CUR_DIR}/.. +TEST_DIR=${TOP_DIR}/"test" +SRC_DIR=${TOP_DIR}/../ + +clean() { + cd ${TEST_DIR} + + if [ -e ${TEST_DIR}/"report" ]; then + rm -r ${TEST_DIR}/"report" + echo "remove last ut_report successfully." + fi + +} + +run_ut() { + export PYTHONPATH=${SRC_DIR}:${PYTHONPATH} + python3 run_ut.py +} + +main() { + clean + if [ "$1"x == "clean"x ]; then + return 0 + fi + + cd ${TEST_DIR} && run_ut +} + +main $@ diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/run_ut.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/run_ut.py new file mode 100644 index 0000000000000000000000000000000000000000..c73949697941d84782c4983aa484c06b1a7cbcc2 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/run_ut.py @@ -0,0 +1,41 @@ +import os +import shutil +import subprocess +import sys + +def run_ut(): + cur_dir = os.path.realpath(os.path.dirname(__file__)) + top_dir = os.path.realpath(os.path.dirname(cur_dir)) + ut_path = os.path.join(cur_dir, "ut/") + src_dir = top_dir + report_dir = os.path.join(cur_dir, "report") + + if os.path.exists(report_dir): + shutil.rmtree(report_dir) + + os.makedirs(report_dir) + + cmd = ["python3", "-m", "pytest", ut_path, "--junitxml=" + report_dir + "/final.xml", + "--cov=" + src_dir, "--cov-branch", "--cov-report=xml:" + report_dir + "/coverage.xml"] + + result_ut = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + while result_ut.poll() is None: + line = result_ut.stdout.readline().strip() + if line: + print(line) + + ut_flag = False + if result_ut.returncode == 0: + ut_flag = True + print("run ut successfully.") + else: + print("run ut failed.") + + return ut_flag + +if __name__=="__main__": + if run_ut(): + sys.exit(0) + else: + sys.exit(1) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/common/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/common/test_common_utils.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/common/test_common_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5f25e81c09783eeb8c682fd33d3178b99352f6e0 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/common/test_common_utils.py @@ -0,0 +1,124 @@ +import unittest +import os +import numpy as np +import torch +from api_accuracy_checker.common.utils import * + +class TestUtils(unittest.TestCase): + + def test_read_json(self): + test_dict = {"key": "value"} + with open('test.json', 'w') as f: + json.dump(test_dict, f) + self.assertEqual(read_json('test.json'), test_dict) + 
os.remove('test.json') + + def test_write_csv(self): + test_data = [["name", "age"], ["Alice", "20"], ["Bob", "30"]] + write_csv(test_data, 'test.csv') + with open('test.csv', 'r', encoding='utf-8-sig') as f: + reader = csv.reader(f) + for i, row in enumerate(reader): + self.assertEqual(row, test_data[i]) + os.remove('test.csv') + + def test_print_info_log(self): + try: + print_info_log("Test message") + except Exception as e: + self.fail(f"print_info_log raised exception {e}") + + def test_check_mode_valid(self): + try: + check_mode_valid(Const.ALL) + except Exception as e: + self.fail(f"check_mode_valid raised exception {e}") + + def test_check_object_type(self): + try: + check_object_type(123, int) + except Exception as e: + self.fail(f"check_object_type raised exception {e}") + + def test_check_file_or_directory_path(self): + try: + check_file_or_directory_path(__file__) + except Exception as e: + self.fail(f"check_file_or_directory_path raised exception {e}") + + def test_get_dump_data_path(self): + path, exist = get_dump_data_path(os.path.dirname(__file__)) + self.assertTrue(exist) + + def test_create_directory(self): + create_directory('test_dir') + self.assertTrue(os.path.exists('test_dir')) + os.rmdir('test_dir') + + def test_execute_command(self): + execute_command(['echo', 'Hello, World!']) + + def test_parse_arg_value(self): + values = "1,2,3;4,5,6" + expected_result = [[1, 2, 3], [4, 5, 6]] + self.assertEqual(parse_arg_value(values), expected_result) + + def test_parse_value_by_comma(self): + value = "1,2,3" + expected_result = [1, 2, 3] + self.assertEqual(parse_value_by_comma(value), expected_result) + + def test_get_data_len_by_shape(self): + shape = [2, 3, 4] + expected_result = 24 + self.assertEqual(get_data_len_by_shape(shape), expected_result) + + def test_add_time_as_suffix(self): + name = "test" + result = add_time_as_suffix(name) + self.assertTrue(result.startswith(name)) + + def test_get_time(self): + result = get_time() + self.assertTrue(isinstance(result, str)) + + def test_format_value(self): + value = 123.456789 + expected_result = '123.456789' + self.assertEqual(format_value(value), expected_result) + + def test_seed_all(self): + seed_all(1234) + + def test_get_process_rank(self): + model = torch.nn.Linear(10, 10) + rank, _ = get_process_rank(model) + self.assertEqual(rank, 0) + + def test_get_json_contents(self): + test_dict = {"key": "value"} + with open('test.json', 'w') as f: + json.dump(test_dict, f) + self.assertEqual(get_json_contents('test.json'), test_dict) + os.remove('test.json') + + def test_get_file_content_bytes(self): + with open('test.txt', 'w') as f: + f.write("Hello, World!") + self.assertEqual(get_file_content_bytes('test.txt'), b"Hello, World!") + os.remove('test.txt') + + def test_islink(self): + self.assertFalse(islink(__file__)) + + def test_check_path_length_valid(self): + self.assertTrue(check_path_length_valid(__file__)) + + def test_check_path_pattern_valid(self): + self.assertIsNone(check_path_pattern_valid(__file__)) + + def test_check_input_file_valid(self): + self.assertIsNone(check_input_file_valid(__file__)) + + def test_check_need_convert(self): + self.assertIsNone(check_need_convert("unknown_api")) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/common/test_config.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/common/test_config.py new file mode 100644 index 0000000000000000000000000000000000000000..a68057dfb41ca38ba79e1daa992a8f51ce4d64e4 --- /dev/null +++ 
b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/common/test_config.py @@ -0,0 +1,21 @@ +import unittest +import os +from api_accuracy_checker.common.config import Config + +class TestConfig(unittest.TestCase): + def setUp(self): + cur_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) + yaml_path = os.path.join(cur_path, "config.yaml") + self.yaml_file = yaml_path + self.config = Config(self.yaml_file) + + def test_validate(self): + self.assertEqual(self.config.validate('dump_path', '/path/to/dump'), '/path/to/dump') + + with self.assertRaises(ValueError): + self.config.validate('dump_path', 123) + + + def test_update_config(self): + self.config.update_config(dump_path='/new/path/to/dump') + self.assertEqual(self.config.dump_path, '/new/path/to/dump') diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/test_algorithm.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/test_algorithm.py new file mode 100644 index 0000000000000000000000000000000000000000..90e18d166f56f98b8c1e1f80f2ae28dab7db67d3 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/test_algorithm.py @@ -0,0 +1,32 @@ +import unittest +import numpy as np +import torch +from api_accuracy_checker.compare import compare as cmp +from api_accuracy_checker.compare import algorithm as alg + +class TestAlgorithmMethods(unittest.TestCase): + + def test_get_max_abs_err(self): + b_value = np.array([1.0, 2.0, 3.0]) + n_value = np.array([1.0, 2.0, 3.0]) + abs_err = np.abs(b_value - n_value) + self.assertEqual(alg.get_max_abs_err(abs_err), (0.0, True)) + + def test_get_rel_err_ratio_thousandth(self): + b_value = np.array([1.0, 2.0, 3.0]) + n_value = np.array([1.0, 2.0, 3.0]) + abs_err = np.abs(b_value - n_value) + rel_err = alg.get_rel_err_origin(abs_err, b_value) + self.assertEqual(alg.get_rel_err_ratio(rel_err, 0.001), (1.0, True)) + + def test_get_rel_err_ratio_ten_thousandth(self): + b_value = np.array([1.0, 2.0, 3.0]) + n_value = np.array([1.0, 2.0, 3.0]) + abs_err = np.abs(b_value - n_value) + rel_err = alg.get_rel_err_origin(abs_err, b_value) + self.assertEqual(alg.get_rel_err_ratio(rel_err, 0.0001), (1.0, True)) + + def test_cosine_sim(self): + cpu_output = np.array([1.0, 2.0, 3.0]) + npu_output = np.array([1.0, 2.0, 3.0]) + self.assertEqual(alg.cosine_sim(cpu_output, npu_output), (1.0, True, '')) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/test_compare.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/test_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..4ce73ce550dfc5d5cd21246dbc2756a6024f6fea --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/test_compare.py @@ -0,0 +1,111 @@ +import csv +import os +import shutil +import time +import unittest + +import numpy as np +import torch.nn.functional + +from api_accuracy_checker.compare.compare import Comparator +from api_accuracy_checker.compare.compare_column import CompareColumn + +current_time = time.strftime("%Y%m%d%H%M%S") +RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + ".csv" 
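+# The 14-digit timestamp (%Y%m%d%H%M%S) in these fixture names is deliberate:
+# get_validated_result_csv_path() in run_ut only accepts result files matching
+# r"^accuracy_checking_result_\d{14}\.csv$" when a run is resumed, e.g.
+# accuracy_checking_result_20240101120000.csv (an example name, not a real artifact).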
+DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + ".csv"
+base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+class TestCompare(unittest.TestCase):
+    def setUp(self):
+        self.output_path = os.path.join(base_dir, "../compare_result")
+        os.mkdir(self.output_path, mode=0o750)
+        self.result_csv_path = os.path.join(self.output_path, RESULT_FILE_NAME)
+        self.details_csv_path = os.path.join(self.output_path, DETAILS_FILE_NAME)
+        self.is_continue_run_ut = False
+        self.compare = Comparator(self.result_csv_path, self.details_csv_path, self.is_continue_run_ut)
+
+    def tearDown(self) -> None:
+        if os.path.exists(self.output_path):
+            shutil.rmtree(self.output_path)
+
+    def test_compare_dropout(self):
+        dummy_input = torch.randn(100, 100)
+        bench_out = torch.nn.functional.dropout2d(dummy_input, 0.3)
+        npu_out = torch.nn.functional.dropout2d(dummy_input, 0.3)
+        self.assertTrue(self.compare._compare_dropout("api", bench_out, npu_out))
+
+    def test_compare_core_wrapper(self):
+        dummy_input = torch.randn(100, 100)
+        bench_out, npu_out = dummy_input, dummy_input
+        test_final_success, detailed_result_total = self.compare._compare_core_wrapper("api", bench_out, npu_out)
+        actual_cosine_similarity = detailed_result_total[0][3]
+        # Use a small tolerance
+        tolerance = 1e-4
+        # Check that the actual cosine similarity is within the tolerance of the expected value
+        self.assertTrue(np.isclose(actual_cosine_similarity, 1.0, atol=tolerance))
+        # Compare the remaining values to make sure they match expectations
+        detailed_result_total[0][3] = 1.0
+        self.assertEqual(detailed_result_total, [['torch.float32', 'torch.float32', (100, 100), 1.0, 0.0, ' ', ' ', ' ',
+                                                  ' ', 0.0, 0.0, 0, 0.0, 0.0, ' ', ' ', ' ', 'pass',
+                                                  '\nMax abs error is less than 0.001, consider as pass, skip other check and set to SPACE.\n']])
+        self.assertTrue(test_final_success)
+
+        bench_out, npu_out = [dummy_input, dummy_input], [dummy_input, dummy_input]
+        test_final_success, detailed_result_total = self.compare._compare_core_wrapper("api", bench_out, npu_out)
+        actual_cosine_similarity = detailed_result_total[0][3]
+        self.assertTrue(np.isclose(actual_cosine_similarity, 1.0, atol=tolerance))
+        actual_cosine_similarity = detailed_result_total[1][3]
+        self.assertTrue(np.isclose(actual_cosine_similarity, 1.0, atol=tolerance))
+        detailed_result_total[0][3] = 1.0
+        detailed_result_total[1][3] = 1.0
+        self.assertTrue(test_final_success)
+        self.assertEqual(detailed_result_total, [['torch.float32', 'torch.float32', (100, 100), 1.0, 0.0, ' ', ' ', ' ',
+                                                  ' ', 0.0, 0.0, 0, 0.0, 0.0, ' ', ' ', ' ', 'pass',
+                                                  '\nMax abs error is less than 0.001, consider as pass, skip other check and set to SPACE.\n'],
+                                                 ['torch.float32', 'torch.float32', (100, 100), 1.0, 0.0, ' ', ' ', ' ', ' ', 0.0, 0.0, 0, 0.0, 0.0, ' ', ' ',
+                                                  ' ', 'pass', '\nMax abs error is less than 0.001, consider as pass, skip other check and set to SPACE.\n']])
+
+    def test_compare_output(self):
+        bench_out, npu_out = torch.randn(100, 100), torch.randn(100, 100)
+        bench_grad, npu_grad = [torch.randn(100, 100)], [torch.randn(100, 100)]
+        api_name = 'Functional*conv2d*0'
+        is_fwd_success, is_bwd_success = self.compare.compare_output(api_name, bench_out, npu_out, bench_grad, npu_grad)
+        self.assertFalse(is_fwd_success)
+        self.assertFalse(is_bwd_success)
+
+        dummy_input = torch.randn(100, 100)
+        bench_out, npu_out = dummy_input, dummy_input
+        is_fwd_success, is_bwd_success = self.compare.compare_output(api_name, bench_out, npu_out)
+        self.assertTrue(is_fwd_success)
+        self.assertTrue(is_bwd_success)
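+
+    # Note on naming, inferred from the assertion below rather than a documented
+    # contract: record_results() writes one details row per output, keyed as
+    # '<api_full_name>.forward.output.<index>', e.g. 'Functional*conv2d*0.forward.output.0'.
+    def test_record_results(self):
+        args = ('Functional*conv2d*0', False, 'N/A', 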
[['torch.float64', 'torch.float32', (32, 64, 112, 112), 1.0, + 0.012798667686, 'N/A', 0.81631212311, 0.159979121213, 'N/A', + 'error', '\n']], None) + self.compare.record_results(*args) + with open(self.details_csv_path, 'r') as file: + csv_reader = csv.reader(file) + next(csv_reader) + api_name_list = [row[0] for row in csv_reader] + self.assertEqual(api_name_list[0], 'Functional*conv2d*0.forward.output.0') + + def test_compare_torch_tensor(self): + cpu_output = torch.Tensor([1.0, 2.0, 3.0]) + npu_output = torch.Tensor([1.0, 2.0, 3.0]) + compare_column = CompareColumn() + status, compare_column, message = self.compare._compare_torch_tensor("api", cpu_output, npu_output, compare_column) + self.assertEqual(status, "pass") + + def test_compare_bool_tensor(self): + cpu_output = np.array([True, False, True]) + npu_output = np.array([True, False, True]) + self.assertEqual(self.compare._compare_bool_tensor(cpu_output, npu_output), (0.0, 'pass', '')) + + def test_compare_builtin_type(self): + compare_column = CompareColumn() + bench_out = 1 + npu_out = 1 + status, compare_result, message = self.compare._compare_builtin_type(bench_out, npu_out, compare_column) + self.assertEqual((status, compare_result.error_rate, message), ('pass', 0, '')) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/test_compare_utils.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/test_compare_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4e83c0643ef452c28d11c02bbbc2fee359a1ea2e --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/compare/test_compare_utils.py @@ -0,0 +1,25 @@ +import unittest +import numpy as np +from api_accuracy_checker.compare.compare_utils import CompareConst, check_dtype_comparable + +class TestCompareUtils(unittest.TestCase): + def test_check_dtype_comparable(self): + x = np.array([1, 2, 3], dtype=np.int32) + y = np.array([4, 5, 6], dtype=np.int32) + self.assertTrue(check_dtype_comparable(x, y)) + + x = np.array([1.0, 2.0, 3.0], dtype=np.float32) + y = np.array([4.0, 5.0, 6.0], dtype=np.float32) + self.assertTrue(check_dtype_comparable(x, y)) + + x = np.array([True, False, True], dtype=np.bool_) + y = np.array([False, True, False], dtype=np.bool_) + self.assertTrue(check_dtype_comparable(x, y)) + + x = np.array([1, 2, 3], dtype=np.int32) + y = np.array([4.0, 5.0, 6.0], dtype=np.float32) + self.assertFalse(check_dtype_comparable(x, y)) + + x = np.array([1, 2, 3], dtype=np.int32) + y = np.array([True, False, True], dtype=np.bool_) + self.assertFalse(check_dtype_comparable(x, y)) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_api_info.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_api_info.py new file mode 100644 index 0000000000000000000000000000000000000000..2c03d56e722decc424052367dfe9700ba3df94ce --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_api_info.py @@ -0,0 +1,131 @@ +import os +import shutil +import unittest +import torch +import numpy as np +from api_accuracy_checker.dump.api_info import APIInfo, ForwardAPIInfo, BackwardAPIInfo, transfer_types, \ + get_tensor_extremum, 
get_type_name, is_builtin_class, analyze_device_in_kwargs, analyze_dtype_in_kwargs +from api_accuracy_checker.common.config import msCheckerConfig + + +class TestAPIInfo(unittest.TestCase): + def setUp(self): + if os.path.exists('./step-1'): + shutil.rmtree('./step-1') + self.api = APIInfo("test_api", APIInfo.get_full_save_path("./", "forward_real_data", True), True) + + def test_analyze_element(self): + element = [1, 2, 3] + result = self.api.analyze_element(element) + self.assertEqual(result, + [{'type': 'int', 'value': 1}, {'type': 'int', 'value': 2}, {'type': 'int', 'value': 3}]) + + def test_analyze_tensor(self): + tensor = torch.tensor([1, 2, 3], dtype=torch.float32, requires_grad=True) + result = self.api._analyze_tensor(tensor) + self.assertEqual(result.get('type'), 'torch.Tensor') + self.assertTrue(result.get('requires_grad')) + datapath = result.get('datapath') + self.assertTrue(datapath.startswith('forward_real_data') or datapath.startswith('backward_real_data')) + + def test_analyze_builtin(self): + arg = slice(1, 10, 2) + result = self.api._analyze_builtin(arg) + self.assertEqual(result, {'type': 'slice', 'value': [1, 10, 2]}) + + def test_transfer_types(self): + data = 10 + dtype = 'int' + result = transfer_types(data, dtype) + self.assertEqual(result, 10) + + def test_is_builtin_class(self): + element = 10 + result = is_builtin_class(element) + self.assertTrue(result) + + def test_analyze_device_in_kwargs(self): + element = torch.device('cuda:0') + result = analyze_device_in_kwargs(element) + self.assertEqual(result, {'type': 'torch.device', 'value': 'cuda:0'}) + + def test_analyze_dtype_in_kwargs(self): + element = torch.float32 + result = analyze_dtype_in_kwargs(element) + self.assertEqual(result, {'type': 'torch.dtype', 'value': 'torch.float32'}) + + def test_get_tensor_extremum(self): + data = torch.tensor([1, 2, 3]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertEqual(result_max_origin, 3) + self.assertEqual(result_min_origin, 1) + + data = torch.tensor([1, float("inf"), 2, 3]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertEqual(result_max_origin, float("inf")) + self.assertEqual(result_min_origin, 1) + + data = torch.tensor([1, float("-inf"), 2, 3]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertEqual(result_max_origin, 3) + self.assertEqual(result_min_origin, float("-inf")) + + data = torch.tensor([1, float("inf"), float("nan"), 3]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertTrue(np.isnan(result_max_origin)) + self.assertTrue(np.isnan(result_min_origin)) + + data = torch.tensor([float("inf"), float("nan")]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, float("inf")) + self.assertEqual(result_min, float("inf")) + self.assertTrue(np.isnan(result_max_origin)) + 
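# get_tensor_extremum appears to return, per direction, a pair: the extremum with
+        # inf/nan filtered out and the unfiltered "origin" extremum -- an inference from
+        # these assertions (e.g. tensor([1., inf]) gives max 1.0 but max_origin inf),
+        # not a documented contract.
+        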
self.assertTrue(np.isnan(result_min_origin)) + + data = torch.tensor([float("nan"), float("nan")]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertTrue(np.isnan(result_max)) + self.assertTrue(np.isnan(result_min)) + self.assertTrue(np.isnan(result_max_origin)) + self.assertTrue(np.isnan(result_min_origin)) + + def test_get_type_name(self): + name = "" + result = get_type_name(name) + self.assertEqual(result, 'int') + + def test_ForwardAPIInfo(self): + forward_api_info = ForwardAPIInfo("test_forward_api", [1, 2, 3], {"a": 1, "b": 2}) + self.assertEqual(forward_api_info.api_name, "test_forward_api") + self.assertEqual(forward_api_info.save_path, + APIInfo.get_full_save_path(msCheckerConfig.dump_path, 'forward_real_data', True)) + self.assertEqual(forward_api_info.api_info_struct, {"test_forward_api": { + "args": [{'type': 'int', 'value': 1}, {'type': 'int', 'value': 2}, {'type': 'int', 'value': 3}, ], + "kwargs": {'a': {'type': 'int', 'value': 1}, 'b': {'type': 'int', 'value': 2}}}}) + + def test_BackwardAPIInfo(self): + backward_api_info = BackwardAPIInfo("test_backward_api", [1, 2, 3]) + self.assertEqual(backward_api_info.api_name, "test_backward_api") + self.assertEqual(backward_api_info.save_path, + APIInfo.get_full_save_path(msCheckerConfig.dump_path, 'backward_real_data', True)) + self.assertEqual(backward_api_info.grad_info_struct, { + "test_backward_api": [{'type': 'int', 'value': 1}, {'type': 'int', 'value': 2}, + {'type': 'int', 'value': 3}]}) + + def tearDown(self): + if os.path.exists('./step-1'): + shutil.rmtree('./step-1') diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_dump.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..655e624e809a5cceb406b9fce9df4e4f89efb4ee --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_dump.py @@ -0,0 +1,32 @@ +import unittest +from api_accuracy_checker.dump.dump import * + +class TestDumpUtil(unittest.TestCase): + def test_set_dump_switch(self): + set_dump_switch("ON") + self.assertEqual(DumpUtil.dump_switch, "ON") + set_dump_switch("OFF") + self.assertEqual(DumpUtil.dump_switch, "OFF") + + def test_get_dump_switch(self): + DumpUtil.dump_switch = "ON" + self.assertTrue(DumpUtil.get_dump_switch()) + DumpUtil.dump_switch = "OFF" + self.assertFalse(DumpUtil.get_dump_switch()) + + def test_incr_iter_num_maybe_exit(self): + msCheckerConfig.target_iter = [5] + msCheckerConfig.enable_dataloader = True + + DumpUtil.call_num = 6 + with self.assertRaises(Exception): + DumpUtil.incr_iter_num_maybe_exit() + + DumpUtil.call_num = 4 + DumpUtil.incr_iter_num_maybe_exit() + self.assertEqual(DumpUtil.dump_switch, "OFF") + + msCheckerConfig.enable_dataloader = False + DumpUtil.call_num = 5 + DumpUtil.incr_iter_num_maybe_exit() + self.assertEqual(DumpUtil.dump_switch, "ON") diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_dump_scope.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_dump_scope.py new file mode 100644 index 0000000000000000000000000000000000000000..7712552abe49d757a07bcbbd746038ed22d4027b --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_dump_scope.py @@ -0,0 +1,23 @@ +import unittest +from api_accuracy_checker.dump.dump_scope import iter_tracer +from 
api_accuracy_checker.dump.dump import DumpUtil + + +class TestDumpScope(unittest.TestCase): + def test_iter_tracer(self): + DumpUtil.call_num = 0 + + def dummy_func(): + return "Hello, World!" + + wrapped_func = iter_tracer(dummy_func) + result = wrapped_func() + self.assertEqual(DumpUtil.dump_switch, "OFF") + self.assertEqual(result, "Hello, World!") + + def another_dummy_func(): + return 123 + wrapped_func = iter_tracer(another_dummy_func) + result = wrapped_func() + self.assertEqual(DumpUtil.dump_switch, "OFF") + self.assertEqual(result, 123) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_info_dump.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_info_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..45e57f2c389292e9226039f56b83966941c603ca --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_info_dump.py @@ -0,0 +1,28 @@ +import unittest +import os +from unittest.mock import patch +from api_accuracy_checker.dump.api_info import APIInfo, BackwardAPIInfo +from api_accuracy_checker.dump.info_dump import write_api_info_json + + +class TestInfoDump(unittest.TestCase): + + def test_write_api_info_json_backward(self): + api_info = BackwardAPIInfo("test_backward_api", [1, 2, 3]) + with patch('api_accuracy_checker.dump.info_dump.write_json') as mock_write_json: + write_api_info_json(api_info) + rank = os.getpid() + mock_write_json.assert_called_with(f'./step0/backward_info_{rank}.json', api_info.grad_info_struct) + + def test_write_api_info_json_invalid_type(self): + api_info = APIInfo("test_api", APIInfo.get_full_save_path("save_path", "forward_real_data", contain_step=True), + is_save_data=True) + with self.assertRaises(ValueError): + write_api_info_json(api_info) + + def tearDown(self): + rank = os.getpid() + files = [f'./step0/backward_info_{rank}.json'] + for file in files: + if os.path.exists(file): + os.remove(file) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/test_wrap_functional.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/test_wrap_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..37058e77fd87e697b7dd7fde5e94b78d01a2cb89 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/test_wrap_functional.py @@ -0,0 +1,15 @@ +# coding=utf-8 +import unittest +import torch +from api_accuracy_checker.hook_module import wrap_functional as wf + +class TestWrapFunctional(unittest.TestCase): + + def test_get_functional_ops(self): + expected_ops = {'relu', 'sigmoid', 'softmax'} + actual_ops = wf.get_functional_ops() + self.assertTrue(expected_ops.issubset(actual_ops)) + + def test_wrap_functional_ops_and_bind(self): + wf.wrap_functional_ops_and_bind(None) + self.assertTrue(hasattr(wf.HOOKFunctionalOP, 'wrap_relu')) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/test_wrap_tensor.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/test_wrap_tensor.py new file mode 100644 index 
0000000000000000000000000000000000000000..bfae3c72771510b141abf9204723bfe48bfa8de3 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/test_wrap_tensor.py @@ -0,0 +1,29 @@ +# coding=utf-8 +import unittest +import torch +import yaml +from api_accuracy_checker.hook_module.wrap_tensor import get_tensor_ops, HOOKTensor, TensorOPTemplate, wrap_tensor_op, wrap_tensor_ops_and_bind + +class TestWrapTensor(unittest.TestCase): + def hook(self, a, b): + return + + def test_get_tensor_ops(self): + result = get_tensor_ops() + self.assertIsInstance(result, set) + + def test_HOOKTensor(self): + hook_tensor = HOOKTensor() + self.assertIsInstance(hook_tensor, HOOKTensor) + + def test_TensorOPTemplate(self): + tensor_op_template = TensorOPTemplate('add', self.hook) + self.assertEqual(tensor_op_template.op_name_, 'add') + + def test_wrap_tensor_op(self): + wrapped_op = wrap_tensor_op('add', self.hook) + self.assertTrue(callable(wrapped_op)) + + def test_wrap_tensor_ops_and_bind(self): + wrap_tensor_ops_and_bind(self.hook) + self.assertTrue(hasattr(HOOKTensor, 'wrap_add')) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/test_wrap_torch.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/test_wrap_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..40cef939adfd06158eb543c07b3d682e29d6cdab --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/hook_module/test_wrap_torch.py @@ -0,0 +1,37 @@ +# coding=utf-8 +import unittest +import torch +import yaml +from api_accuracy_checker.hook_module.wrap_torch import * + +class TestWrapTorch(unittest.TestCase): + + def setUp(self): + self.op_name = 'add' + self.torch_op = wrap_torch_op(self.op_name, self.hook) + + def hook(self, a, b): + return + + def test_get_torch_ops(self): + ops = get_torch_ops() + self.assertIsInstance(ops, set) + self.assertIn(self.op_name, ops) + + def test_TorchOPTemplate(self): + template = TorchOPTemplate(self.op_name, self.hook) + self.assertEqual(template.op_name_, self.op_name) + self.assertEqual(template.prefix_op_name_, "Torch*" + str(self.op_name) + "*") + + def test_input_param_need_adapt(self): + template = TorchOPTemplate(self.op_name, self.hook) + self.assertFalse(template.input_param_need_adapt()) + + def test_forward(self): + template = TorchOPTemplate(self.op_name, self.hook) + result = template.forward(torch.tensor([1, 2, 3]), torch.tensor([4, 5, 6])) + torch.testing.assert_allclose(result, torch.tensor([5, 7, 9])) + + def test_wrap_torch_ops_and_bind(self): + wrap_torch_ops_and_bind(self.hook) + self.assertTrue(hasattr(HOOKTorchOP, "wrap_" + self.op_name)) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/test_data_generate.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/test_data_generate.py new file mode 100644 index 0000000000000000000000000000000000000000..b98f84d516404665b5c3284f1e03f14eedddac55 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/test_data_generate.py @@ -0,0 +1,97 @@ +# coding=utf-8 +import unittest +import 
numpy as np +import os +import copy +from api_accuracy_checker.run_ut.data_generate import * +from api_accuracy_checker.common.utils import get_json_contents + +base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +forward_file = os.path.join(base_dir, "../resources/forward.json") +forward_content = get_json_contents(forward_file) +for api_full_name, api_info_dict in forward_content.items(): + api_full_name = api_full_name + api_info_dict = api_info_dict + +max_value = 5.7421875 +min_value = -5.125 + +class TestDataGenerateMethods(unittest.TestCase): + def test_gen_api_params(self): + api_info = copy.deepcopy(api_info_dict) + args_params, kwargs_params = gen_api_params(api_info, True, None, None) + max_diff = abs(args_params[0].max() - max_value) + min_diff = abs(args_params[0].min() - min_value) + self.assertEqual(len(args_params), 1) + self.assertEqual(args_params[0].dtype, torch.float32) + self.assertLessEqual(max_diff, 0.001) + self.assertLessEqual(min_diff, 0.001) + self.assertEqual(args_params[0].shape, torch.Size([2, 2560, 24, 24])) + self.assertEqual(kwargs_params, {'inplace': False}) + + def test_gen_args(self): + args_result = gen_args(api_info_dict.get('args'), real_data_path=None) + max_diff = abs(args_result[0].max() - max_value) + min_diff = abs(args_result[0].min() - min_value) + self.assertEqual(len(args_result), 1) + self.assertEqual(args_result[0].dtype, torch.float32) + self.assertLessEqual(max_diff, 0.001) + self.assertLessEqual(min_diff, 0.001) + self.assertEqual(args_result[0].shape, torch.Size([2, 2560, 24, 24])) + + def test_gen_data(self): + data = gen_data(api_info_dict.get('args')[0], True, None, None) + max_diff = abs(data.max() - max_value) + min_diff = abs(data.min() - min_value) + self.assertEqual(data.dtype, torch.float32) + self.assertEqual(data.requires_grad, True) + self.assertLessEqual(max_diff, 0.001) + self.assertLessEqual(min_diff, 0.001) + self.assertEqual(data.shape, torch.Size([2, 2560, 24, 24])) + + def test_gen_kwargs(self): + api_info = copy.deepcopy(api_info_dict) + kwargs_params = gen_kwargs(api_info, None) + self.assertEqual(kwargs_params, {'inplace': False}) + + def test_gen_kwargs_2(self): + k_dict = {"inplace": {"type": "bool", "value": "False"}} + for key, value in k_dict.items(): + gen_torch_kwargs(k_dict, key, value) + self.assertEqual(k_dict, {'inplace': False}) + + def test_gen_random_tensor(self): + data = gen_random_tensor(api_info_dict.get('args')[0], None) + max_diff = abs(data.max() - max_value) + min_diff = abs(data.min() - min_value) + self.assertEqual(data.dtype, torch.float32) + self.assertEqual(data.requires_grad, False) + self.assertLessEqual(max_diff, 0.001) + self.assertLessEqual(min_diff, 0.001) + self.assertEqual(data.shape, torch.Size([2, 2560, 24, 24])) + + def test_gen_common_tensor(self): + info = api_info_dict.get('args')[0] + low, high = info.get('Min'), info.get('Max') + low_origin, high_origin = info.get('Min_origin'), info.get('Max_origin') + low_info = [low, low_origin] + high_info = [high, high_origin] + data_dtype = info.get('dtype') + shape = tuple(info.get('shape')) + data = gen_common_tensor(low_info, high_info, shape, data_dtype, None) + max_diff = abs(data.max() - max_value) + min_diff = abs(data.min() - min_value) + self.assertEqual(data.dtype, torch.float32) + self.assertEqual(data.requires_grad, False) + self.assertLessEqual(max_diff, 0.001) + self.assertLessEqual(min_diff, 0.001) + self.assertEqual(data.shape, torch.Size([2, 2560, 24, 24])) + + def test_gen_bool_tensor(self): + 
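# Sketch of the expected behavior, inferred from the assertions below rather than
+        # a documented contract: gen_bool_tensor(low, high, shape) draws integers in
+        # [low, high] and casts them to a torch.bool tensor of the given shape.
+        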
info = {"type": "torch.Tensor", "dtype": "torch.bool", "shape": [1, 1, 160, 256], \ + "Max": 1, "Min": 0, "requires_grad": False} + low, high = info.get("Min"), info.get("Max") + shape = tuple(info.get("shape")) + data = gen_bool_tensor(low, high, shape) + self.assertEqual(data.shape, torch.Size([1, 1, 160, 256])) + self.assertEqual(data.dtype, torch.bool) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/test_multi_run_ut.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/test_multi_run_ut.py new file mode 100644 index 0000000000000000000000000000000000000000..315b16127972103dffdfe89c941d330c6962305d --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/test_multi_run_ut.py @@ -0,0 +1,103 @@ +import unittest +from unittest.mock import patch, mock_open, MagicMock +import json +import signal +from api_accuracy_checker.run_ut.multi_run_ut import split_json_file, signal_handler, run_parallel_ut, prepare_config, main, ParallelUTConfig + + +class TestMultiRunUT(unittest.TestCase): + + def setUp(self): + self.test_json_file = 'test_file.json' + self.test_data = {'key1': 'TRUE', 'key2': 'TRUE', 'key3': 'TRUE'} + self.test_json_content = json.dumps(self.test_data) + self.forward_split_files_content = [ + {'key1': 'TRUE', 'key2': 'TRUE'}, + {'key3': 'TRUE', 'key4': 'TRUE'} + ] + + @patch('api_accuracy_checker.run_ut.multi_run_ut.FileOpen') + def test_split_json_file(self, mock_FileOpen): + mock_FileOpen.return_value.__enter__.return_value = mock_open(read_data=self.test_json_content).return_value + num_splits = 2 + split_files, total_items = split_json_file(self.test_json_file, num_splits, False) + self.assertEqual(len(split_files), num_splits) + self.assertEqual(total_items, len(self.test_data)) + + @patch('api_accuracy_checker.run_ut.multi_run_ut.print_warn_log') + def test_signal_handler(self, mock_print_warn_log): + with self.assertRaises(KeyboardInterrupt): + signal_handler(signal.SIGINT, None) + mock_print_warn_log.assert_called() + + @patch('subprocess.Popen') + @patch('os.path.exists', return_value=True) + @patch('builtins.open', new_callable=mock_open) + @patch('json.load', side_effect=lambda f: {'key1': 'TRUE', 'key2': 'TRUE'}) + def test_run_parallel_ut(self, mock_json_load, mock_file, mock_exists, mock_popen): + mock_process = MagicMock() + mock_process.poll.side_effect = [None, None, 1] + mock_process.stdout.readline.side_effect = ['[ERROR] Test Error Message\n', ''] + mock_popen.return_value = mock_process + + config = ParallelUTConfig( + forward_files=['forward_split1.json', 'forward_split2.json'], + backward_files=[None, None], + out_path='./', + num_splits=2, + save_error_data_flag=True, + jit_compile_flag=False, + device_id=[0, 1], + result_csv_path='result.csv', + total_items=2, + real_data_path=None + ) + + mock_file.side_effect = [ + mock_open(read_data=json.dumps(self.forward_split_files_content[0])).return_value, + mock_open(read_data=json.dumps(self.forward_split_files_content[1])).return_value + ] + + run_parallel_ut(config) + + mock_popen.assert_called() + mock_exists.assert_called() + + @patch('os.remove') + @patch('os.path.realpath', side_effect=lambda x: x) + @patch('api_accuracy_checker.run_ut.multi_run_ut.check_link') + @patch('api_accuracy_checker.run_ut.multi_run_ut.check_file_suffix') + @patch('api_accuracy_checker.run_ut.multi_run_ut.FileChecker') + @patch('api_accuracy_checker.run_ut.multi_run_ut.split_json_file', return_value=(['forward_split1.json', 
'forward_split2.json'], 2)) + def test_prepare_config(self, mock_split_json_file, mock_FileChecker, mock_check_file_suffix, mock_check_link, mock_realpath, mock_remove): + mock_FileChecker_instance = MagicMock() + mock_FileChecker_instance.common_check.return_value = './' + mock_FileChecker.return_value = mock_FileChecker_instance + args = MagicMock() + args.forward_input_file = 'forward.json' + args.backward_input_file = None + args.out_path = './' + args.num_splits = 2 + args.save_error_data = True + args.jit_compile = False + args.device_id = [0, 1] + args.result_csv_path = None + args.real_data_path = None + + config = prepare_config(args) + + self.assertEqual(config.num_splits, 2) + self.assertTrue(config.save_error_data_flag) + self.assertFalse(config.jit_compile_flag) + self.assertEqual(config.device_id, [0, 1]) + self.assertEqual(len(config.forward_files), 2) + self.assertEqual(config.total_items, 2) + + @patch('argparse.ArgumentParser.parse_args') + @patch('api_accuracy_checker.run_ut.multi_run_ut.prepare_config') + @patch('api_accuracy_checker.run_ut.multi_run_ut.run_parallel_ut') + def test_main(self, mock_run_parallel_ut, mock_prepare_config, mock_parse_args): + main() + mock_parse_args.assert_called() + mock_prepare_config.assert_called() + mock_run_parallel_ut.assert_called() \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/test_run_ut.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/test_run_ut.py new file mode 100644 index 0000000000000000000000000000000000000000..fdcc1cfddeb38d4fca0d2a67a09147b571b35def --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/run_ut/test_run_ut.py @@ -0,0 +1,70 @@ +# coding=utf-8 +import os +import copy +import unittest +from unittest.mock import patch, DEFAULT +import torch +from api_accuracy_checker.run_ut.run_ut import * +from api_accuracy_checker.common.utils import get_json_contents + +base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +forward_file = os.path.join(base_dir, "../resources/forward.json") +forward_content = get_json_contents(forward_file) +for api_full_name, api_info_dict in forward_content.items(): + api_full_name = api_full_name + api_info_dict = api_info_dict + +class TestRunUtMethods(unittest.TestCase): + def test_exec_api(self): + api_info = copy.deepcopy(api_info_dict) + [api_type, api_name, _] = api_full_name.split("*") + args, kwargs, need_grad = get_api_info(api_info, api_name, None) + cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, True, '') + out = exec_api(api_type, api_name, cpu_args, cpu_kwargs) + self.assertEqual(out.dtype, torch.float64) + self.assertTrue(out.requires_grad) + self.assertEqual(out.shape, torch.Size([2, 2560, 24, 24])) + + def test_generate_device_params(self): + mock_tensor = torch.rand([2, 2560, 24, 24], dtype=torch.float32, requires_grad=True) + + with patch.multiple('torch.Tensor', + to=DEFAULT, + clone=DEFAULT, + detach=DEFAULT, + requires_grad_=DEFAULT, + type_as=DEFAULT, + retain_grad=DEFAULT) as mocks: + mocks['clone'].return_value = mock_tensor + mocks['detach'].return_value = mock_tensor + mocks['requires_grad_'].return_value = mock_tensor + mocks['type_as'].return_value = mock_tensor + mocks['retain_grad'].return_value = None + mocks['to'].return_value = mock_tensor + + device_args, device_kwargs = generate_device_params([mock_tensor], {'inplace': False}, True, '') + self.assertEqual(len(device_args), 1) + 
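# The patched to()/clone()/detach() calls all return the original mock tensor, so dtype, requires_grad and shape pass through unchanged. +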
self.assertEqual(device_args[0].dtype, torch.float32) + self.assertTrue(device_args[0].requires_grad) + self.assertEqual(device_args[0].shape, torch.Size([2, 2560, 24, 24])) + self.assertEqual(device_kwargs, {'inplace': False}) + + def test_generate_cpu_params(self): + api_info = copy.deepcopy(api_info_dict) + [api_type, api_name, _] = api_full_name.split("*") + args, kwargs, need_grad = get_api_info(api_info, api_name, None) + cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, True, '') + self.assertEqual(len(cpu_args), 1) + self.assertEqual(cpu_args[0].dtype, torch.float64) + self.assertTrue(cpu_args[0].requires_grad) + self.assertEqual(cpu_args[0].shape, torch.Size([2, 2560, 24, 24])) + self.assertEqual(cpu_kwargs, {'inplace': False}) + + def test_UtDataInfo(self): + data_info = UtDataInfo(None, None, None, None, None, None) + self.assertIsNone(data_info.bench_grad_out) + self.assertIsNone(data_info.device_grad_out) + self.assertIsNone(data_info.device_out) + self.assertIsNone(data_info.bench_out) + self.assertIsNone(data_info.grad_in) + self.assertIsNone(data_info.in_fwd_data_list) diff --git a/debug/accuracy_tools/atat/pytorch/common/__init__.py b/debug/accuracy_tools/atat/pytorch/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b391e103115498a2c2cf8b78f48168822517be73 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/__init__.py @@ -0,0 +1,4 @@ +from .recursive import recursive_apply_transform +from .log import print_error_log_rank_0, print_info_log_rank_0, print_warn_log_rank_0 +from .parse_json import parse_json_info_forward_backward +from .utils import seed_all diff --git a/debug/accuracy_tools/atat/pytorch/common/compare_script.template b/debug/accuracy_tools/atat/pytorch/common/compare_script.template new file mode 100644 index 0000000000000000000000000000000000000000..91565b3c87fa504ca96e7ebfd03f140f648a64c7 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/compare_script.template @@ -0,0 +1,14 @@ +from ptdbg_ascend import compare + +pkl_path = "%s" +dump_data_dir = "%s" + +dump_path_param = { + "npu_pkl_path": , + "bench_pkl_path": , + "npu_dump_data_dir": , + "bench_dump_data_dir": , + "is_print_compare_log": True +} + +compare(dump_path_param, output_path="", stack_mode=%s) diff --git a/debug/accuracy_tools/atat/pytorch/common/exceptions.py b/debug/accuracy_tools/atat/pytorch/common/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..17733b5bfd5f4b8ffcb3cb3602e3f5f54fdef97d --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/exceptions.py @@ -0,0 +1,75 @@ + +class CodedException(Exception): + def __init__(self, code, error_info=''): + self.error_info = self.err_strs.get(code) + error_info + + def __str__(self): + return self.error_info + + +class MsaccException(CodedException): + INVALID_PARAM_ERROR = 0 + OVERFLOW_NUMS_ERROR = 1 + + err_strs = { + INVALID_PARAM_ERROR: "[msacc] 无效参数: ", + OVERFLOW_NUMS_ERROR: "[msacc] 超过预设溢出次数 当前溢出次数:" + } + + +class FileCheckException(CodedException): + INVALID_FILE_ERROR = 0 + FILE_PERMISSION_ERROR = 1 + SOFT_LINK_ERROR = 2 + ILLEGAL_PATH_ERROR = 3 + ILLEGAL_PARAM_ERROR = 4 + FILE_TOO_LARGE_ERROR = 5 + + err_strs = { + SOFT_LINK_ERROR: "[msacc] 检测到软链接: ", + FILE_PERMISSION_ERROR: "[msacc] 文件权限错误: ", + INVALID_FILE_ERROR: "[msacc] 无效文件: ", + ILLEGAL_PATH_ERROR: "[msacc] 非法文件路径: ", + ILLEGAL_PARAM_ERROR: "[msacc] 非法打开方式: ", + FILE_TOO_LARGE_ERROR: "[msacc] 文件过大: " + } + + +class ParseJsonException(CodedException): + UnexpectedNameStruct 
= 0 + InvalidDumpJson = 1 + err_strs = { + UnexpectedNameStruct: "[msacc] Unexpected name in json: ", + InvalidDumpJson: "[msacc] json格式不正确: ", + } + + +class ScopeException(CodedException): + InvalidApiStr = 0 + InvalidScope = 1 + ArgConflict = 2 + err_strs = { + InvalidApiStr: "[msacc] Invalid api_list: ", + InvalidScope: "[msacc] Invalid scope: ", + ArgConflict: "[msacc] Scope and api_list conflict: ", + } + + +class RepairException(CodedException): + InvalidRepairType = 0 + err_strs = { + InvalidRepairType: "[msacc] Invalid repair_type: " + } + + +class StepException(CodedException): + InvalidPostProcess = 0 + err_strs = { + InvalidPostProcess: "[msacc] 错误的step后处理配置: ", + } + +class FreeBenchmarkException(CodedException): + UnsupportedType = 0 + err_strs = { + UnsupportedType: "[msacc] Free benchmark got unsupported type: " + } \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/common/file_check.py b/debug/accuracy_tools/atat/pytorch/common/file_check.py new file mode 100644 index 0000000000000000000000000000000000000000..3204652583b9bce5ac874b5a178fb83926856660 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/file_check.py @@ -0,0 +1,301 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import os +import re + +from .log import print_error_log, print_warn_log +from .exceptions import FileCheckException +from .utils import Const + + +class FileCheckConst: + """ + Class for file check const + """ + READ_ABLE = "read" + WRITE_ABLE = "write" + READ_WRITE_ABLE = "read and write" + DIRECTORY_LENGTH = 4096 + FILE_NAME_LENGTH = 255 + FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$" + PKL_SUFFIX = ".pkl" + NUMPY_SUFFIX = ".npy" + JSON_SUFFIX = ".json" + PT_SUFFIX = ".pt" + CSV_SUFFIX = ".csv" + YAML_SUFFIX = ".yaml" + MAX_PKL_SIZE = 1 * 1024 * 1024 * 1024 + MAX_NUMPY_SIZE = 10 * 1024 * 1024 * 1024 + MAX_JSON_SIZE = 1 * 1024 * 1024 * 1024 + MAX_PT_SIZE = 10 * 1024 * 1024 * 1024 + MAX_CSV_SIZE = 1 * 1024 * 1024 * 1024 + MAX_YAML_SIZE = 10 * 1024 * 1024 + DIR = "dir" + FILE = "file" + DATA_DIR_AUTHORITY = 0o750 + DATA_FILE_AUTHORITY = 0o640 + FILE_SIZE_DICT = { + PKL_SUFFIX: MAX_PKL_SIZE, + NUMPY_SUFFIX: MAX_NUMPY_SIZE, + JSON_SUFFIX: MAX_JSON_SIZE, + PT_SUFFIX: MAX_PT_SIZE, + CSV_SUFFIX: MAX_CSV_SIZE, + YAML_SUFFIX: MAX_YAML_SIZE + } + + +class FileChecker: + """ + The class for checking files. + + Attributes: + file_path: The file or directory path to be verified. 
+ path_type: file or directory + ability(str): FileCheckConst.WRITE_ABLE or FileCheckConst.READ_ABLE to require that the file is writable or readable + file_type(str): The expected file type of the file + """ + def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True): + self.file_path = file_path + self.path_type = self._check_path_type(path_type) + self.ability = ability + self.file_type = file_type + self.is_script = is_script + + @staticmethod + def _check_path_type(path_type): + if path_type not in [FileCheckConst.DIR, FileCheckConst.FILE]: + print_error_log(f'The path_type must be {FileCheckConst.DIR} or {FileCheckConst.FILE}.') + raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) + return path_type + + def common_check(self): + """ + Purpose: verify basic file permissions: soft link, path length, existence, read/write permission, file owner and special characters. + Note: suffix validity is not a generic check; it can be verified with the dedicated suffix interface. + """ + check_path_exists(self.file_path) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + check_path_type(self.file_path, self.path_type) + self.check_path_ability() + if self.is_script: + check_path_owner_consistent(self.file_path) + check_path_pattern_vaild(self.file_path) + check_common_file_size(self.file_path) + check_file_suffix(self.file_path, self.file_type) + return self.file_path + + def check_path_ability(self): + if self.ability == FileCheckConst.WRITE_ABLE: + check_path_writability(self.file_path) + if self.ability == FileCheckConst.READ_ABLE: + check_path_readability(self.file_path) + if self.ability == FileCheckConst.READ_WRITE_ABLE: + check_path_readability(self.file_path) + check_path_writability(self.file_path) + + +class FileOpen: + """ + The class for opening files in a safe way. + + Attributes: + file_path: The file or directory path to be opened. 
+ mode(str): The file open mode + """ + SUPPORT_READ_MODE = ["r", "rb"] + SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"] + SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"] + + def __init__(self, file_path, mode, encoding='utf-8'): + self.file_path = file_path + self.mode = mode + self.encoding = encoding + self._handle = None + + def __enter__(self): + self.check_file_path() + binary_mode = "b" + if binary_mode not in self.mode: + self._handle = open(self.file_path, self.mode, encoding=self.encoding) + else: + self._handle = open(self.file_path, self.mode) + return self._handle + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._handle: + self._handle.close() + + def check_file_path(self): + support_mode = self.SUPPORT_READ_MODE + self.SUPPORT_WRITE_MODE + self.SUPPORT_READ_WRITE_MODE + if self.mode not in support_mode: + print_error_log("File open not support %s mode" % self.mode) + raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + self.check_ability_and_owner() + check_path_pattern_vaild(self.file_path) + if os.path.exists(self.file_path): + check_common_file_size(self.file_path) + + def check_ability_and_owner(self): + if self.mode in self.SUPPORT_READ_MODE: + check_path_exists(self.file_path) + check_path_readability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_WRITE_MODE and os.path.exists(self.file_path): + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_READ_WRITE_MODE and os.path.exists(self.file_path): + check_path_readability(self.file_path) + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + + +def check_link(path): + abs_path = os.path.abspath(path) + if os.path.islink(abs_path): + print_error_log('The file path {} is a soft link.'.format(path)) + raise FileCheckException(FileCheckException.SOFT_LINK_ERROR) + + +def check_path_length(path, name_length=None): + file_max_name_length = name_length if name_length else FileCheckConst.FILE_NAME_LENGTH + if len(path) > FileCheckConst.DIRECTORY_LENGTH or \ + len(os.path.basename(path)) > file_max_name_length: + print_error_log('The file path length exceeds limit.') + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_path_exists(path): + if not os.path.exists(path): + print_error_log('The file path %s does not exist.' % path) + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_path_readability(path): + if not os.access(path, os.R_OK): + print_error_log('The file path %s is not readable.' % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_writability(path): + if not os.access(path, os.W_OK): + print_error_log('The file path %s is not writable.' % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_executable(path): + if not os.access(path, os.X_OK): + print_error_log('The file path %s is not executable.' % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_other_user_writable(path): + st = os.stat(path) + if st.st_mode & 0o002: + print_error_log('The file path %s may be insecure because other users have write permissions. 
' % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_owner_consistent(path): + file_owner = os.stat(path).st_uid + if file_owner != os.getuid(): + print_error_log('The file path %s may be insecure because it does not belong to you.' % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_pattern_vaild(path): + if not re.match(FileCheckConst.FILE_VALID_PATTERN, path): + print_error_log('The file path {} contains special characters.'.format(path)) + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_file_size(file_path, max_size): + file_size = os.path.getsize(file_path) + if file_size >= max_size: + print_error_log(f'The size of file path {file_path} exceeds {max_size} bytes.') + raise FileCheckException(FileCheckException.FILE_TOO_LARGE_ERROR) + + +def check_common_file_size(file_path): + if os.path.isfile(file_path): + for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items(): + if file_path.endswith(suffix): + check_file_size(file_path, max_size) + break + + +def check_file_suffix(file_path, file_suffix): + if file_suffix: + if not file_path.endswith(file_suffix): + print_error_log(f"The {file_path} should be a {file_suffix} file!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + + +def check_path_type(file_path, file_type): + if file_type == FileCheckConst.FILE: + if not os.path.isfile(file_path): + print_error_log(f"The {file_path} should be a file!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + if file_type == FileCheckConst.DIR: + if not os.path.isdir(file_path): + print_error_log(f"The {file_path} should be a directory!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + + +def create_directory(dir_path): + """ + Function Description: + create a directory with specified permissions + Parameter: + dir_path: directory path + Exception Description: + raise FileCheckException when the path is invalid + """ + dir_path = os.path.realpath(dir_path) + try: + os.makedirs(dir_path, mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + except OSError as ex: + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR, + 'Failed to create {}. Please check the path permission or disk space. {}'.format(dir_path, str(ex))) from ex + + +def check_path_before_create(path): + if path_len_exceeds_limit(path): + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR, 'The file path length exceeds limit.') + + if not re.match(Const.FILE_PATTERN, os.path.realpath(path)): + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR, + 'The file path {} contains special characters.'.format(path)) + + +def change_mode(path, mode): + if not os.path.exists(path) or os.path.islink(path): + return + try: + os.chmod(path, mode) + except PermissionError as ex: + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR, + 'Failed to change {} authority. 
{}'.format(path, str(ex))) from ex + + +def path_len_exceeds_limit(file_path): + return len(os.path.realpath(file_path)) > FileCheckConst.DIRECTORY_LENGTH or \ + len(os.path.basename(file_path)) > FileCheckConst.FILE_NAME_LENGTH diff --git a/debug/accuracy_tools/atat/pytorch/common/log.py b/debug/accuracy_tools/atat/pytorch/common/log.py new file mode 100644 index 0000000000000000000000000000000000000000..fab5aca45c08af7253dedf8ee13db10b271683da --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/log.py @@ -0,0 +1,59 @@ +import os +import time +import sys +from .utils import get_rank_if_initialized + + +def on_rank_0(func): + def func_rank_0(*args, **kwargs): + current_rank = get_rank_if_initialized() + if current_rank is None or current_rank == 0: + return func(*args, **kwargs) + + return func_rank_0 + + +def _print_log(level, msg, end='\n'): + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) + pid = os.getpid() + full_msg = current_time + "(" + str(pid) + ")-[" + level + "]" + msg + current_rank = get_rank_if_initialized() + if current_rank is not None: + full_msg = f"[rank {current_rank}]-" + full_msg + print(full_msg, end=end) + sys.stdout.flush() + + +def print_info_log(info_msg, end='\n'): + """ + Function Description: + print info log. + Parameter: + info_msg: the info message. + """ + _print_log("INFO", info_msg, end=end) + + +def print_error_log(error_msg): + """ + Function Description: + print error log. + Parameter: + error_msg: the error message. + """ + _print_log("ERROR", error_msg) + + +def print_warn_log(warn_msg): + """ + Function Description: + print warn log. + Parameter: + warn_msg: the warning message. + """ + _print_log("WARNING", warn_msg) + + +print_info_log_rank_0 = on_rank_0(print_info_log) +print_warn_log_rank_0 = on_rank_0(print_warn_log) +print_error_log_rank_0 = on_rank_0(print_error_log) diff --git a/debug/accuracy_tools/atat/pytorch/common/parse_json.py b/debug/accuracy_tools/atat/pytorch/common/parse_json.py new file mode 100644 index 0000000000000000000000000000000000000000..dc594c4cf818ed0bedd8d3997e2848d9fe123a17 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/parse_json.py @@ -0,0 +1,37 @@ +import json +from .exceptions import ParseJsonException + + +def parse_json_info_forward_backward(json_path): + def parse_data_name_with_pattern(data_name, pattern): + name_struct = data_name.split('.') + if not name_struct[-1] == pattern: + raise ParseJsonException(ParseJsonException.UnexpectedNameStruct, + f"{data_name} in file {json_path}") + api_name = '.'.join(name_struct[:-1]) + return api_name + + with open(json_path, 'r') as f: + dump_json = json.load(f) + + real_data_path = dump_json.get("dump_data_dir") + dump_data = dump_json.get("data") + if not dump_data: + raise ParseJsonException(ParseJsonException.InvalidDumpJson, "dump数据中没有data字段") + + forward_data = {} + backward_data = {} + for data_name, data_item in dump_data.items(): + if "Module" in data_name: + continue + if "forward" in data_name: + api_name = parse_data_name_with_pattern(data_name, "forward") + forward_data.update({api_name: data_item}) + elif "backward" in data_name: + api_name = parse_data_name_with_pattern(data_name, "backward") + backward_data.update({api_name: data_item}) + else: + raise ParseJsonException(ParseJsonException.UnexpectedNameStruct, + f"{data_name} in file {json_path}.") + + return forward_data, backward_data, real_data_path diff --git a/debug/accuracy_tools/atat/pytorch/common/recursive.py 
b/debug/accuracy_tools/atat/pytorch/common/recursive.py new file mode 100644 index 0000000000000000000000000000000000000000..c8a19a63117d332b138fec4d38d7efa20f7ddebe --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/recursive.py @@ -0,0 +1,28 @@ +import torch +import numpy as np +from .log import print_warn_log + +_recursive_key_stack = [] +special_type = (torch.device, torch.dtype, torch.Size, torch.Tensor, np.integer, np.floating, np.bool_, np.complexfloating, \ + np.str_, np.byte, np.unicode_, bool, int, float, str, slice) +def recursive_apply_transform(args, transform): + global _recursive_key_stack + if isinstance(args, special_type): + arg_transform = transform(args, _recursive_key_stack) + return arg_transform + elif isinstance(args, (list, tuple)): + transform_result = [] + for i, arg in enumerate(args): + _recursive_key_stack.append(str(i)) + transform_result.append(recursive_apply_transform(arg, transform)) + _recursive_key_stack.pop() + return type(args)(transform_result) + elif isinstance(args, dict): + transform_result = {} + for k, arg in args.items(): + _recursive_key_stack.append(str(k)) + transform_result[k] = recursive_apply_transform(arg, transform) + _recursive_key_stack.pop() + return transform_result + elif args is not None: + print_warn_log(f"Data type {type(args)} is not supported.") diff --git a/debug/accuracy_tools/atat/pytorch/common/utils.py b/debug/accuracy_tools/atat/pytorch/common/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e88d506b2c340f9b6141c2e0bb775a693d61a16c --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/utils.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +import os +import re +import random +import stat +import torch +import numpy as np +from functools import wraps +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + + +torch_without_guard_version_list = ['2.1'] +for version in torch_without_guard_version_list: + if torch.__version__.startswith(version): + torch_without_guard_version = True + break + else: + torch_without_guard_version = False + +if not is_gpu and not torch_without_guard_version: + from torch_npu.utils.device_guard import torch_device_guard as torch_npu_device_guard + +npu_distributed_api = ['isend', 'irecv'] + + +def parameter_adapter(func): + + def handle_masked_select(input_tensor, indices): + masked_select_func = getattr(torch._C._VariableFunctionsClass, "masked_select") + if input_tensor.dtype == torch.bfloat16: + # masked_select在NPU上输入数据dtype类型为bfloat16会报错,提示不支持此类型 + return masked_select_func(input_tensor.to(torch.float32), indices).to(torch.bfloat16) + else: + return masked_select_func(input_tensor, indices) + + @wraps(func) + def inner(self, *args, **kwargs): + if self.op_name_ == "__getitem__" and len(args) > 1 and isinstance(args[1], torch.Tensor): + input_tensor = args[0] + indices = args[1] + if indices.dtype == torch.uint8: + indices = indices.bool() + if indices.dtype == torch.bool: + if indices.shape == input_tensor.shape: + return handle_masked_select(input_tensor, indices) + else: + indices = getattr(torch._C._VariableFunctionsClass, "nonzero")(indices, as_tuple=True) + return getattr(torch._C._TensorBase, "__getitem__")(input_tensor, indices) + elif indices.dtype != torch.bool: + if not indices.shape or len(indices.shape) == 1: + return func(self, input_tensor, indices.tolist()) + elif len(indices.shape) == 2: + result = [func(self, input_tensor, index) for index in indices.tolist()] + return getattr(torch._C._VariableFunctionsClass, "stack")(result, 0) + else: + res = [input_tensor[tensor_index] for tensor_index in indices] + return getattr(torch._C._VariableFunctionsClass, "stack")(res, 0) + if self.op_name_ == "__eq__" and args[1] is None: + return False + return func(self, *args, **kwargs) + return inner + + +def torch_device_guard(func): + if is_gpu or torch_without_guard_version: + return func + # Parse args/kwargs matched torch.device objects + + @torch_npu_device_guard + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + return wrapper + + +def get_rank_if_initialized(): + if torch.distributed.is_initialized(): + return torch.distributed.get_rank() + return None + + +def seed_all(seed=1234, mode=False): + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(mode) + if is_gpu: + torch.cuda.manual_seed_all(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.enable = False + torch.backends.cudnn.benchmark = False + else: + torch_npu.npu.manual_seed_all(seed) + torch_npu.npu.manual_seed(seed) + + +class Const: + """ + Class for const + """ + SEP = "." + MODEL_TYPE = ['.onnx', '.pb', '.om'] + DIM_PATTERN = r"^(-?[0-9]+)(,-?[0-9]+)*" + SEMICOLON = ";" + COLON = ":" + EQUAL = "=" + COMMA = "," + DOT = "." 
+ DUMP_RATIO_MAX = 100 + SUMMERY_DATA_NUMS = 256 + FLOAT_EPSILON = np.finfo(float).eps + SUPPORT_DUMP_MODE = ['api', 'acl'] + ON = 'ON' + OFF = 'OFF' + KWARGS = 'kwargs' + INPUT = 'input' + OUTPUT = 'output' + BACKWARD = 'backward' + FORWARD = 'forward' + PRE_FORWARD = "pre_forward" + INPUT_ARGS = 'input_args' + INPUT_KWARGS = 'input_kwargs' + GRAD_INPUT = 'grad_input' + GRAD_OUTPUT = 'grad_output' + START = "start" + STOP = "stop" + + # dump mode + ALL = "all" + LIST = "list" + RANGE = "range" + STACK = "stack" + ACL = "acl" + API_LIST = "api_list" + API_STACK = "api_stack" + DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] + AUTO = "auto" + ONLINE_DUMP_MODE = [ALL, LIST, AUTO, OFF] + SUMMARY = "summary" + MD5 = "md5" + SUMMARY_MODE = [ALL, SUMMARY, MD5] + + WRITE_FLAGS = os.O_WRONLY | os.O_CREAT + OVERWRITE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC + WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR + + PKL_SUFFIX = ".pkl" + NUMPY_SUFFIX = ".npy" + ONE_GB = 1 * 1024 * 1024 * 1024 + TEN_GB = 10 * 1024 * 1024 * 1024 + FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$' + FILE_NAME_LENGTH = 255 + DIRECTORY_LENGTH = 4096 + DISTRIBUTED_PREFIX_LENGTH = 60 + SUMMARY_COLUMN_NUM = 6 + STACK_COLUMN_NUM = 2 + # env dump path + ASCEND_WORK_PATH = "ASCEND_WORK_PATH" + DUMP_DIR = "dump_data" + + ENV_ENABLE = "1" + ENV_DISABLE = "0" + + MAX_SEED_VALUE = 2**32 - 1 + + INPLACE_LIST = ["broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter", + "_reduce_scatter_base", "_all_gather_base", "all_to_all_single"] + + TASK_LIST = ["tensor", "statistics", "overflow_check", "free_benchmark"] + LEVEL_LIST = ["L0", "L1", "L2", "mix"] + STATISTICS = "statistics" + TENSOR = "tensor" + OVERFLOW_CHECK = "overflow_check" + FREE_BENCHMARK = "free_benchmark" \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py b/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..b9869fb58ab6ffb7e3ae94367787aee251ff487d --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py @@ -0,0 +1,1163 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import json +import multiprocessing +import os.path +import stat +import sys +import math +import torch + +import numpy as np +import pandas as pd +import openpyxl +from openpyxl.styles import PatternFill +from collections import namedtuple + +from .match import graph_mapping +from ..advisor.advisor import Advisor +from ...core.utils import check_compare_param, add_time_with_xlsx, CompareException, CompareConst, \ + format_value, check_file_not_exists, check_configuration_param, task_dumppath_get, print_info_log, \ + print_warn_log, print_error_log, Const +from ...core.file_check_util import FileChecker, FileCheckConst, change_mode, FileOpen, create_directory + + +def correct_data(result): + if result == CompareConst.NAN: + return result + if float(result) > 0.99999: + return 1.0 + return result + + +def cosine_similarity(n_value, b_value): + np.seterr(divide='ignore', invalid='ignore') + if len(n_value) == 1: + return "unsupported", "This tensor is scalar." + num = n_value.dot(b_value) + a_norm = np.linalg.norm(n_value) + b_norm = np.linalg.norm(b_value) + message = '' + if a_norm <= Const.FLOAT_EPSILON and b_norm <= Const.FLOAT_EPSILON: + result = 1.0 + elif a_norm <= Const.FLOAT_EPSILON: + message = 'Cannot compare by Cosine Similarity, All the data is Zero in npu dump data.' + result = CompareConst.NAN + elif b_norm <= Const.FLOAT_EPSILON: + message = 'Cannot compare by Cosine Similarity, All the data is Zero in Bench dump data.' + result = CompareConst.NAN + else: + cos = num / (a_norm * b_norm) + if np.isnan(cos): + message = 'Cannot compare by Cosine Similarity, the dump data has NaN.' + result = CompareConst.NAN + else: + result = format_value(cos) + result = correct_data(result) + return result, message + + +def get_rmse(n_value, b_value): + if len(n_value) == 0 and len(b_value) == 0: + rmse = '0' + elif len(n_value) == 0: + rmse = CompareConst.NAN + elif len(b_value) == 0: + rmse = CompareConst.NAN + else: + rmse = np.linalg.norm(n_value - b_value) / np.sqrt(len(n_value)) + if np.isnan(rmse): + rmse = CompareConst.NAN + return rmse, "" + + +def get_mape(n_value, b_value): + if len(n_value) == 0 and len(b_value) == 0: + mape = '0' + elif len(n_value) == 0: + mape = CompareConst.NAN + elif len(b_value) == 0: + mape = CompareConst.NAN + elif not np.all(n_value) and not np.all(b_value): + mape = '0' + elif not np.all(b_value): + mape = CompareConst.NAN + else: + mape_val = np.sum(np.abs((n_value - b_value) / b_value)) / len(b_value) * 100 + mape = CompareConst.NAN if np.isnan(mape_val) else str(round(mape_val, 4)) + '%' + return mape, "" + + +def get_max_abs_err(n_value, b_value): + temp_res = n_value - b_value + max_value = np.max(np.abs(temp_res)) + return format_value(max_value), "" + + +def get_relative_err(n_value, b_value): + np.seterr(divide='ignore', invalid='ignore') + if b_value.dtype in CompareConst.FLOAT_TYPE: + zero_mask = (b_value == 0) + b_value[zero_mask] += np.finfo(b_value.dtype).eps + n_value[zero_mask] += np.finfo(b_value.dtype).eps + else: + n_value, b_value = n_value.astype(float), b_value.astype(float) + zero_mask = (b_value == 0) + b_value[zero_mask] += np.finfo(float).eps + n_value[zero_mask] += np.finfo(float).eps + relative_err = np.divide((n_value - b_value), b_value) + return np.abs(relative_err) + + +def get_max_relative_err(n_value, b_value, input_relative_err=None): + if input_relative_err is None: + relative_err = get_relative_err(n_value, b_value) + else: + relative_err = input_relative_err + max_relative_err = np.max(np.abs(relative_err)) + 
if np.isnan(max_relative_err): + message = 'Cannot compare by MaxRelativeError, the data contains nan in dump data.' + return CompareConst.NAN, message + return format_value(max_relative_err), "" + + +def rel_err_ratio(relative_err, threshold): + return format_value(np.sum(relative_err < threshold) / np.size(relative_err)) + + +def check_graph_mode(a_op_name, b_op_name): + if "Aten" in a_op_name and "Aten" not in b_op_name: + return True + if "Aten" not in a_op_name and "Aten" in b_op_name: + return True + return False + + +def check_op(npu_dict, bench_dict, fuzzy_match): + a_op_name = npu_dict["op_name"] + b_op_name = bench_dict["op_name"] + graph_mode = check_graph_mode(a_op_name[0], b_op_name[0]) + if graph_mode: + return graph_mapping.match(a_op_name[0], b_op_name[0]) + struct_match = check_struct_match(npu_dict, bench_dict) + if not fuzzy_match: + return a_op_name == b_op_name and struct_match + is_match = True + try: + is_match = fuzzy_check_op(a_op_name, b_op_name) + except Exception as err: + print_warn_log("%s and %s can not fuzzy match." % (a_op_name, b_op_name)) + is_match = False + return is_match and struct_match + + +def check_struct_match(npu_dict, bench_dict): + npu_struct_in = npu_dict.get("input_struct") + bench_struct_in = bench_dict.get("input_struct") + npu_struct_out = npu_dict.get("output_struct") + bench_struct_out = bench_dict.get("output_struct") + is_match = npu_struct_in == bench_struct_in and npu_struct_out == bench_struct_out + if not is_match: + if len(npu_struct_in) == 0 or len(bench_struct_in) == 0 or len(npu_struct_in) != len(bench_struct_in): + return False + struct_in_is_match = check_type_shape_match(npu_struct_in, bench_struct_in) + struct_out_is_match = check_type_shape_match(npu_struct_out, bench_struct_out) + is_match = struct_in_is_match and struct_out_is_match + return is_match + + +def check_type_shape_match(npu_struct, bench_struct): + shape_type_match = False + for npu_type_shape, bench_type_shape in zip(npu_struct, bench_struct): + npu_type = npu_type_shape[0] + npu_shape = npu_type_shape[1] + bench_type = bench_type_shape[0] + bench_shape = bench_type_shape[1] + shape_match = npu_shape == bench_shape + type_match = npu_type == bench_type + if not type_match: + if [npu_type, bench_type] in [["torch.float16", "torch.float32"], ["torch.float32", "torch.float16"], + ["torch.float16", "torch.bfloat16"], ["torch.bfloat16", "torch.float16"]]: + type_match = True + else: + type_match = False + shape_type_match = shape_match and type_match + if not shape_type_match: + return False + return shape_type_match + + +def fuzzy_check_op(npu_name_list, bench_name_list): + if len(npu_name_list) == 0 or len(bench_name_list) == 0 or len(npu_name_list) != len(bench_name_list): + return False + is_match = True + for npu_name, bench_name in zip(npu_name_list, bench_name_list): + is_match = fuzzy_check_name(npu_name, bench_name) + if not is_match: + break + return is_match + + +def fuzzy_check_name(npu_name, bench_name): + if "forward" in npu_name and "forward" in bench_name: + is_match = rename_api(npu_name, "forward") == rename_api(bench_name, "forward") + elif "backward" in npu_name and "backward" in bench_name: + is_match = rename_api(npu_name, "backward") == rename_api(bench_name, "backward") + else: + is_match = npu_name == bench_name + return is_match + + +def rename_api(npu_name, process): + npu_split = npu_name.split(process) + torch_func_index, in_out = npu_split[0], npu_split[1] + torch_func_split = torch_func_index.rsplit("_", 2) + torch_func = 
str(torch_func_split[0]) + str(in_out) + return torch_func + + +def merge_tensor(tensor_list, summary_compare, md5_compare): + op_dict = {} + op_dict["op_name"] = [] + op_dict["input_struct"] = [] + op_dict["kwargs_struct"] = [] + op_dict["output_struct"] = [] + op_dict["summary"] = [] + op_dict["stack_info"] = [] + + all_mode_bool = summary_compare == False and md5_compare == False + if all_mode_bool: + op_dict["data_name"] = [] + + for tensor in tensor_list: + if len(tensor) == 2: + op_dict['stack_info'].append(tensor['full_info']) + break + op_dict["op_name"].append(tensor['full_op_name']) + if not md5_compare: + if tensor['full_op_name'].find("input") != -1: + op_dict["input_struct"].append((tensor['dtype'], tensor['shape'])) + elif tensor['full_op_name'].find("kwarg") != -1: + op_dict["kwargs_struct"].append((tensor['dtype'], tensor['shape'])) + elif tensor['full_op_name'].find("output") != -1: + op_dict["output_struct"].append((tensor['dtype'], tensor['shape'])) + else: + if tensor['full_op_name'].find("input") != -1: + op_dict["input_struct"].append((tensor['dtype'], tensor['shape'], tensor['md5'])) + elif tensor['full_op_name'].find("kwarg") != -1: + op_dict["kwargs_struct"].append((tensor['dtype'], tensor['shape'], tensor['md5'])) + elif tensor['full_op_name'].find("output") != -1: + op_dict["output_struct"].append((tensor['dtype'], tensor['shape'], tensor['md5'])) + + op_dict["summary"].append([tensor['Max'], tensor['Min'], tensor['Mean'], tensor['Norm']]) + + if all_mode_bool: + op_dict["data_name"].append(tensor['data_name']) + + if not op_dict["kwargs_struct"]: + del op_dict["kwargs_struct"] + return op_dict if op_dict["op_name"] else {} + + +def match_op(npu_queue, bench_queue, fuzzy_match): + for b_index, b_op in enumerate(bench_queue[0: -1]): + if check_op(npu_queue[-1], b_op, fuzzy_match): + return len(npu_queue) - 1, b_index + if check_op(npu_queue[-1], bench_queue[-1], fuzzy_match): + return len(npu_queue) - 1, len(bench_queue) - 1 + for n_index, n_op in enumerate(npu_queue[0: -1]): + if check_op(n_op, bench_queue[-1], fuzzy_match): + return n_index, len(bench_queue) - 1 + return -1, -1 + + +def get_accuracy(result, n_dict, b_dict, summary_compare=False, md5_compare=False): + def get_accuracy_core(n_start, n_len, b_start, b_len, key): + min_len = min(n_len, b_len) + npu_stack_info = n_dict.get("stack_info", None) + bench_stack_info = b_dict.get("stack_info", None) + has_stack = npu_stack_info and bench_stack_info + + all_mode_bool = summary_compare == False and md5_compare == False + if all_mode_bool: + npu_data_name = n_dict.get("data_name", None) + bench_data_name = b_dict.get("data_name", None) + + for index in range(min_len): + + n_name = n_dict['op_name'][n_start + index] + b_name = b_dict['op_name'][b_start + index] + n_struct = n_dict[key][index] + b_struct = b_dict[key][index] + err_msg = "" + if md5_compare: + result_item = [n_name, b_name, n_struct[0], b_struct[0], n_struct[1], b_struct[1], + n_struct[2], b_struct[2], CompareConst.PASS if n_struct[2] == b_struct[2] else CompareConst.DIFF] + if has_stack and index == 0 and key == "input_struct": + result_item.extend(npu_stack_info) + else: + result_item.append(CompareConst.NONE) + result.append(result_item) + continue + + if summary_compare: + result_item = [n_name, b_name, n_struct[0], b_struct[0], n_struct[1], b_struct[1], + " ", " ", " ", " ", " ", " ", " ", " "] + else: + result_item = [n_name, b_name, n_struct[0], b_struct[0], n_struct[1], b_struct[1], + " ", " ", " ", " ", " "] + + npu_summary_data = 
n_dict.get("summary")[n_start + index] + result_item.extend(npu_summary_data) + bench_summary_data = b_dict.get("summary")[b_start + index] + result_item.extend(bench_summary_data) + + if summary_compare: + start_idx = CompareConst.SUMMARY_COMPARE_RESULT_HEADER.index(CompareConst.MAX_DIFF) + warning_flag = False + for i, (npu_val, bench_val) in enumerate(zip(npu_summary_data, bench_summary_data)): + if isinstance(npu_val, (float, int)) and isinstance(bench_val, (float, int)): + diff = npu_val - bench_val + if bench_val != 0: + relative = str(abs((diff/bench_val) * 100)) + '%' + else: + relative = "N/A" + result_item[start_idx + i] = diff + result_item[start_idx + i + 4] = relative + magnitude_diff = abs(diff) / (max(abs(npu_val), abs(bench_val)) + 1e-10) + if magnitude_diff > 0.5: + warning_flag = True + else: + result_item[start_idx + i] = CompareConst.NONE + accuracy_check = CompareConst.WARNING if warning_flag else "" + err_msg += "Need double check api accuracy." if warning_flag else "" + result_item[start_idx:] = [f'{str(x)}\t' if str(x) in ('inf', '-inf', 'nan') else x for x in result_item[start_idx:]] + + result_item.append(accuracy_check if summary_compare else CompareConst.ACCURACY_CHECK_YES) + result_item.append(err_msg) + if has_stack and index == 0 and key == "input_struct": + result_item.extend(npu_stack_info) + else: + result_item.append(CompareConst.NONE) + if all_mode_bool: + result_item.append(npu_data_name[n_start + index]) + + result.append(result_item) + + if n_len > b_len: + for index in range(b_len, n_len): + n_name = n_dict['op_name'][n_start + index] + n_struct = n_dict[key][index] + if md5_compare: + result_item = [n_name, CompareConst.NAN, n_struct[0], CompareConst.NAN, + n_struct[1], CompareConst.NAN, n_struct[2], CompareConst.NAN, CompareConst.NAN] + result.append(result_item) + continue + result_item = [n_name, CompareConst.NAN, n_struct[0], CompareConst.NAN, + n_struct[1], CompareConst.NAN, " ", " ", " ", " ", " "] + summary_data = n_dict.get("summary")[n_start + index] + result_item.extend(summary_data) + summary_data = [CompareConst.NAN for _ in range(len(n_dict.get("summary")[0]))] + result_item.extend(summary_data) + + err_msg = "" + result_item.append(CompareConst.ACCURACY_CHECK_YES) + result_item.append(err_msg) + + if has_stack and index == 0 and key == "input_struct": + result_item.extend(npu_stack_info) + else: + result_item.append(CompareConst.NONE) + if all_mode_bool: + result_item.append(npu_data_name[n_start + index]) + + result.append(result_item) + + n_num = len(n_dict['op_name']) + b_num = len(b_dict['op_name']) + n_num_input = len([name for name in n_dict['op_name'] if 'input' in name]) + b_num_input = len([name for name in b_dict['op_name'] if 'input' in name]) + n_num_kwarg = len([name for name in n_dict['op_name'] if 'kwarg' in name]) + b_num_kwarg = len([name for name in b_dict['op_name'] if 'kwarg' in name]) + n_num_output = n_num - n_num_input - n_num_kwarg + b_num_output = b_num - b_num_input - b_num_kwarg + get_accuracy_core(0, n_num_input, 0, b_num_input, 'input_struct') + get_accuracy_core(n_num_input, n_num_kwarg, b_num_input, b_num_kwarg, "kwargs_struct") + get_accuracy_core(n_num_input + n_num_kwarg, n_num_output, b_num_input + b_num_kwarg, b_num_output, 'output_struct') + + +def _do_multi_process(input_parma, result_df): + try: + result_df = _handle_multi_process(compare_ops, input_parma, result_df, multiprocessing.Manager().RLock()) + return result_df + except ValueError as e: + print_error_log('result dataframe is not found.') 
+ raise CompareException(CompareException.INVALID_DATA_ERROR) from e + + +def read_dump_data(result_df): + try: + npu_dump_name_list = result_df.iloc[0:, 0].tolist() + npu_dump_tensor_list = result_df.iloc[0:, -1].tolist() + # bench_dump_name_list = csv_pd.iloc[0:, 1].tolist() + op_name_mapping_dict = {} + for index, _ in enumerate(npu_dump_name_list): + npu_dump_name = npu_dump_name_list[index] + npu_dump_tensor = npu_dump_tensor_list[index] + # bench_dump_name = bench_dump_name_list[index] + op_name_mapping_dict[npu_dump_name] = [npu_dump_tensor, npu_dump_tensor] + return op_name_mapping_dict + except ValueError as e: + print_error_log('result dataframe is not found.') + raise CompareException(CompareException.INVALID_DATA_ERROR) from e + except IndexError as e: + print_error_log('result dataframe elements can not be access.') + raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e + + +def _handle_multi_process(func, input_parma, result_df, lock): + process_num = int((multiprocessing.cpu_count() + 1) / 2) + op_name_mapping_dict = read_dump_data(result_df) + + df_chunk_size = len(result_df) // process_num + if df_chunk_size > 0: + df_chunks = [result_df.iloc[i:i + df_chunk_size] for i in range(0, len(result_df), df_chunk_size)] + else: + df_chunks = [result_df] + + results = [] + pool = multiprocessing.Pool(process_num) + + def err_call(args): + print_error_log('multiprocess compare failed! Reason: {}'.format(args)) + try: + pool.terminate() + except OSError as e: + print_error_log("pool terminate failed") + + for process_idx, df_chunk in enumerate(df_chunks): + idx = df_chunk_size * process_idx + result = pool.apply_async(func, + args=(idx, op_name_mapping_dict, df_chunk, lock, input_parma), + error_callback=err_call) + results.append(result) + final_results = [r.get() for r in results] + pool.close() + pool.join() + return pd.concat(final_results, ignore_index=True) + + +def compare_ops(idx, dump_path_dict, result_df, lock, input_parma): + cos_result = [] + max_err_result = [] + max_relative_err_result = [] + err_mess = [] + one_thousand_err_ratio_result = [] + five_thousand_err_ratio_result = [] + is_print_compare_log = input_parma.get("is_print_compare_log") + for i in range(len(result_df)): + op_name = result_df.iloc[i, 0] + if is_print_compare_log: + print("start compare: {}".format(op_name)) + cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio = compare_by_op(op_name, dump_path_dict, input_parma) + if is_print_compare_log: + print("[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, one_thousand_err_ratio {}, five_thousand_err_ratio {}".format(op_name, cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio)) + cos_result.append(cos_sim) + max_err_result.append(max_abs_err) + max_relative_err_result.append(max_relative_err) + err_mess.append(err_msg) + one_thousand_err_ratio_result.append(one_thousand_err_ratio) + five_thousand_err_ratio_result.append(five_thousand_err_ratio) + result_df = _save_cmp_result(idx, cos_result, max_err_result, max_relative_err_result, err_mess, one_thousand_err_ratio_result, + five_thousand_err_ratio_result, result_df, lock) + return result_df + + +def _save_cmp_result(idx, cos_result, max_err_result, max_relative_err_result, err_msg, one_thousand_err_ratio_result, five_thousand_err_ratio_result, result_df, lock): + lock.acquire() + try: + for i, _ in enumerate(cos_result): + process_index = i + idx + 
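# idx is this chunk's offset into the full dataframe, so i + idx addresses the global row. +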
result_df.loc[process_index, CompareConst.COSINE] = cos_result[i] + result_df.loc[process_index, CompareConst.MAX_ABS_ERR] = max_err_result[i] + result_df.loc[process_index, CompareConst.MAX_RELATIVE_ERR] = max_relative_err_result[i] + result_df.loc[process_index, CompareConst.ERROR_MESSAGE] = err_msg[i] + result_df.loc[process_index, CompareConst.ACCURACY] = check_accuracy(cos_result[i], max_err_result[i]) + result_df.loc[process_index, CompareConst.ONE_THOUSANDTH_ERR_RATIO] = one_thousand_err_ratio_result[i] + result_df.loc[process_index, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] = five_thousand_err_ratio_result[i] + return result_df + except ValueError as e: + print_error_log('result dataframe is not found.') + raise CompareException(CompareException.INVALID_DATA_ERROR) from e + except IndexError as e: + print_error_log('result dataframe elements can not be access.') + raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e + finally: + lock.release() + + +def check_accuracy(cos, max_abs_err): + if cos == CompareConst.SHAPE_UNMATCH: + return CompareConst.ACCURACY_CHECK_UNMATCH + if cos == CompareConst.NONE or max_abs_err == CompareConst.NONE: + return CompareConst.NONE + if cos == "N/A" or max_abs_err == "N/A": + return CompareConst.ACCURACY_CHECK_NO + try: + cos, max_abs_err = float(cos), float(max_abs_err) + except ValueError: + print_warn_log("Cosine or MaxAbsErr can not get float value.") + return CompareConst.NONE + if cos < CompareConst.COS_THRESHOLD and max_abs_err > CompareConst.MAX_ABS_ERR_THRESHOLD: + return CompareConst.ACCURACY_CHECK_NO + if cos < CompareConst.COS_MAX_THRESHOLD or max_abs_err > CompareConst.MAX_ABS_ERR_MAX_THRESHOLD: + return CompareConst.ACCURACY_CHECK_NO + return CompareConst.ACCURACY_CHECK_YES + + +def compare_by_op(op_name, op_name_mapping_dict, input_parma): + npu_bench_name_list = op_name_mapping_dict[op_name] + data_name = npu_bench_name_list[1] + if data_name == '-1' or data_name == -1: + return CompareConst.NONE, CompareConst.NONE, CompareConst.NONE, CompareConst.NO_BENCH, CompareConst.NONE, CompareConst.NONE + try: + n_path = os.path.join(input_parma.get("npu_dump_data_dir"), npu_bench_name_list[0]) + b_path = os.path.join(input_parma.get("bench_dump_data_dir"), npu_bench_name_list[1]) + n_path_checker = FileChecker(n_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + b_path_checker = FileChecker(b_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + n_path = n_path_checker.common_check() + b_path = b_path_checker.common_check() + n_value = torch.load(n_path, map_location=torch.device('cpu')).detach().numpy() + b_value = torch.load(b_path, map_location=torch.device('cpu')).detach().numpy() + except IOError as error: + return CompareConst.NAN, CompareConst.NAN, CompareConst.NAN, "Dump file: {} not found.".format(error.filename), CompareConst.NAN, CompareConst.NAN + relative_err = get_relative_err(n_value, b_value) + if len(n_value.shape) == 0: + if n_value.dtype == bool: + n_value = n_value.astype(float) + b_value = b_value.astype(float) + max_abs_err, _ = get_max_abs_err(n_value, b_value) + max_relative_err, _ = get_max_relative_err(n_value, b_value, input_relative_err=relative_err) + return "unsupported", max_abs_err, max_relative_err, "This is type of scalar data, can not compare.", CompareConst.NAN, CompareConst.NAN + if n_value.size == 0: + return "unsupported", 0, 0, "This is empty data, can not compare.", 0, 0 + if n_value.shape != b_value.shape: + 
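# Shape mismatch: skip the numeric metrics and mark every column as unmatched. +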
return CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH, "Shape of NPU and bench Tensor do not match. Skipped.", CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH + if n_value.dtype != b_value.dtype: + print_warn_log("Dtype of NPU and bench Tensor do not match: {}".format(op_name)) + err_msg = " Dtype of NPU and bench Tensor do not match." + else: + err_msg = "" + + n_value, b_value = handle_inf_nan(n_value, b_value) + if n_value is CompareConst.NAN or b_value is CompareConst.NAN: + return "N/A", "N/A", "N/A", "The position of inf or nan in NPU and bench Tensor do not match.", "N/A", "N/A" + + n_value = n_value.reshape(-1).astype(float) + b_value = b_value.reshape(-1).astype(float) + err_msg = "" + cos_sim, message = cosine_similarity(n_value, b_value) + + abs_err = np.abs(n_value - b_value) + max_abs_err = format_value(np.max(abs_err)) + max_relative_err, message = get_max_relative_err(n_value, b_value, input_relative_err=relative_err) + one_thousand_err_ratio = rel_err_ratio(relative_err, 0.001) + five_thousand_err_ratio = rel_err_ratio(relative_err, 0.005) + + if not err_msg: + err_msg += message + else: + err_msg = err_msg + ' ' + message + + if npu_bench_name_list[0] != npu_bench_name_list[1]: + err_msg += " Fuzzy matching data, the comparison accuracy may be affected." + return cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio + + +def handle_inf_nan(n_value, b_value): + n_inf = np.isinf(n_value) + b_inf = np.isinf(b_value) + n_nan = np.isnan(n_value) + b_nan = np.isnan(b_value) + if np.any(n_inf) or np.any(b_inf) or np.any(n_nan) or np.any(b_nan): + if np.array_equal(n_inf, b_inf) and np.array_equal(n_nan, b_nan): + n_value[n_inf] = 0 + b_value[b_inf] = 0 + n_value[n_nan] = 0 + b_value[b_nan] = 0 + else: + return CompareConst.NAN, CompareConst.NAN + return n_value, b_value + + +def check_order_magnitude(info, color_columns, summary_compare=True): + """Check if order magnitude diff of max_diff larger than 1""" + api_in, api_out, num = info + max_diff_index = get_header_index('Max diff' if summary_compare else 'MaxAbsErr', summary_compare) + if api_in[max_diff_index] > api_out[max_diff_index]: + return + in_order = 0 if api_in[max_diff_index] == 0 else math.log10(abs(api_in[max_diff_index])) + out_order = 0 if api_out[max_diff_index] == 0 else math.log10(abs(api_out[max_diff_index])) + if abs(in_order - out_order) >= CompareConst.ORDER_MAGNITUDE_DIFF_YELLOW: + color_columns.yellow.append(num) + + +def check_one_thousand_error_ratio(info, color_columns, summary_compare=True): + """Compare output's one thousand error ratio with input's """ + api_in, api_out, num = info + one_thousand_index = get_header_index('One Thousandth Err Ratio', summary_compare) + if not isinstance(api_in[one_thousand_index], (float, int)) or not isinstance(api_out[one_thousand_index], (float, int)): + return + if api_in[one_thousand_index] > CompareConst.ONE_THOUSAND_ERROR_IN_RED and api_out[one_thousand_index] < CompareConst.ONE_THOUSAND_ERROR_OUT_RED: + color_columns.red.append(num) + elif api_in[one_thousand_index] - api_out[one_thousand_index] > CompareConst.ONE_THOUSAND_ERROR_DIFF_YELLOW: + color_columns.yellow.append(num) + + +def check_cosine_similarity(info, color_columns, summary_compare=True): + """Check if output's cosine similarity more than 0.1 smaller than input's""" + api_in, api_out, num = info + cosine_index = get_header_index('Cosine', summary_compare) + if not isinstance(api_in[cosine_index], (float, int)) or 
+
+
+def check_order_magnitude(info, color_columns, summary_compare=True):
+    """Mark a row yellow when the order of magnitude of Max diff grows from input to output by the yellow threshold or more."""
+    api_in, api_out, num = info
+    max_diff_index = get_header_index('Max diff' if summary_compare else 'MaxAbsErr', summary_compare)
+    if api_in[max_diff_index] > api_out[max_diff_index]:
+        return
+    in_order = 0 if api_in[max_diff_index] == 0 else math.log10(abs(api_in[max_diff_index]))
+    out_order = 0 if api_out[max_diff_index] == 0 else math.log10(abs(api_out[max_diff_index]))
+    if abs(in_order - out_order) >= CompareConst.ORDER_MAGNITUDE_DIFF_YELLOW:
+        color_columns.yellow.append(num)
+
+
+def check_one_thousand_error_ratio(info, color_columns, summary_compare=True):
+    """Compare the output's one-thousandth error ratio with the input's."""
+    api_in, api_out, num = info
+    one_thousand_index = get_header_index('One Thousandth Err Ratio', summary_compare)
+    if not isinstance(api_in[one_thousand_index], (float, int)) or not isinstance(api_out[one_thousand_index], (float, int)):
+        return
+    if api_in[one_thousand_index] > CompareConst.ONE_THOUSAND_ERROR_IN_RED and api_out[one_thousand_index] < CompareConst.ONE_THOUSAND_ERROR_OUT_RED:
+        color_columns.red.append(num)
+    elif api_in[one_thousand_index] - api_out[one_thousand_index] > CompareConst.ONE_THOUSAND_ERROR_DIFF_YELLOW:
+        color_columns.yellow.append(num)
+
+
+def check_cosine_similarity(info, color_columns, summary_compare=True):
+    """Mark a row yellow when the output's cosine similarity drops below the input's by more than the yellow threshold."""
+    api_in, api_out, num = info
+    cosine_index = get_header_index('Cosine', summary_compare)
+    if not isinstance(api_in[cosine_index], (float, int)) or not isinstance(api_out[cosine_index], (float, int)):
+        return
+    if api_in[cosine_index] - api_out[cosine_index] > CompareConst.COSINE_DIFF_YELLOW:
+        color_columns.yellow.append(num)
+
+
+def check_max_relative_diff(info, color_columns, summary_compare=True):
+    """Compare the output's max diff / bench max ratio with the input's."""
+    api_in, api_out, num = info
+    max_diff_index = get_header_index('Max diff', summary_compare)
+    bench_max_index = get_header_index('Bench max', summary_compare)
+    input_max_relative_diff = np.abs(np.divide(api_in[max_diff_index], max(0.01, api_in[bench_max_index])))
+    output_max_relative_diff = np.abs(np.divide(api_out[max_diff_index], max(0.01, api_out[bench_max_index])))
+    if not isinstance(input_max_relative_diff, (float, int)) or not isinstance(output_max_relative_diff, (float, int)):
+        return
+    if output_max_relative_diff > CompareConst.MAX_RELATIVE_OUT_RED:
+        color_columns.red.append(num)
+    elif output_max_relative_diff > CompareConst.MAX_RELATIVE_OUT_YELLOW and input_max_relative_diff < CompareConst.MAX_RELATIVE_IN_YELLOW:
+        color_columns.yellow.append(num)
+
+
+def check_overflow(info, color_columns, summary_compare=False):
+    """Mark a row red when NPU max/min contains inf/nan or when Max diff exceeds the red threshold."""
+    line, num = info
+    npu_max_index = get_header_index('NPU max', summary_compare)
+    npu_min_index = get_header_index('NPU min', summary_compare)
+    max_diff_index = get_header_index('Max diff' if summary_compare else 'MaxAbsErr', summary_compare)
+    if str(line[npu_max_index]) in CompareConst.OVERFLOW_LIST or str(line[npu_min_index]) in CompareConst.OVERFLOW_LIST:
+        color_columns.red.append(num)
+        return
+    # red when Max diff exceeds CompareConst.MAX_DIFF_RED (a very large number such as 1e+10)
+    if isinstance(line[max_diff_index], (float, int)) and line[max_diff_index] > CompareConst.MAX_DIFF_RED:
+        color_columns.red.append(num)
+
+
+def get_header_index(header_name, summary_compare=False):
+    if summary_compare:
+        header = CompareConst.SUMMARY_COMPARE_RESULT_HEADER[:]
+    else:
+        header = CompareConst.COMPARE_RESULT_HEADER[:]
+    if header_name not in header:
+        print_error_log(f"{header_name} not found in the compare result header")
+        raise CompareException(CompareException.INVALID_PARAM_ERROR)
+    return header.index(header_name)
+
+
+class HighlightRules:
+    """Registry of rules that decide whether an API result row should be highlighted."""
+    # basic rules: applied to every result line to catch obvious errors such as overflow
+    basic_rules = {
+        "check_overflow": check_overflow
+    }
+
+    # comparison rules: applied to (input, output) pairs to check whether the output degrades relative to the input
+    compare_rules = {
+        "check_order_magnitude": check_order_magnitude,
+        "check_one_thousand_error": check_one_thousand_error_ratio,
+        "check_cosine_similarity": check_cosine_similarity
+    }
+    summary_compare_rules = {
+        "check_order_magnitude": check_order_magnitude,
+        "check_max_relative_diff": check_max_relative_diff,
+    }
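+
+
+# Note (illustrative): find_error_rows below dispatches through these rule
+# tables, e.g. for a detail compare it runs
+#     for rule in HighlightRules.compare_rules.values():
+#         rule(api_info, color_columns, summary_compare)
+# so adding a new highlight check only requires registering one more function.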
+
+
+def find_error_rows(result, last_len, n_num_input, highlight_dict, summary_compare=False):
+    """Scan the grouped API rows and record the row numbers to highlight red or yellow in highlight_dict."""
+    npu_max_index = get_header_index('NPU max', summary_compare)
+    bench_max_index = get_header_index('Bench max', summary_compare)
+    max_diff_index = get_header_index('Max diff' if summary_compare else 'MaxAbsErr', summary_compare)
+
+    red_lines, yellow_lines = [], []  # row numbers to highlight red or yellow
+    LineInfo = namedtuple('LineInfo', ['line_data', 'num_pointer'])
+    ApiInfo = namedtuple('ApiInfo', ['api_input', 'api_output', 'num_pointer'])
+    ColorColumns = namedtuple('ColorColumns', ['red', 'yellow'])
+    color_columns = ColorColumns(red=red_lines, yellow=yellow_lines)
+
+    for i, line in enumerate(result):
+        num = last_len + i
+        line_info = LineInfo(line_data=line, num_pointer=num)
+        for rule in HighlightRules.basic_rules.values():
+            rule(line_info, color_columns, summary_compare)
+
+    for n, api_out in enumerate(result[n_num_input:len(result)]):
+        num = last_len + n_num_input + n
+        if num in red_lines:
+            continue
+        if not isinstance(api_out[npu_max_index], (float, int)) \
+                or not isinstance(api_out[bench_max_index], (float, int)) \
+                or not isinstance(api_out[max_diff_index], (float, int)):
+            continue
+        for m, api_in in enumerate(result[0:n_num_input]):
+            if not isinstance(api_in[npu_max_index], (float, int)) \
+                    or not isinstance(api_in[bench_max_index], (float, int)) \
+                    or not isinstance(api_in[max_diff_index], (float, int)):
+                continue
+
+            api_info = ApiInfo(api_input=api_in, api_output=api_out, num_pointer=num)
+            if summary_compare:
+                for rule in HighlightRules.summary_compare_rules.values():
+                    rule(api_info, color_columns, summary_compare)
+            else:
+                for rule in HighlightRules.compare_rules.values():
+                    rule(api_info, color_columns, summary_compare)
+
+    highlight_dict.get('red_rows', []).extend(list(set(red_lines)))
+    highlight_dict.get('yellow_rows', []).extend(list(set(yellow_lines) - set(red_lines)))
+
+
+def get_name_and_state(name):
+    """Return the api/module name prefix and whether the row is an input or an output."""
+    if "input" in name:
+        api_name = name.split("input")[0]
+        state = "input"
+    else:
+        api_name = name.split("output")[0]
+        state = "output"
+    return api_name, state
+
+
+def find_compare_result_error_rows(result_df, highlight_dict, summary_compare):
+    """Group each API with its input and output rows, then flag error rows with find_error_rows."""
+    result = result_df.values
+    start, input_num, output_num, end = 0, 0, 0, len(result_df)
+    last_api_name, last_state = None, None
+    api_name, state = None, None  # guard against an empty result_df
+    num, last_len = 0, 0
+    for res_i in result:
+        api_name, state = get_name_and_state(res_i[0])
+        if last_api_name:
+            if api_name == last_api_name:
+                if state == last_state:
+                    num += 1
+                else:
+                    input_num = num
+                    num, last_state = 1, state
+            else:
+                output_num = num
+                find_error_rows(result[start:start + input_num + output_num], start, input_num, highlight_dict, summary_compare)
+                num, last_api_name, last_state = 1, api_name, state
+                start += input_num + output_num
+                input_num, output_num = 1, 0
+        else:
+            num, last_api_name, last_state = 1, api_name, state
+    if state:
+        if state == "input":
+            input_num = num
+        else:
+            output_num = num
+        find_error_rows(result[start:start + input_num + output_num], start, input_num, highlight_dict, summary_compare)
+
+
+def highlight_rows_xlsx(result_df, highlight_dict, file_path):
+    """Write the compare result to an Excel file and highlight the flagged rows."""
+    print_info_log('Compare result is %s' % file_path)
+
+    wb = openpyxl.Workbook()
+    ws = wb.active
+
+    # write header
+    for j, col_name in enumerate(result_df.columns, start=1):
+        ws.cell(row=1, column=j, value=col_name)
+
+    for i, row in enumerate(result_df.iterrows(), start=2):
+        for j, value in enumerate(row[1], start=1):
+            # append a tab to inf/nan strings so Excel keeps them as text instead of coercing them to numbers
+            if not isinstance(value, (float, int)):
+                value = f'{str(value)}\t' if str(value) in ('inf', '-inf', 'nan') else str(value)
+            ws.cell(row=i, column=j, value=f'{str(value)}\t' if str(value) in ('inf', '-inf', 'nan') else value)
+
+            if (i - 2) in highlight_dict['red_rows']:
+                ws.cell(row=i, column=j).fill = PatternFill(start_color=CompareConst.RED,
+                                                            end_color=CompareConst.RED, fill_type="solid")
+            elif (i - 2) in highlight_dict['yellow_rows']:
+                ws.cell(row=i, column=j).fill = 
PatternFill(start_color=CompareConst.YELLOW,
+                                                            end_color=CompareConst.YELLOW, fill_type="solid")
+    wb.save(file_path)
+    change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
+
+
+def compare(input_parma, output_path, stack_mode=False, auto_analyze=True,
+            fuzzy_match=False):
+    try:
+        summary_compare, md5_compare = task_dumppath_get(input_parma)
+        check_configuration_param(stack_mode, auto_analyze, fuzzy_match)
+        create_directory(output_path)
+        check_compare_param(input_parma, output_path, stack_mode, summary_compare, md5_compare)
+    except CompareException as error:
+        print_error_log('Compare failed. Please check the arguments and do it again!')
+        sys.exit(error.code)
+    compare_core(input_parma, output_path, stack_mode=stack_mode,
+                 auto_analyze=auto_analyze, fuzzy_match=fuzzy_match, summary_compare=summary_compare,
+                 md5_compare=md5_compare)
+
+
+def compare_core(input_parma, output_path, stack_mode=False, auto_analyze=True,
+                 suffix='', fuzzy_match=False, summary_compare=False, md5_compare=False):
+    print_info_log("Please check whether the input data belongs to you. If not, there may be security risks.")
+    file_name = add_time_with_xlsx("compare_result" + suffix)
+    file_path = os.path.join(os.path.realpath(output_path), file_name)
+    check_file_not_exists(file_path)
+    highlight_dict = {'red_rows': [], 'yellow_rows': []}
+
+    with FileOpen(input_parma.get("npu_json_path"), "r") as npu_json, \
+            FileOpen(input_parma.get("bench_json_path"), "r") as bench_json, \
+            FileOpen(input_parma.get("stack_json_path"), "r") as stack_json:
+        result_df = compare_process([npu_json, bench_json, stack_json], stack_mode, fuzzy_match, highlight_dict,
+                                    summary_compare, md5_compare)
+
+    if not md5_compare and not summary_compare:
+        result_df = _do_multi_process(input_parma, result_df)
+    find_compare_result_error_rows(result_df, highlight_dict, summary_compare)
+    highlight_rows_xlsx(result_df, highlight_dict, file_path)
+    if auto_analyze:
+        advisor = Advisor(result_df, output_path)
+        advisor.analysis()
+
+
+def parse(pkl_file, module_name_prefix):
+    if not isinstance(module_name_prefix, str):
+        print_error_log("The parameter module_name_prefix is not a string.")
+        raise CompareException(CompareException.INVALID_PARAM_ERROR)
+    with FileOpen(pkl_file, "r") as f:
+        done = False
+        title_printed = False
+        while not done:
+            pkl_line = f.readline()
+            if pkl_line == '\n':
+                continue
+            if len(pkl_line) == 0:
+                done = True
+                break
+
+            msg = json.loads(pkl_line)
+            info_prefix = msg[0]
+            if not info_prefix.startswith(module_name_prefix):
+                continue
+
+            if info_prefix.find("stack_info") != -1:
+                print("\nTrace back({}):".format(msg[0]))
+                for item in reversed(msg[1]):
+                    print(" File \"{}\", line {}, in {}".format(item[0], item[1], item[2]))
+                    print("   {}".format(item[3]))
+                continue
+            if len(msg) > 5:
+                summary_info = " [{}][dtype: {}][shape: {}][max: {}][min: {}][mean: {}]" \
+                    .format(msg[0], msg[3], msg[4], msg[5][0], msg[5][1], msg[5][2])
+                if not title_printed:
+                    print("\nStatistic Info:")
+                    title_printed = True
+                print(summary_info)
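+
+
+# Illustrative usage of parse() (a sketch; the pkl path and api name prefix are
+# hypothetical and depend on your own dump output):
+#     parse("./npu_dump/api_stack_dump.pkl", "Functional_conv2d_1_forward")
+# This prints the dumped call stack ("stack_info" records) and a statistics line
+# for every record whose name starts with the given prefix.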
+
+
+def op_item_parse(item, op_name, index, item_list=None, top_bool=True):
+    # use a None default instead of a shared mutable list so results do not leak between calls
+    if item_list is None:
+        item_list = []
+    if item is None or (isinstance(item, dict) and len(item) == 0):
+        if not top_bool:
+            tmp = {'full_op_name': op_name + '.' + str(index), 'Max': None, 'Min': None, 'Mean': None, 'Norm': None,
+                   'dtype': None, 'shape': None, 'md5': None, 'data_name': '-1'}
+        else:
+            tmp = {'full_op_name': op_name + '.0', 'Max': None, 'Min': None, 'Mean': None, 'Norm': None,
+                   'dtype': None, 'shape': None, 'md5': None, 'data_name': '-1'}
+        item_list.append(tmp)
+        return item_list
+    if index is None:
+        if isinstance(item, dict):
+            full_op_name = op_name + '.0'
+        else:
+            full_op_name = op_name
+    else:
+        full_op_name = op_name + '.' + str(index)
+    if isinstance(item, dict):
+        if 'dtype' in item:
+            parsed_item = item
+            parsed_item['full_op_name'] = full_op_name
+            item_list.append(parsed_item)
+        elif 'type' in item:
+            parsed_item = {}
+            if item['type'] == 'torch.Size':
+                parsed_item['full_op_name'] = full_op_name
+                parsed_item['dtype'] = 'torch.Size'
+                parsed_item['shape'] = str(item['value'])
+                parsed_item['md5'] = None
+                parsed_item['Max'] = None
+                parsed_item['Min'] = None
+                parsed_item['Mean'] = None
+                parsed_item['Norm'] = None
+                parsed_item['data_name'] = '-1'
+                item_list.append(parsed_item)
+            elif item['type'] == 'slice':
+                parsed_item['full_op_name'] = full_op_name
+                parsed_item['dtype'] = 'slice'
+                parsed_item['shape'] = str(np.shape(np.array(item['value'])))
+                parsed_item['md5'] = None
+                parsed_item['Max'] = None
+                parsed_item['Min'] = None
+                parsed_item['Mean'] = None
+                parsed_item['Norm'] = None
+                parsed_item['data_name'] = '-1'
+                item_list.append(parsed_item)
+            else:
+                parsed_item['full_op_name'] = full_op_name
+                parsed_item['dtype'] = str(type(item['value']))
+                parsed_item['shape'] = '[]'
+                parsed_item['md5'] = None
+                parsed_item['Max'] = item['value']
+                parsed_item['Min'] = item['value']
+                parsed_item['Mean'] = item['value']
+                parsed_item['Norm'] = item['value']
+                parsed_item['data_name'] = '-1'
+                item_list.append(parsed_item)
+        else:
+            resolve_api_special_parameters(item, full_op_name, item_list)
+    else:
+        for j in range(len(item)):
+            # pass item_list explicitly so the recursion accumulates into the caller's list
+            op_item_parse(item[j], full_op_name, j, item_list=item_list, top_bool=False)
+    return item_list
+
+
+def resolve_api_special_parameters(data_dict, full_op_name, item_list):
+    """
+    Function Description:
+        Parse data in the following format, a special format of api parameters:
+        {
+            "last_hidden_state": {
+                "type": "torch.Tensor",
+                "dtype": "torch.bfloat16",
+                ...
+            },
+            "loss": {
+                "type": "torch.Tensor",
+                "dtype": "torch.float32",
+                ...
+            }
+        }
+    Parameter:
+        data_dict: data in dict format
+        full_op_name: full name string of the parameter
+        item_list: collection of parsed parameter info
+    """
+    for key, value in data_dict.items():
+        if isinstance(value, dict):
+            parsed_item = value
+            parts = full_op_name.split(".")
+            parts.insert(-1, key)
+            full_op_name_new = ".".join(parts)
+            parsed_item['full_op_name'] = full_op_name_new
+            item_list.append(parsed_item)
+
+
+def read_op(op_data, op_name):
+    op_parsed_list = []
+    if 'forward' in op_name:
+        if 'input_args' in op_data:
+            input_item = op_data['input_args']
+            input_parsed_list = op_item_parse(input_item, op_name + '_input', None)
+            op_parsed_list = input_parsed_list.copy()
+            input_parsed_list.clear()
+        if 'input_kwargs' in op_data:
+            kwargs_item = op_data['input_kwargs']
+            if (isinstance(kwargs_item, dict) and "type" in kwargs_item) or isinstance(kwargs_item, list):
+                kwarg_parsed_list = op_item_parse(kwargs_item, op_name + '_input', None)
+                op_parsed_list += kwarg_parsed_list
+                kwarg_parsed_list.clear()
+            elif kwargs_item:
+                for kwarg in kwargs_item:
+                    kwarg_parsed_list = op_item_parse(kwargs_item[kwarg], op_name + '_input.' 
+ kwarg, None) + op_parsed_list += kwarg_parsed_list + kwarg_parsed_list.clear() + if 'output' in op_data: + output_item = op_data['output'] + output_parsed_list = op_item_parse(output_item, op_name + '_output', None) + op_parsed_list += output_parsed_list + output_parsed_list.clear() + if 'backward' in op_name: + if 'grad_input' in op_data: + input_item = op_data['grad_input'] + input_parsed_list = op_item_parse(input_item, op_name + '_input', None) + op_parsed_list = input_parsed_list.copy() + input_parsed_list.clear() + if 'grad_output' in op_data: + output_item = op_data['grad_output'] + output_parsed_list = op_item_parse(output_item, op_name + '_output', None) + op_parsed_list += output_parsed_list + output_parsed_list.clear() + return op_parsed_list + + +def compare_process(file_handles, stack_mode, fuzzy_match, highlight_dict, summary_compare=False, md5_compare=False): + npu_json_handle, bench_json_handle, stack_json_handle = file_handles + npu_json_data = json.load(npu_json_handle) + bench_json_data = json.load(bench_json_handle) + stack_json_data = json.load(stack_json_handle) + + if fuzzy_match: + print_warn_log("This task uses fuzzy matching, which may affect the accuracy of the comparison.") + + npu_ops_queue = [] + bench_ops_queue = [] + result = [] + + ops_npu_iter = iter(npu_json_data['data']) + ops_bench_iter = iter(bench_json_data['data']) + read_err_npu = True + read_err_bench = True + last_npu_ops_len = 0 + last_bench_ops_len = 0 + + while True: + if not read_err_npu and not read_err_bench: + break + try: + last_npu_ops_len = len(npu_ops_queue) + op_name_npu = next(ops_npu_iter) + read_err_npu = True + + npu_op_data = npu_json_data['data'][op_name_npu] + npu_op_parsed_list = read_op(npu_op_data, op_name_npu) + if op_name_npu in stack_json_data: + npu_op_parsed_list.append({'full_op_name': op_name_npu, 'full_info': stack_json_data[op_name_npu]}) + else: + npu_op_parsed_list.append({'full_op_name': op_name_npu, 'full_info': None}) + + npu_merge_list = merge_tensor(npu_op_parsed_list, summary_compare, md5_compare) + if npu_merge_list: + npu_ops_queue.append(npu_merge_list) + except StopIteration: + read_err_npu = False + try: + last_bench_ops_len = len(bench_ops_queue) + op_name_bench = next(ops_bench_iter) + + bench_op_data = bench_json_data['data'][op_name_bench] + bench_op_parsed_list = read_op(bench_op_data, op_name_bench) + if op_name_bench in stack_json_data: + bench_op_parsed_list.append( + {'full_op_name': op_name_bench, 'full_info': stack_json_data[op_name_bench]}) + else: + bench_op_parsed_list.append({'full_op_name': op_name_bench, 'full_info': None}) + + bench_merge_list = merge_tensor(bench_op_parsed_list, summary_compare, md5_compare) + if bench_merge_list: + bench_ops_queue.append(bench_merge_list) + except StopIteration: + read_err_bench = False + + if len(npu_ops_queue) == 0 or len(bench_ops_queue) == 0 or ( + len(npu_ops_queue) == last_npu_ops_len and len(bench_ops_queue) == last_bench_ops_len): + continue + + n_match_point, b_match_point = match_op(npu_ops_queue, bench_ops_queue, fuzzy_match) + if n_match_point == -1 and b_match_point == -1: + continue + n_match_data = npu_ops_queue[n_match_point] + b_match_data = bench_ops_queue[b_match_point] + un_match_data = npu_ops_queue[0: n_match_point] + for npu_data in un_match_data: + get_un_match_accuracy(result, npu_data, md5_compare, summary_compare) + get_accuracy(result, n_match_data, b_match_data, summary_compare, md5_compare) + del npu_ops_queue[0: n_match_point + 1] + del bench_ops_queue[0: b_match_point 
+ 1] + if npu_ops_queue: + for npu_data in npu_ops_queue: + get_un_match_accuracy(result, npu_data, md5_compare, summary_compare) + + header = [] + if md5_compare: + header = CompareConst.MD5_COMPARE_RESULT_HEADER[:] + elif summary_compare: + header = CompareConst.SUMMARY_COMPARE_RESULT_HEADER[:] + else: + header = CompareConst.COMPARE_RESULT_HEADER[:] + + all_mode_bool = summary_compare == False and md5_compare == False + if stack_mode: + if all_mode_bool: + header.append(CompareConst.STACK) + header.append(CompareConst.DATA_NAME) + else: + header.append(CompareConst.STACK) + else: + if all_mode_bool: + for row in result: + del row[-2] + header.append(CompareConst.DATA_NAME) + else: + for row in result: + del row[-1] + + result_df = pd.DataFrame(result, columns=header) + return result_df + + +def get_un_match_accuracy(result, n_dict, md5_compare, summary_compare): + index_out = 0 + npu_stack_info = n_dict.get("stack_info", None) + bench_name, bench_type, bench_shape = CompareConst.NAN, CompareConst.NAN, CompareConst.NAN + err_msg = CompareConst.NO_BENCH + accuracy_check_res = CompareConst.NAN + for index, n_name in enumerate(n_dict["op_name"]): + if n_name.find("input") != -1: + n_struct = n_dict["input_struct"][index] + else: + n_struct = n_dict["output_struct"][index_out] + index_out += 1 + + result_item = [n_name, bench_name, n_struct[0], bench_type, n_struct[1], bench_shape] + if md5_compare: + result_item.extend([CompareConst.NAN] * 3) + if npu_stack_info and index == 0: + result_item.extend(npu_stack_info) + result.append(result_item) + continue + if summary_compare: + result_item.extend([CompareConst.NAN] * 8) + else: + result_item.extend([CompareConst.NAN] * 5) + summary_data = n_dict.get("summary")[index] + result_item.extend(summary_data) + summary_data = [CompareConst.NAN] * 4 + result_item.extend(summary_data) + result_item.append(accuracy_check_res) + result_item.append(err_msg) + if npu_stack_info and index == 0: + result_item.extend(npu_stack_info) + if not md5_compare and not summary_compare and result_item[1] == CompareConst.NAN: + if index == 0: + result_item.extend(["-1"]) + else: + result_item.extend([CompareConst.NONE, "-1"]) + result.append(result_item) diff --git a/debug/accuracy_tools/atat/pytorch/compare/distributed_compare.py b/debug/accuracy_tools/atat/pytorch/compare/distributed_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..09d40b214d5bc2ae67480d78c9255d9e0326567a --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/compare/distributed_compare.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +import os +import sys +import re +from ...core.utils import print_error_log, CompareException, check_compare_param, \ + check_configuration_param, task_dumppath_get, check_file_or_directory_path, check_regex_prefix_format_valid +from .acc_compare import compare_core +from ...core.file_check_util import create_directory + + +def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): + def check_and_return_dir_contents(dump_dir, prefix): + """ + check the given dump dir and validate files in dump dir by using the given prefix patterns to build a + pattern: ^{prefix}(?:0|[0-9][1-9]*)?$ + + Args: + dump_dir (str): dump dir + prefix (str): prefix for the patterns, prefix should be less than 20 characters and alphanumeric/-/_ only + + Returns: + content [list]: dir contents + Raises: + CompareException: invalid path + ValueError: prefix not match the patterns + + """ + check_regex_prefix_format_valid(prefix) + check_file_or_directory_path(dump_dir, True) + contents = os.listdir(dump_dir) + pattern = re.compile(rf'^{prefix}(?:0|[0-9][1-9]*)?$') + for name in contents: + if not pattern.match(name): + print_error_log( + f"dump_dir contains '{name}'. Expected '{prefix}'. This name is not in the format of dump " + f"output. Please check and delete irrelevant files in {dump_dir} and try again." + ) + raise CompareException(CompareException.INVALID_PATH_ERROR) + return contents + + def extract_json(dirname, stack_json=False): + json_path = '' + for fname in os.listdir(dirname): + full_path = os.path.join(dirname, fname) + if full_path.endswith('.json'): + json_path = full_path + if not stack_json and 'stack' not in json_path: + break + if stack_json and 'stack' in json_path: + break + + # Provide robustness on invalid directory inputs + if not json_path: + print_error_log(f'No file is found in dump dir {dirname}. ') + raise CompareException(CompareException.NO_DUMP_FILE_ERROR) + return json_path + + if kwargs.get('suffix'): + print_error_log("Argument 'suffix' is not supported for compare_distributed.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + stack_mode = kwargs.get('stack_mode', False) + auto_analyze = kwargs.get('auto_analyze', True) + fuzzy_match = kwargs.get('fuzzy_match', False) + # get the ranks and match by order + npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank')) + bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank')) + if len(npu_ranks) != len(bench_ranks): + print_error_log('The number of ranks in the two runs are different. ' + 'Unable to match the ranks. 
Please use another folder to compare ' + 'or use compare() api and manually match the ranks.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + for nr, br in zip(npu_ranks, bench_ranks): + n_dir = os.path.join(npu_dump_dir, nr) + b_dir = os.path.join(bench_dump_dir, br) + s_dir = b_dir + npu_json_path = extract_json(n_dir, stack_json=False) + bench_json_path = extract_json(b_dir, stack_json=False) + stack_json_path = extract_json(s_dir, stack_json=True) + + dump_result_param = { + 'npu_json_path': npu_json_path, + 'bench_json_path': bench_json_path, + 'stack_json_path': stack_json_path, + 'is_print_compare_log': True + } + try: + summary_compare, md5_compare = task_dumppath_get(dump_result_param) + check_configuration_param(stack_mode, auto_analyze, fuzzy_match) + create_directory(output_path) + check_compare_param(dump_result_param, output_path, stack_mode=stack_mode, summary_compare=summary_compare) + except CompareException as error: + print_error_log('Compare failed. Please check the arguments and do it again!') + sys.exit(error.code) + compare_core(dump_result_param, output_path, suffix=f'_{nr}-{br}', summary_compare=summary_compare, + md5_compare=md5_compare, **kwargs) diff --git a/debug/accuracy_tools/atat/pytorch/compare/mapping.yaml b/debug/accuracy_tools/atat/pytorch/compare/mapping.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eaffbe7a18be7bce56a8d4714b66f38d50f1e110 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/compare/mapping.yaml @@ -0,0 +1,607 @@ +__and__: __and__ +__iand__: __iand__ +__ilshift__: __ilshift__ +__ior__: __ior__ +__irshift__: __irshift__ +__ixor__: __ixor__ +__lshift__: __lshift__ +__or__: __or__ +__rshift__: __rshift__ +__xor__: __xor__ +_adaptive_avg_pool2d: adaptive_avg_pool2d +_adaptive_avg_pool3d: adaptive_avg_pool3d +_cdist_forward: cdist +_cudnn_rnn: rnn +_embedding_bag: embedding_bag +_fft_c2c: fft +_fft_c2r: rfft +_foreach_add_: _foreach_add_ +_foreach_addcdiv: _foreach_addcdiv +_foreach_copy_: _foreach_copy_ +_foreach_lerp_: _foreach_lerp_ +_foreach_maximum: _foreach_maximum +_foreach_mul: _foreach_mul +_foreach_neg_: _foreach_neg_ +_foreach_pow: _foreach_pow +_foreach_reciprocal_: _foreach_reciprocal_ +_foreach_sign: _foreach_sign +_foreach_sqrt: _foreach_sqrt +_foreach_sqrt_: _foreach_sqrt_ +_foreach_sub: _foreach_sub +_fused_adam: FusedAdam +_linalg_det: det +_linalg_eigh: eigh +_linalg_slogdet: slogdet +_linalg_svd: svd +_list_to_tensor: as_tensor +_log_softmax: log_softmax +_native_batch_norm_legit: batch_norm +_nested_tensor_from_tensor_list: _nested_tensor_from_tensor_list +_pdist_forward: pdist +_pin_memory: pin_memory +_reshape_alias: reshape +_resize_output_: resize_ +_softmax: softmax +_to_copy: to +abs: abs +abs_: abs_ +absolute: abs +absolute_: abs_ +acos: acos +acos_: acos_ +acosh: acosh +acosh_: acosh_ +adaptive_max_pool2d: adaptive_max_pool2d +adaptive_max_pool3d: adaptive_max_pool3d +add: add +add_: add_ +addbmm: addbmm +addbmm_: addbmm_ +addcdiv: addcdiv +addcdiv_: addcdiv_ +addcmul: addcmul +addcmul_: addcmul_ +addmm: addmm +addmm_: addmm_ +addmv: addmv +addmv_: addmv_ +addr: addr +affine_grid_generator: affine_grid +alias: alias +all: all +alpha_dropout: AlphaDropout +amax: amax +amin: amin +aminmax: aminmax +angle: angle +any: any +arange: arange +arccos: acos +arccos_: arccos_ +arccosh: arccosh +arccosh_: arccosh_ +arcsin: asin +arcsin_: arcsin_ +arcsinh: asinh +arcsinh_: arcsinh_ +arctan: atan +arctan2: atan2 +arctan2_: arctan2_ +arctan_: arctan_ +arctanh: arctanh +arctanh_: 
arctanh_ +argmax: argmax +argmin: argmin +argsort: argsort +as_strided: as_strided +asin: asin +asin_: asin_ +asinh: asinh +asinh_: asinh_ +atan: atan +atan2: atan2 +atan2_: atan2_ +atan_: atan_ +atanh: atanh +atanh_: atanh_ +avg_pool2d: avg_pool2d +avg_pool3d: avg_pool3d +baddbmm: baddbmm +baddbmm_: baddbmm_ +bernoulli: bernoulli +bernoulli_: bernoulli_ +binary_cross_entropy: BCELoss +binary_cross_entropy_with_logits: binary_cross_entropy_with_logits +bitwise_and: bitwise_and +bitwise_and_: bitwise_and_ +bitwise_left_shift: __lshift__ +bitwise_left_shift_: bitwise_left_shift_ +bitwise_not: bitwise_not +bitwise_not_: bitwise_not_ +bitwise_or: bitwise_or +bitwise_or_: bitwise_or_ +bitwise_right_shift: __rshift__ +bitwise_right_shift_: bitwise_right_shift_ +bitwise_xor: bitwise_xor +bitwise_xor_: bitwise_xor_ +bmm: bmm +broadcast_tensors: broadcast_tensors +bucketize: bucketize +cat: cat +cauchy: Cauchy +cauchy_: cauchy_ +ceil: ceil +ceil_: ceil_ +celu: celu +celu_: celu_ +cholesky: cholesky +cholesky_inverse: cholesky_inverse +cholesky_solve: cholesky_solve +clamp: clamp +clamp_: clamp_ +clamp_max: clamp_max +clamp_max_: clamp_max_ +clamp_min: clamp_min +clamp_min_: clamp_min_ +clip: clip +clip_: clip_ +clone: clone +col2im: col2im +complex: complex +conj_physical: conj +conj_physical_: conj_ +constant_pad_nd: pad +convolution: Conv2d +copy: copy_ +copy_: copy_ +copysign: copysign +copysign_: copysign_ +cos: cos +cos_: cos_ +cosh: cosh +cosh_: cosh_ +count_nonzero: count_nonzero +cudnn_batch_norm: BatchNorm2d +cummax: cummax +cummin: cummin +cumprod: cumprod +cumprod_: cumprod_ +cumsum: cumsum +cumsum_: cumsum_ +deg2rad: deg2rad +deg2rad_: deg2rad_ +detach: detach +diag: diag +diag_embed: diag_embed +diagonal: diagonal +diagonal_copy: diagonal +diagonal_scatter: diagonal +digamma: digamma +digamma_: digamma_ +dist: dist +div: div +div_: div_ +divide: div +divide_: divide_ +dot: dot +dropout: dropout +elu: ELU +elu_: elu_ +embedding: embedding +empty_like: empty_like +empty_strided: empty_strided +eq: eq +eq_: eq_ +erf: erf +erf_: erf_ +erfc: erfc +erfc_: erfc_ +erfinv: erfinv +erfinv_: erfinv_ +exp: exp +exp2: exp2 +exp2_: exp2_ +exp_: exp_ +expand: expand +expm1: expm1 +expm1_: expm1_ +exponential: Exponential +exponential_: exponential_ +eye: eye +fft_fft: fft +fft_fft2: fft2 +fft_fftn: fftn +fft_fftshift: fftshift +fft_hfft: hfft +fft_hfft2: hfft2 +fft_hfftn: hfftn +fft_ifft: ifft +fft_ifft2: ifft2 +fft_ifftn: ifftn +fft_ifftshift: ifftshift +fft_ihfft: ihfft +fft_ihfft2: ihfft2 +fft_ihfftn: ifftn +fft_irfft: irfft +fft_irfft2: irfft2 +fft_irfftn: irfftn +fft_rfft: rfft +fft_rfft2: rfft2 +fft_rfftn: rfftn +fill: fill_ +fill_: fill_ +fix: fix +fix_: fix_ +flip: flip +float_power_: float_power_ +floor: floor +floor_: floor_ +floor_divide: floor_divide +floor_divide_: floor_divide_ +fmax: fmax +fmin: fmin +fmod: fmod +fmod_: fmod_ +frac: frac +frac_: frac_ +full: full +full_like: full_like +gather: gather +gcd: gcd +gcd_: gcd_ +ge: ge +ge_: ge_ +gelu: GELU +gelu_: gelu_ +geometric: Geometric +geometric_: geometric_ +glu: glu +greater: gt +greater_: ge_ +greater_equal: ge +greater_equal_: ge_ +grid_sampler_2d: grid_sample +grid_sampler_3d: grid_sample +gru: GRU +gt: gt +gt_: gt_ +hardshrink: Hardshrink +hardsigmoid: hardsigmoid +hardsigmoid_: hardsigmoid_ +hardswish: hardswish +hardswish_: hardswish_ +hardtanh: hardtanh +hardtanh_: hardtanh_ +heaviside: heaviside +heaviside_: heaviside_ +hinge_embedding_loss: HingeEmbeddingLoss +huber_loss: huber_loss +hypot: hypot +hypot_: hypot_ +i0: i0 
+i0_: i0_ +igamma: igamma +igamma_: igamma_ +igammac: igammac +igammac_: igammac_ +index: __getitem__ +index_add: index_add +index_add_: index_add_ +index_copy: index_copy_ +index_copy_: index_copy_ +index_fill: index_fill_ +index_fill_: index_fill_ +index_put: index_put_ +index_put_: index_put_ +index_reduce: index_select +index_select: index_select +is_pinned: is_pinned +is_same_size: is_same_size +isinf: isinf +isnan: isnan +isneginf: isneginf +isposinf: isposinf +istft: istft +item: item +lcm: lcm +lcm_: lcm_ +le: le +le_: le_ +leaky_relu: LeakyReLU +leaky_relu_: leaky_relu_ +lerp: lerp +lerp_: lerp_ +less: less +less_: less_ +less_equal: le +less_equal_: less_equal_ +lgamma: lgamma +lgamma_: lgamma_ +linalg_cholesky_ex: cholesky +linalg_cross: cross +linalg_householder_product: householder_product +linalg_inv_ex: inv +linalg_ldl_factor_ex: ldl +linalg_ldl_solve: ldl_solve +linalg_lu: lu +linalg_lu_factor_ex: lu_factor +linalg_lu_solve: lu_solve +linalg_matrix_exp: matrix_exp +linalg_qr: qr +linalg_solve_triangular: solve +linalg_vector_norm: norm +linspace: linspace +log: log +log10: log10 +log10_: log10_ +log1p: log1p +log1p_: log1p_ +log2: log2 +log2_: log2_ +log_: log_ +log_normal: LogNormal +log_sigmoid_forward: log_sigmoid +logaddexp: logaddexp +logaddexp2: logaddexp2 +_native_batch_norm_legit_functional: batch_norm +logcumsumexp: logcumsumexp +logical_and: logical_and +logical_and_: logical_and_ +logical_not: logical_not +logical_not_: logical_not_ +logical_or: logical_or +logical_or_: logical_or_ +logical_xor: logical_xor +logical_xor_: logical_xor_ +logit: logit +logit_: logit_ +logspace: logspace +logsumexp: logsumexp +lstm: LSTM +lt: lt +lt_: lt_ +lu_unpack: lu_unpack +margin_ranking_loss: margin_ranking_loss +masked_fill: masked_fill +masked_fill_: masked_fill_ +matmul: matmul +max: max +max_pool2d_with_indices: MaxPool2d +max_pool3d_with_indices: MaxPool3d +max_unpool2d: MaxUnpool2d +max_unpool3d: max_unpool3d +maximum: maximum +mean: mean +median: median +meshgrid: meshgrid +min: min +minimum: minimum +mish: Mish +mish_: mish_ +mm: mm +mode: mode +mse_loss: mse_loss +mul: mul +mul_: mul_ +multi_margin_loss: MultiMarginLoss +multilabel_margin_loss_forward: multilabel_margin_loss +multinomial: multinomial +multiply: multiply +multiply_: mul_ +mv: mv +mvlgamma: mvlgamma +mvlgamma_: mvlgamma_ +name: name +nan_to_num: nan_to_num +nan_to_num_: nan_to_num_ +nanmedian: nanmedian +nansum: nansum +narrow_copy: narrow +native_batch_norm: BatchNorm2d +native_dropout: dropout +native_group_norm: group_norm +native_layer_norm: LayerNorm +ne: ne +ne_: ne_ +neg: neg +neg_: neg_ +negative: neg +negative_: neg_ +new_empty: new_empty +new_empty_strided: new_empty_strided +new_full: new_full +new_ones: new_ones +new_zeros: new_zeros +nextafter: nextafter +nextafter_: nextafter_ +nll_loss: nll_loss +nll_loss2d_forward: NLLLoss2d +nll_loss_forward: NLLLoss +nonzero_static: nonzero +norm: norm +normal: normal +normal_: normal_ +not_equal: ne +not_equal_: ne_ +ones: ones +ones_like: ones_like +ormqr: ormqr +pairwise_distance: pairwise_distance +pdist: pdist +permute: permute +pin_memory: pin_memory +pixel_shuffle: PixelShuffle +polar: polar +polygamma: polygamma +positive: positive +pow: pow +pow_: pow_ +prelu: prelu +prod: prod +quantized_gru: GRU +quantized_lstm: LSTM +rad2deg: rad2deg +rad2deg_: rad2deg_ +rand: rand +rand_like: rand_like +randint: randint +randint_like: randint_like +randn: randn +randn_like: randn_like +randperm: randperm +reciprocal: reciprocal +reciprocal_: reciprocal_ 
+reflection_pad1d: reflection_pad1d +reflection_pad2d: reflection_pad2d +reflection_pad3d: ReflectionPad3d +relu: relu +relu6: relu6 +relu_: relu_ +remainder: remainder +remainder_: remainder_ +renorm: renorm +renorm_: renorm_ +repeat: repeat +repeat_interleave: repeat_interleave +replication_pad1d: ReplicationPad1d +replication_pad2d: replication_pad2d +replication_pad3d: replication_pad3d +resize_as_: resize_as_ +rnn_relu: RNN +rnn_tanh: RNN +roll: roll +rot90: rot90 +round: round +round_: round_ +rrelu_with_noise: RReLU +rrelu_with_noise_: rrelu_with_noise +rsqrt: rsqrt +rsqrt_: rsqrt_ +rsub: rsub +scalar_tensor: scalar_tensor +scatter: scatter_ +scatter_: scatter_ +scatter_add: scatter_add +scatter_add_: scatter_add_ +searchsorted: searchsorted +select: select +selu: selu +selu_: selu_ +sgn: sgn +sgn_: sgn_ +sigmoid: sigmoid +sigmoid_: sigmoid_ +sign: sign +sign_: sign_ +signbit: signbit +silu: silu +silu_: silu_ +sin: sin +sin_: sin_ +sinc: sinc +sinc_: sinc_ +sinh: sinh +sinh_: sinh_ +slice: slice +smooth_l1_loss: smooth_l1_loss +soft_margin_loss: soft_margin_loss +softplus: softplus +softshrink: softshrink +sort: sort +special_airy_ai: airy_ai +special_bessel_j0: j0 +special_bessel_j1: j1 +special_bessel_y0: y0 +special_bessel_y1: y1 +special_chebyshev_polynomial_t: chebyshev_t +special_chebyshev_polynomial_u: chebyshev_u +special_entr: entr +special_erfcx: erfcx +special_hermite_polynomial_h: hermite +special_hermite_polynomial_he: he +special_i0: i0 +special_i0e: i0e +special_i1: i1 +special_i1e: i1e +special_laguerre_polynomial_l: laguerre_l +special_log_ndtr: log_ndtr +special_modified_bessel_i0: i0 +special_modified_bessel_i1: i1 +special_modified_bessel_k0: k0 +special_modified_bessel_k1: i1 +special_ndtr: ndtr +special_ndtri: ndtri +special_scaled_modified_bessel_k0: i0e +special_scaled_modified_bessel_k1: scaled_modified_bessel_k1 +special_spherical_bessel_j0: spherical_jn +special_xlog1py: xlog1py +special_zeta: zeta +split: split +split_with_sizes: split +sqrt: sqrt +sqrt_: sqrt_ +square: square +square_: square_ +squeeze: squeeze +stack: stack +std: std +std_mean: std_mean +stft: stft +sub: sub +sub_: sub_ +subtract: sub +subtract_: subtract_ +sum: sum +t: t +t_: t_ +take: take +tan: tan +tan_: tan_ +tanh: tanh +tanh_: tanh_ +threshold: threshold +threshold_: threshold_ +to: to +topk: topk +trace: trace +transpose: transpose +transpose_: transpose_ +triangular_solve: triangular_solve +tril: tril +tril_: tril_ +tril_indices: tril_indices +triu: triu +triu_: triu_ +triu_indices: triu_indices +true_divide: true_divide +true_divide_: true_divide_ +trunc: trunc +trunc_: trunc_ +unbind: unbind +unfold: unfold +uniform: Uniform +uniform_: uniform_ +unsafe_chunk: unsafe_chunk +unsafe_split: split +unsafe_split_with_sizes: split_with_sizes +unsqueeze: unsqueeze +unsqueeze_: unsqueeze_ +upsample_bicubic2d: interpolate +upsample_bilinear2d: upsample_bilinear +upsample_nearest1d: interpolate +upsample_nearest2d: interpolate +upsample_nearest3d: interpolate +var: var +var_mean: var_mean +vdot: vdot +view: view +where: where +xlogy: xlogy +xlogy_: xlogy_ +zero: zeros +zero_: zero_ +zeros: zeros +zeros_like: zeros_like + + + diff --git a/debug/accuracy_tools/atat/pytorch/compare/match.py b/debug/accuracy_tools/atat/pytorch/compare/match.py new file mode 100644 index 0000000000000000000000000000000000000000..51fb2fb6666756d39db9003b87ef7c3a71b4080b --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/compare/match.py @@ -0,0 +1,36 @@ +import os +import yaml +from ...core.file_check_util 
import FileOpen
+from ...core.utils import CompareException, print_error_log
+
+
+class AtenIrMapping:
+    def __init__(self):
+        cur_path = os.path.dirname(os.path.realpath(__file__))
+        yaml_path = os.path.join(cur_path, "mapping.yaml")
+        with FileOpen(yaml_path, 'r') as f:
+            self.aten_mapping = yaml.safe_load(f)
+
+    def match(self, op1, op2):
+        if "Aten" in op1 and "Aten" not in op2:
+            return self.match_op(op1, op2)
+        else:
+            return self.match_op(op2, op1)
+
+    def match_op(self, aten_op, torch_op):
+        try:
+            aten_op_raw_name_overload = '_'.join(aten_op.split("_")[1:-3])
+            aten_op_raw_name = aten_op_raw_name_overload.split('.')[0]
+            torch_op_raw_name = '_'.join(torch_op.split("_")[1:-3]).lower()
+        except IndexError as e:
+            err_msg = f"Dump op name format error: {aten_op}, {torch_op}. Your dump data may be corrupted."
+            print_error_log(err_msg)
+            # CompareException.INVALID_DATA_ERROR is an error code, not a callable exception class
+            raise CompareException(CompareException.INVALID_DATA_ERROR) from e
+        matching_op = self.aten_mapping.get(aten_op_raw_name)
+        if matching_op is None:
+            return False
+        return matching_op.lower() == torch_op_raw_name
+
+
+graph_mapping = AtenIrMapping()
diff --git a/debug/accuracy_tools/atat/pytorch/debugger/__init__.py b/debug/accuracy_tools/atat/pytorch/debugger/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py b/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fc97332f64df89a056cda1b0bfdc730c2cdfa29
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py
@@ -0,0 +1,88 @@
+from ..common import print_warn_log_rank_0, seed_all
+from ...core.utils import Const
+
+class DebuggerConfig:
+    def __init__(self, common_config, task_config, task, dump_path, level):
+        self.dump_path = dump_path if dump_path else common_config.dump_path
+        self.task = task or common_config.task or Const.STATISTICS
+        self.rank = common_config.rank if common_config.rank else []
+        self.step = common_config.step if common_config.step else []
+        self.level = level or common_config.level or "L1"
+        self.seed = common_config.seed if common_config.seed else 1234
+        self.is_deterministic = common_config.is_deterministic
+        self.enable_dataloader = common_config.enable_dataloader
+        self.scope = task_config.scope if task_config.scope else []
+        self.list = task_config.list if task_config.list else []
+        self.data_mode = task_config.data_mode if task_config.data_mode else ["all"]
+        self.backward_input_list = task_config.backward_input if task_config.backward_input else []
+        self.backward_input = {}
+        self.acl_config = common_config.acl_config if common_config.acl_config else ""
+        self.is_forward_acl_dump = True
+        self.summary_mode = task_config.summary_mode if task_config.summary_mode else Const.STATISTICS
+        self.overflow_num = task_config.overflow_num if task_config.overflow_num else 1
+        self.repair_scope = None
+        self.repair_api_str = None
+        self.on_step_end = None
+        self.repair_type = None
+
+        if self.task == "free_benchmark":
+            self.fuzz_device = task_config.fuzz_device if task_config.fuzz_device else 'npu'
+            self.handler_type = task_config.handler_type if task_config.handler_type else 'check'
+            self.pert_mode = task_config.pert_mode if task_config.pert_mode else 'improve_precision'
+            self.fuzz_level = task_config.fuzz_level if task_config.fuzz_level else 'L1'
+            self.fuzz_stage = task_config.fuzz_stage if task_config.fuzz_stage else 'forward'
+            self.preheat_config = {
"if_preheat": task_config.if_preheat if task_config.if_preheat is not None else True, + "preheat_step": task_config.preheat_step if task_config.preheat_step else 15, + "max_sample": task_config.max_sample if task_config.max_sample else 20, + } + + self.check() + if self.step: + self.step.sort() + if self.level == "L2": + if not self.scope or not isinstance(self.scope, list) or len(self.scope) != 1: + raise ValueError("scope must be configured as a list with one api name") + if isinstance(self.scope[0], str) and Const.BACKWARD in self.scope[0] and not self.backward_input_list: + raise ValueError("backward_input must be configured when scope contains 'backward'") + if Const.BACKWARD in self.scope[0]: + self.is_forward_acl_dump = False + for index in range(len(self.scope)): + # Do this replace operation to let the acl backward dump can be done in forward hook. + self.scope[index] = self.scope[index].replace(Const.BACKWARD, Const.FORWARD) + self.backward_input[self.scope[index]] = self.backward_input_list[index] + seed_all(self.seed, self.is_deterministic) + + def check_kwargs(self): + if self.task and self.task not in Const.TASK_LIST: + raise Exception("task is invalid") + if self.level and self.level not in Const.LEVEL_LIST: + raise Exception("level is invalid") + if not self.dump_path: + raise Exception("Invalid dump path, please check your config") + + def check(self): + self.check_kwargs() + self._check_rank() + self._check_step() + return True + + def _check_rank(self): + if self.rank: + for rank_id in self.rank: + if not isinstance(rank_id, int) or rank_id < 0: + raise ValueError(f"rank {self.rank} must be an integer and greater than or equal to 0.") + else: + print_warn_log_rank_0(f"Rank argument is provided. Only rank {self.rank} data will be dumpped.") + + def _check_step(self): + if self.step: + for s in self.step: + if not isinstance(s, int) or s < 0: + raise ValueError(f"step element {s} must be an integer and greater than or equal to 0.") + + def check_model(self, model): + if self.level in ["L0", "mix"] and not model: + raise Exception( + f"For level {self.level}, PrecisionDebugger must receive a model argument.", + ) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/debugger/precision_debugger.py b/debug/accuracy_tools/atat/pytorch/debugger/precision_debugger.py new file mode 100644 index 0000000000000000000000000000000000000000..e0ffa4e4d6ebec80209097fe8d3a5716439bd939 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/debugger/precision_debugger.py @@ -0,0 +1,91 @@ +import torch +from torch.utils.data import dataloader +from .debugger_config import DebuggerConfig +from ..service import Service +from ..common import print_warn_log_rank_0 +from ..pt_config import parse_json_config +from ..common.exceptions import MsaccException + + +class PrecisionDebugger: + _instance = None + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super(PrecisionDebugger, cls).__new__(cls) + cls._instance.config = None + cls._instance.enable_dataloader = False + return cls._instance + + def __init__( + self, + config_path=None, + task=None, + dump_path=None, + level=None, + model=None, + step=None, + ): + if not hasattr(self, "initialized"): + self.initialized = True + self.model = self.check_model_valid(model) + common_config, task_config = parse_json_config(config_path, task) + if step: + common_config.step = step + self.config = DebuggerConfig( + common_config, task_config, task, dump_path, level + ) + 
+
+    def __init__(
+        self,
+        config_path=None,
+        task=None,
+        dump_path=None,
+        level=None,
+        model=None,
+        step=None,
+    ):
+        if not hasattr(self, "initialized"):
+            self.initialized = True
+            self.model = self.check_model_valid(model)
+            common_config, task_config = parse_json_config(config_path, task)
+            if step:
+                common_config.step = step
+            self.config = DebuggerConfig(
+                common_config, task_config, task, dump_path, level
+            )
+            self.config.check_model(self.model)
+            self.service = Service(self.config)
+            self.enable_dataloader = self.config.enable_dataloader
+            if self.enable_dataloader:
+                print_warn_log_rank_0("The enable_dataloader feature will be deprecated in the future.")
+                dataloader._BaseDataLoaderIter.__next__ = iter_tracer(dataloader._BaseDataLoaderIter.__next__)
+
+    @classmethod
+    def start(cls):
+        instance = cls._instance
+        if not instance:
+            raise Exception("No instance of PrecisionDebugger found.")
+        if instance.enable_dataloader:
+            print_warn_log_rank_0("DataLoader is enabled, start() skipped.")
+        else:
+            instance.service.start(instance.model)
+
+    @classmethod
+    def stop(cls):
+        instance = cls._instance
+        if not instance:
+            raise Exception("PrecisionDebugger instance is not created.")
+        if instance.enable_dataloader:
+            print_warn_log_rank_0("DataLoader is enabled, stop() skipped.")
+        else:
+            instance.service.stop()
+
+    @classmethod
+    def step(cls):
+        if not cls._instance:
+            raise Exception("PrecisionDebugger instance is not created.")
+        cls._instance.service.step()
+
+    @staticmethod
+    def check_model_valid(model):
+        if not model or isinstance(model, torch.nn.Module):
+            return model
+        raise MsaccException(
+            MsaccException.INVALID_PARAM_ERROR, "The model argument must be a torch.nn.Module."
+        )
+
+
+def iter_tracer(func):
+    # Wrap dataloader __next__ so that every iteration is bracketed by stop()/step() and start().
+    def func_wrapper(*args, **kwargs):
+        debugger_instance = PrecisionDebugger._instance
+        # temporarily clear the dataloader flag so the stop()/start() calls below are not skipped
+        debugger_instance.enable_dataloader = False
+        if not debugger_instance.service.first_start:
+            debugger_instance.stop()
+            debugger_instance.step()
+        result = func(*args, **kwargs)
+        debugger_instance.start()
+        debugger_instance.enable_dataloader = True
+        return result
+    return func_wrapper
diff --git a/debug/accuracy_tools/atat/pytorch/doc/FAQ.md b/debug/accuracy_tools/atat/pytorch/doc/FAQ.md
new file mode 100644
index 0000000000000000000000000000000000000000..daaa79abd956f7a585b6d76a45812c4e7b4fc6ae
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/doc/FAQ.md
@@ -0,0 +1,202 @@
+# 精度预检工具
+
+1. 预检工具在dump和run_ut的过程中,是否需要同时开启或关闭jit编译(jit_compile)?
+
+   答:是。
+
+2. 预检工具对于type_as这类涉及数据类型转换操作的API,是否具有参考性?
+
+   答:由于这类API在CPU侧存在精度先提升后下降的操作,因此这类API的有效性的参考价值有限。
+
+3. run_ut过程中出现报错:ERROR:Got unsupported ScalarType BFloat16
+
+   答:请使用最新版本的工具。
+
+4. Dropout算子,CPU和NPU的随机应该不一样,为什么结果比对是一致的?
+
+   答:这个结果是正常的,工具对该算子有特殊处理,只判定值为0的元素比例大约和设定的p值相当。
+
+5. 为什么浮点型数据bench和CPU的dtype不一致?
+
+   答:对于fp16的数据,CPU会提升一个精度用fp32去计算,这是和算子侧对齐的精度结论,CPU用更高精度去计算会更接近真实值。
+
+6. 添加预检工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。
+
+   答:注释工具目录api_accuracy_checker/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置API,也可以考虑根据报错堆栈信息注释引发报错的类型检查。
+
+7. 添加预检工具后F.gelu触发ValueError报错:`activation_func must be F.gelu`等。
+
+   答:注释工具目录api_accuracy_checker/hook_module/support_wrap_ops.yaml文件中functional:下的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置API,也可以考虑根据报错堆栈信息注释引发报错的类型检查。
+
+8. 添加预检工具后触发AsStrided算子相关的报错,或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。
+
+   答:注释工具目录api_accuracy_checker/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。
+
+9. Tensor 魔法函数具体对应什么操作?
+ + 答: + + | Tensor魔法函数 | 具体操作 | + | --------------- | ---------------- | + | `__add__` | + | + | `__and__` | & | + | `__bool__` | 返回Tensor布尔值 | + | `__div__` | / | + | `__eq__` | == | + | `__ge__` | >= | + | `__gt__` | > | + | `__iadd__` | += | + | `__iand__` | &= | + | `__idiv__` | /= | + | `__ifloordiv__` | //= | + | `__ilshift__` | <<= | + | `__imod__` | %= | + | `__imul__` | *= | + | `__ior__` | \|= | + | `__irshift__` | >>= | + | `__isub__` | -= | + | `__ixor__` | ^= | + | `__lshift__` | << | + | `__matmul__` | 矩阵乘法 | + | `__mod__` | % | + | `__mul__` | * | + | `__nonzero__` | 同`__bool__` | + | `__or__` | \| | + | `__radd__` | +(反向) | + | `__rmul__` | *(反向) | + | `__rshift__` | >> | + | `__sub__` | - | + | `__truediv__` | 同`__div__` | + | `__xor__` | ^ | + +# 精度比对工具 + +## 工具使用 + +### dump指定融合算子 + +dump指定操作当前支持dump指定融合算子的输入输出,需要在att/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml中添加,比如以下代码段调用的softmax融合算子 + +``` +def npu_forward_fused_softmax(self, input_, mask): + resl = torch_npu.npu_scaled_masked_softmax(input_, mask, self.scale, False) + return resl +``` + +如果需要dump其中调用的npu_scaled_masked_softmax算子的输入输出信息,需要在support_wrap_ops.yaml中的torch_npu: 中自行添加该融合算子即可: + +``` +- npu_scaled_masked_softmax +``` + +(npu_scaled_masked_softmax融合算子工具已支持dump,本例仅供参考) + +## 常见问题 + +### 1. 在同一个目录多次执行dump会冲突吗? + +会,同一个目录多次dump,会覆盖上一次结果,可以使用dump_tag参数修改dump目录名称。 + +### 2. 如何dump算子级的数据? + +需要配置level为L2模式。 + +### 3. 工具比对发现NPU和标杆数据的API无法完全对齐? + +torch版本和硬件差异属于正常情况。 + +## 异常情况 + +### 2. HCCL 报错: error code: EI0006 + +**故障现象** + +使用atat工具时,报错: error code: EI0006。 + +**故障原因** + +CANN软件版本较低导致不兼容。 + +**故障处理** + +升级新版CANN软件版本。 + +### 3. torch_npu._C._clear_overflow_npu() RuntimeError NPU error,error code is 107002 + +如果运行溢出检测功能遇到这个报错,采取以下解决方法: +如果是单卡运行,添加如下代码,0是卡号,选择自己空闲的卡号。 + +``` +torch.npu.set_device('npu:0') +``` + +如果多卡运行,请在代码中修改对应卡号,比如进程使用卡号为{rank}时可以添加如下代码: + +``` +torch.npu.set_device(f'npu:{rank}') +``` + +如果运行精度比对功能遇到这个报错,尝试安装最新版本的atat。 + +### 4. 运行compare.py时报错:json.decoder.JSONDecodeError: Extra data: line 1 column 37(char 36) + +遇到这种情况,先更新工具版本为最新版本,再重新运行训练代码dump数据,再用新的dump数据进行精度比对,如果最新版本未能解决问题,请联系atat工具开发人员。 + +### 5. AssertionError: assert set(WrapTensorOps) <= set(_tensor_ops) + +遇到这种情况,先检查安装的torch版本,建议先更新工具版本为2.2以上,版本2.2的工具支持torch1.8、1.11和2.0 + +### 6. dump得到的VF_lstm_99_forward_input.1.0.npy、VF_lstm_99_forward_input.1.1.npy类似的数据是否正常? + +带1.0/1.1/1.2后缀的npy是正常现象,例如当输入数据为[[tensor1, tensor2, tensor3]]会生成这样的后缀。 + +### 8. 进行compare报错:The current file contains stack information, please turn on the stack_mode + +在比对脚本中,设置stack_mode=True,例如: + +``` +from ptdbg_ascend import * +dump_result_param={ +"npu_pkl_path": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl", +"bench_pkl_path": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl", +"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump", +"bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump", +"is_print_compare_log": True +} +compare(dump_result_param, "./output", stack_mode=True) +``` + +### 9. dump指定反向API的kernel级别的数据报错:NameError:name 'torch_npu' is not defined + +- 如果是npu环境,请安装torch_npu; +- 如果是gpu环境,暂不支持dump指定API的ACL级别的数据 + +### 10. 配置dump_path后,使用工具报错:[ERROR]The file path /home/xxx/dump contains special characters + +- 请检查你设置的dump绝对路径是否包含特殊字符,确保路径名只包含大小写字母、数字、下划线、斜杠、点和短横线 +- 注意,如果你执行脚本的路径为/home/abc++/,你设置的dump_path="./dump",工具实际校验的路径为绝对路径/home/abc++/dump,++为特殊字符,会引发本条报错 + +### 11. 
无法dump matmul权重的反向梯度数据 + +- matmul期望的输入是二维,当输入不是二维时,会将输入通过view操作展成二维,再进行matmul运算,因此在反向求导时,backward_hook能拿到的是UnsafeViewBackward这步操作里面数据的梯度信息,取不到MmBackward这步操作里面数据的梯度信息,即权重的反向梯度数据。 +- 典型的例子有,当linear的输入不是二维,且无bias时,会调用output = input.matmul(weight.t()),因此拿不到linear层的weight的反向梯度数据。 + +### 12. dump.json文件中的某些api的dtype类型为float16,但是读取此api的npy文件显示的dtype类型为float32 + +- atat工具在dump数据时需要将原始数据从npu to cpu上再转换为numpy类型,npu to cpu的逻辑和gpu to cpu是保持一致的,都存在dtype可能从float16变为float32类型的情况,如果出现dtype不一致的问题,最终dump数据的dtype以pkl文件为准。 + +### 13. 使用dataloader后raise异常Exception: ptdbg: exit after iteration [x, x, x] + +- 正常现象,dataloader通过raise结束程序,堆栈信息可忽略。 + +### 14. 添加atat工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。 + +- 注释工具目录atat/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置api也可以考虑根据报错堆栈信息注释引发报错的类型检查。 + +### 15. 添加atat工具后F.gelu触发ValueError报错:`activation_func must be F.gelu`等。 + +- 注释工具目录atat/hook_module/support_wrap_ops.yaml文件中functional:下的的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置api也可以考虑根据报错堆栈信息注释引发报错的类型检查。 + +### 16. 添加atat工具后触发AsStrided算子相关的报错,或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。 + +- 注释工具目录atat/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。 diff --git a/debug/accuracy_tools/atat/pytorch/doc/api_accuracy_checker.md b/debug/accuracy_tools/atat/pytorch/doc/api_accuracy_checker.md new file mode 100644 index 0000000000000000000000000000000000000000..7004c25e6daddb0c17e5caf82f9fa0318fd5425c --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/api_accuracy_checker.md @@ -0,0 +1,278 @@ +# **精度预检工具** + +## 简介 + +精度预检工具通过扫描昇腾NPU上用户训练模型中所有API,输出精度情况的诊断和分析。工具通过dump模型中所有的API前反向信息;构造相应的API单元测试,将NPU输出与标杆(CPU高精度)比对,从而计算对应的精度指标,该过程称为run_ut;将NPU环境下dump的预检数据拷贝至GPU环境,同样执行run_ut;最后通过**新精度标准比对法**将NPU和GPU的预检结果进行比对,从而找出NPU中存在精度问题的API。 + +**新精度标准比对法**:依据新精度标准,对不同的API采取不同的比对算法进行比对(包括绝对阈值法,标杆比对法、二进制一致法、ULP误差比对法和双千指标法),最终给定预检判定结果。 + +**真实数据模式**:精度预检工具支持随机生成模式和真实数据模式,即在预检dump时可以选择由工具构造随机数进行输入获得dump数据或选择获取真实输入数据进行预检dump操作;随机生成模式执行效率高,可以快速获得结果,但数据精度低,只能大致判断精度问题;真实数据模式执行效率略低于随机生成模式,但是数据精度高,可以准确判断精度问题。 + +**工具支持PyTorch版本**:1.11.0/2.0/2.1/2.2。 + +**工具特性** + +- 落盘数据小。 +- 支持随机生成模式和真实数据模式。 +- 单API测试,排除整网中的累计误差问题。 + +## 预检流程 + +精度预检操作流程如下: + +1. 在NPU和GPU环境下分别安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 +2. 在NPU训练脚本内添加atat工具dump接口PrecisionDebugger采集待预检数据。详见《[精度数据采集](./dump.md)》。 +3. 将NPU环境下dump的预检数据拷贝至GPU环境。 +4. 在NPU和GPU环境下分别执行run_ut,生成结果用于最终api_precision_compare操作的输入。详见“**run_ut预检操作**”。 +5. 将NPU和GPU执行run_ut生成的`accuracy_checking_details_{timestamp}.csv`结果文件拷贝至同一环境下。 +6. 运行api_precision_compare.py,输出结果为预检操作的最终结果。详见“**预检结果比对**”。 + +## 预检操作 + +### run_ut预检操作 + +完成待预检数据采集后,仅仅获取了API的输入数据,为了得到NPU vs CPU高精度(标杆)的预检比对结果和GPU vs CPU高精度(标杆)的预检比对结果,还需要进行run_ut操作。 + +run_ut预检操作包括如下场景: + +- 使用run_ut.py执行预检:run_ut.py适用于数据量较小的单卡场景。 +- 使用multi_run_ut.py执行多线程预检:multi_run_ut.py适用于数据量较大的大模型场景。 + +#### 使用run_ut.py执行预检 + +1. 
将API信息输入给run_ut模块运行精度检测并比对,运行如下命令: + + ```bash + atat -f pytorch run_ut -api_info ./dump.json + ``` + + 某些场景下(如推理),可以不指定backward_info_0.json,不影响预检功能。 + + | 参数名称 | 说明 | 是否必选 | + | ---------------------------- | ------------------------------------------------------------ | ---------------------------------- | + | -api_info或--api_info_file | 指定API信息文件dump.json。 | 是 | + | -save_error_data | 保存精度未达标的API输入输出数据。 | 否 | + | -o或--out_path | 指定run_ut执行结果存盘路径,默认“./”(相对于run_ut的路径)。 | 否 | + | -j或--jit_compile | 开启jit编译。 | 否 | + | -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0。 | 否 | + | -csv_path或--result_csv_path | 指定本次运行中断时生成的`accuracy_checking_result_{timestamp}.csv`文件路径,执行run_ut中断时,若想从中断处继续执行,配置此参数即可。需要指定为上次中断的`accuracy_checking_result_{timestamp}.csv`文件。详见“**断点续检**”。 | run_ut操作中断后继续执行场景下必选 | + | -f或--filter_api | 过滤模型中除最大值和最小值以外其他参数和结构相同的API。适用于模型较大且重复API较多的场景。 | 否 | + + run_ut执行结果包括`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`两个文件。`accuracy_checking_result_{timestamp}.csv`是API粒度的,标明每个API是否通过测试。建议用户先查看`accuracy_checking_result_{timestamp}.csv`文件,对于其中没有通过测试的或者特定感兴趣的API,根据其API name字段在`accuracy_checking_details_{timestamp}.csv`中查询其各个输出的达标情况以及比较指标。详细介绍请参见“**预检结果**”。 + +2. (可选)如果需要保存比对不达标的输入和输出数据,可以在run_ut执行命令结尾添加-save_error_data,例如: + + ```bash + atat -f pytorch run_ut -api_info ./dump.json -save_error_data + ``` + + 数据默认会存盘到'./ut_error_data{timestamp}'路径下(相对于启动run_ut的路径),有需要的话,用户可以通过修改att/debug/accuracy_tools/api_accuracy_checker目录下,config.yaml文件的error_data_path参数来配置保存路径,详见“config.yaml文件说明”。 + +#### 使用multi_run_ut.py执行多线程预检 + +multi_run_ut.py脚本,可以并行执行多个run_ut操作,从而降低预检耗时。 + +命令示例如下: + +```bash +atat -f pytorch multi_run_ut -api_info ./dump.json -n 32 -d 0 1 2 3 +``` + +某些场景下(如推理),可以不指定backward_info_0.json,不影响预检功能。 + +| 参数名称 | 说明 | 是否必选 | +| ---------------------------- | ------------------------------------------------------------ | ---------------------------------- | +| -api_info或--api_info_file | 指定API信息文件dump.json。 | 是 | +| -save_error_data | 保存精度未达标的API输入输出数据。 | 否 | +| -o或--out_path | 指定run_ut执行结果存盘路径,默认“./”(相对于run_ut的路径)。 | 否 | +| -j或--jit_compile | 开启jit编译。 | 否 | +| -n | 同时执行run_ut线程的数量,默认为8,最大支持64,但每个Device最大支持8个线程,当指定多个线程和多个Device时,则线程数在每张卡上均分。 | 否 | +| -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0,支持同时指定0~7,共8个Device。 | 否 | +| -csv_path或--result_csv_path | 指定本次运行中断时生成的`accuracy_checking_result_{timestamp}.csv`文件路径,执行run_ut中断时,若想从中断处继续执行,配置此参数即可。需要指定为上次中断的`accuracy_checking_result_{timestamp}.csv`文件。详见“**断点续检**”。 | run_ut操作中断后继续执行场景下必选 | +| -f或--filter_api | 过滤模型中除最大值和最小值以外其他参数和结构相同的API。适用于模型较大且重复API较多的场景。 | 否 | + +#### 断点续检 + +精度预检run_ut过程中,若因环境、数据量过大等原因导致预检进程中断,那么当用户解决这些问题后,重新执行run_ut操作,可以通过断点续检操作继续前面未完成的预检,会在-csv_path指定的`accuracy_checking_result_{timestamp}.csv`文件以及对应的`accuracy_checking_details_{timestamp}.csv`文件中继续写入后续的结果,不会重新创建结果文件。 + +须指定为上次预检中断的`accuracy_checking_result_{timestamp}.csv`文件。请勿修改`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`文件名,包括时间戳,否则断点续检会因无法识别到文件名而失败。 + +断点续检操作通过如下命令执行: + +```bash +atat -f pytorch run_ut -api_info ./dump.json -csv_path /home/xxx/ut/accuracy_checking_result_{timestamp}.csv +``` + +#### API预检白名单 + +run_ut过程支持API预检白名单,操作方式如下: + +修改att/debug/accuracy_tools/api_accuracy_checker目录下config.yaml文件的white_list参数,配置需要预检的API名称,详见“config.yaml文件说明”。 + +### config.yaml文件说明 + +config.yaml文件可以通过配置参数来控制dump和run_ut操作的真实数据模式以及白名单等功能。 + +文件路径为:att/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml + +| 参数名称 | 说明 | 是否必选 | +| ----------------- | 
------------------------------------------------------------ | -------- | +| dump_path | 设置dump路径,默认为当前目录。若指定目录不存在,则自动创建。 | 否 | +| real_data | 真实数据模式,可取值True或False,默认为False,表示随机数据模式,配置为True后开启真实数据模式,dump信息增加forward_real_data和backward_real_data目录,目录下保存每个API输入的具体数值。 | 否 | +| enable_dataloader | 自动dump数据开关,可取值True(开启)、False(关闭),默认关闭。 | 否 | +| target_iter | 指定dump某个step的数据,默认为[1],须指定为训练脚本中存在的step。target_iter为list格式,可配置逐个step,例如:target_iter=[0,1,2];也可以配置step范围,例如:target_iter=list(range(0,9)),表示dump第0到第8个step。 | 否 | +| white_list | API dump白名单,指定dump具体API数据,也可以直接配置预检的API白名单,详细请参见“**API预检白名单**”。参数示例:white_list=["conv1d", "conv2d"]。默认未配置白名单,即dump全量API数据。 | 否 | +| error_data_path | 配置保存精度未达标的API输入输出数据路径。 | 否 | +| jit_compile | 开启jit编译。 | 否 | +| precision | 浮点数表示位数,默认取小数点后14位。 | 否 | + +## 预检结果 + +精度预检生成的`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`文件示例如下: + +可以通过先查看`accuracy_checking_result_{timestamp}.csv`文件的Forward Test Success和Backward Test Success,判断是否存在未通过测试的API,再查看`accuracy_checking_details_{timestamp}.csv`文件的API详细达标情况,API达标情况介绍请参见“**API预检指标**”。 + +`accuracy_checking_result_{timestamp}.csv` + +![accuracy_checking_result](img/accuracy_checking_result.png) + +| 字段 | 含义 | +| --------------------- | ------------------------------------------------------------ | +| API name | API名称。 | +| Forward Test Success | 前向API是否通过测试,pass为通过,warning为待观察,error为错误。 | +| Backward Test Success | 反向API是否通过测试,pass为通过,warning为待观察,error为错误,如果是空白的话代表该API没有反向输出。 | +| Message | 提示信息。 | + +该结果为中间结果,仅作为参考,建议完成“**预检结果比对**”后查看比对结果。该结果后续将会删除。 + +Forward Test Success和Backward Test Success是否通过测试是由`accuracy_checking_details_{timestamp}.csv`中的余弦相似度、最大绝对误差、双百双千双万指标判定结果决定的。 + +需要注意的是`accuracy_checking_details_{timestamp}.csv`中可能存在一个API的前向(反向)有多个输出,那么每个输出记录一行,而在`accuracy_checking_result_{timestamp}.csv`中的结果需要该API的所有结果均为pass才能标记为pass,只要存在一个error则标记error,仅存在waring和pass且不存在error标记waring。 + +`accuracy_checking_details_{timestamp}.csv` + +![accuracy_checking_details](img/accuracy_checking_details.png) + +| 字段 | 含义 | +| ------------------- | ------------------------------------------------------------ | +| API name | NPU或GPU下的API名称。 | +| Bench Dtype | 标杆数据的API数据类型。 | +| DEVICE Dtype | NPU或GPU数据的API数据类型。 | +| Shape | API的Shape信息。 | +| 余弦相似度 | NPU或GPU数据与标杆数据的余弦相似度。 | +| 最大绝对误差 | NPU或GPU数据与标杆数据的最大绝对误差。 | +| 双百指标 | 双百精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于百分之一的个数占总元素个数的比例。测试通过标准为相对误差大于百分之一的个数占总元素个数的比例小于百分之一。 | +| 双千指标 | 双千精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于千分之一的个数占总元素个数的比例。测试通过标准为相对误差大于千分之一的个数占总元素个数的比例小于千分之一。 | +| 双万指标 | 双万精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于万分之一的个数占总元素个数的比例。测试通过标准为相对误差大于万分之一的个数占总元素个数的比例小于万分之一。 | +| 二进制一致错误率 | NPU或GPU数据中每个Tensor精度不一致的数值的数量与Tensor中数值数量的比值。只有数据是builtin类型(bool、int、float、str)、torch.bool和torch的int类型或者在新精度标准中使用二进制一致算法进行比对的API才会展示。 | +| 误差均衡性 | NPU或GPU数据与标杆数据精度差的上下浮动情况。 | +| 均方根误差 | NPU或GPU数据与标杆数据的均方根误差。 | +| 小值域错误占比 | NPU或GPU Tensor中与标杆的绝对误差大于错误阈值的小值在小值域(小值的总数量)中的占比。判断为小值以及绝对误差的错误阈值见“**小值域阈值**”。 | +| 相对误差最大值 | NPU或GPU数据与标杆数据相对误差的最大值。 | +| 相对误差平均值 | NPU或GPU数据与标杆数据相对误差的平均值。 | +| inf/nan错误率 | NPU与标杆inf/nan计算不一致的元素个数占总元素的个数比例。 | +| 相对误差错误率 | NPU与标杆的正常值计算相对误差,其大于错误阈值的元素个数占正常值元素个数的比例。 | +| 绝对误差错误率 | NPU与标杆的小值计算绝对误差,其大于错误阈值的元素个数占小值元素个数的比例。 | +| ULP误差最大值 | NPU或GPU数据与标杆数据ULP误差的最大值(取绝对值后)。 | +| ULP误差平均值 | NPU或GPU数据与标杆数据ULP误差的平均值(取绝对值后)。 | +| ULP误差大于阈值占比 | NPU或GPU数据与标杆数据的ULP误差(取绝对值后)大于阈值(当NPU或GPU数据类型为float16或bfloat16时,阈值为1;当NPU或GPU数据类型为float32时,阈值为32)的元素个数占总元素的个数比例。 | +| Status | 
API预检通过状态,pass表示通过测试,error表示未通过,warning表示测试未通过双千或双万精度指标,SKIP表示该API的某个参数的反向不需要计算梯度,所以没有任何计算过程,其他信息均为空。 |
+| message | 提示信息。 |
+
+### 小值域阈值
+
+判定为小值的阈值为:
+
+- torch.float32:e-6
+- torch.float16:e-3
+- torch.bfloat16:e-3
+
+小值域的绝对误差阈值为:
+
+- torch.float32:e-9
+- torch.float16:e-5
+- torch.bfloat16:e-5
+
+### API预检指标
+
+API预检指标是通过对`accuracy_checking_details_{timestamp}.csv`中的余弦相似度、最大绝对误差、双百、双千、双万精度指标的数值进行判断,得出该API是否符合精度标准的参考指标。
+
+若API预检通过测试,则在`accuracy_checking_details_{timestamp}.csv`文件中的“Status”列标记“pass”,否则标记“error”或“warning”,详细规则如下:
+
+1. 余弦相似度 > 0.99:≤ 0.99为不达标,标记“error”,> 0.99达标,进行下一步;
+2. 最大绝对误差 < 0.001:< 0.001达标,标记“pass”,≥ 0.001为不达标,进行下一步;
+3. 双百、双千、双万精度指标:
+   - 对于float16和bfloat16数据:双百指标不通过,标记“error”;双百指标通过,双千指标不通过,标记“warning”;双百、双千指标均通过,标记“pass”。
+   - 对于float32和float64数据:双千指标不通过,标记“error”;双千指标通过,双万指标不通过,标记“warning”;双千、双万指标均通过,标记“pass”。
+
+4. 在`accuracy_checking_result_{timestamp}.csv`中以“Forward Test Success”和“Backward Test Success”字段统计该算子前向反向输出的测试结果,对于标记“pass”的算子,则在`accuracy_checking_result_{timestamp}.csv`中标记“TRUE”表示测试通过,对于标记“error”或“warning”的算子,则在`accuracy_checking_result_{timestamp}.csv`中标记“FALSE”表示测试不通过。由于一个算子可能有多个前向或反向的输入或输出,该类算子的输入或输出必须全为“pass”,才能在`accuracy_checking_result_{timestamp}.csv`中标记“TRUE”,只要有一个输入或输出标记“error”或“warning”,则在`accuracy_checking_result_{timestamp}.csv`中标记“FALSE”。
+
+## 预检结果比对
+
+需要同时获取NPU和GPU环境下run_ut操作的预检结果`accuracy_checking_details_{timestamp}.csv`文件。执行如下命令进行NPU和GPU预检结果的比对:
+
+```bash
+atat -f pytorch api_precision_compare -npu /home/xxx/npu/accuracy_checking_details_{timestamp}.csv -gpu /home/xxx/gpu/accuracy_checking_details_{timestamp}.csv -o /home/xxx/
+```
+
+| 参数名称 | 说明 | 是否必选 |
+| -------------------- | ------------------------------------------------------------ | -------- |
+| -npu或--npu_csv_path | NPU预检结果`accuracy_checking_details_{timestamp}.csv`文件路径。默认从当前目录下识别该文件。 | 否 |
+| -gpu或--gpu_csv_path | GPU预检结果`accuracy_checking_details_{timestamp}.csv`文件路径。默认从当前目录下识别该文件。 | 否 |
+| -o或--out_path | 指定api_precision_compare.py执行结果存盘路径,默认为当前目录。 | 否 |
+
+执行完成后输出`api_precision_compare_result_{timestamp}.csv`和`api_precision_compare_details_{timestamp}.csv`文件。文件示例如下:
+
+可以通过先查看`api_precision_compare_result_{timestamp}.csv`文件的Forward Test Success和Backward Test Success,判断是否存在未通过测试的API,再查看`api_precision_compare_details_{timestamp}.csv`文件的API详细达标情况。
+
+`api_precision_compare_result_{timestamp}.csv`
+
+![api_precision_compare_result](img/api_precision_compare_result.png)
+
+| 字段 | 含义 |
+| --------------------- | ------------------------------------------------------------ |
+| API name | API名称。 |
+| Forward Test Success | 前向API是否通过测试,pass为通过,warning为待观察,error为错误,skip表示该API的数据类型不支持使用新精度标准进行比对,如float64。 |
+| Backward Test Success | 反向API是否通过测试,pass为通过,warning为待观察,error为错误,如果是空白的话代表该API没有反向输出,skip表示该API的数据类型不支持使用新精度标准进行比对,如float64。 |
+| Message | 提示信息。 |
+
+Forward Test Success和Backward Test Success是否通过测试是由`api_precision_compare_details_{timestamp}.csv`中的各个指标判定结果决定的。需要注意的是`api_precision_compare_details_{timestamp}.csv`中可能存在一个API的前向(反向)有多个输出,那么每个输出记录一行,而在`api_precision_compare_result_{timestamp}.csv`中的结果需要该API的所有结果均为pass才能标记为pass,只要存在一个error则标记error,仅存在warning和pass且不存在error标记warning。
+
+`api_precision_compare_details_{timestamp}.csv`
+
+![api_precision_compare_details](img/api_precision_compare_details.png)
+
+| 字段 | 含义 |
+| ------------------------ | ------------------------------------------------------------ |
+| API name | NPU或GPU下的API名称。 |
+| 小值域错误比值 | NPU与CPU的小值域的错误比率/GPU与CPU的小值域的错误比率。标杆比对法指标。 |
+| 小值域错误判定结果 | 小值域错误比值小于等于1标记为pass,1~2之间标记为warning,大于2标记为error。 |
+| 均方根误差比值 | NPU与CPU的均方根误差/GPU与CPU的均方根误差。标杆比对法指标。 |
+| 均方根误差判定结果 | 均方根误差比值小于等于1标记为pass,1~2之间标记为warning,大于2标记为error。 |
+| 相对误差最大值比值 | NPU与CPU的相对误差最大值/GPU与CPU的相对误差最大值。标杆比对法指标。 |
+| 相对误差最大值判定结果 | 相对误差最大值比值小于等于1标记为pass,1~10之间标记为warning,大于10标记为error。 |
+| 相对误差平均值比值 | NPU与CPU的相对误差的平均值/GPU与CPU的相对误差的平均值。标杆比对法指标。 |
+| 相对误差平均值判定结果 | 相对误差平均值比值小于等于1标记为pass,1~2之间标记为warning,大于2标记为error。 |
+| 误差均衡性比值 | NPU与CPU的误差均衡性/GPU与CPU的误差均衡性。标杆比对法指标。 |
+| 误差均衡性判定结果 | 误差均衡性比值小于等于1标记为pass,1~2之间标记为warning,大于2标记为error。该字段暂不参与api_precision_compare_result的结果判定。 |
+| inf/nan错误率 | NPU与标杆inf/nan计算不一致的元素个数占总元素的个数比例。绝对阈值法指标。 |
+| inf/nan判定结果 | inf/nan错误率判定结果,等于0标记为pass,其余情况标记为error。 |
+| 相对误差错误率 | NPU与标杆的正常值计算相对误差,其大于错误阈值的元素个数占正常值元素个数的比例。绝对阈值法指标。 |
+| 相对误差判定结果 | 相对误差错误率判定结果,等于0标记为pass,其余情况标记为error。 |
+| 绝对误差错误率 | NPU与标杆的小值计算绝对误差,其大于错误阈值的元素个数占小值元素个数的比例。绝对阈值法指标。 |
+| 绝对误差判定结果 | 绝对误差错误率判定结果,等于0标记为pass,其余情况标记为error。 |
+| 二进制一致错误率 | NPU或GPU数据中每个Tensor精度不一致的数值的数量与Tensor中数值数量的比值。只有数据是builtin类型(bool、int、float、str)、torch.bool和torch的int类型或者在新精度标准中使用二进制一致算法进行比对的API才会展示。二进制一致法指标。 |
+| 二进制一致错误率判定结果 | 二进制一致错误率判定结果,等于0标记为pass,其余情况标记为error。 |
+| ULP误差平均值 | NPU数据与标杆数据ULP误差的平均值(取绝对值后)。ULP误差比对法指标。 |
+| ULP误差大于阈值占比 | NPU数据与标杆数据的ULP误差(取绝对值后)大于阈值(当NPU数据类型为float16或bfloat16时,阈值为1;当NPU数据类型为float32时,阈值为32)的元素个数占总元素的个数比例。ULP误差比对法指标。 |
+| ULP误差大于阈值占比比值 | NPU与CPU的ULP误差大于阈值占比/GPU与CPU的ULP误差大于阈值占比。ULP误差比对法指标。 |
+| ULP误差判定结果 | ULP误差判定结果。<br>当NPU或GPU数据类型是float16或bfloat16时,以下两条标准满足其一标记为pass,否则标记为error:<br>NPU ULP误差大于阈值占比小于0.001;<br>NPU ULP误差大于阈值占比小于GPU ULP误差大于阈值占比。<br>当NPU或GPU数据类型是float32时,以下三条标准满足其一标记为pass,否则标记为error:<br>NPU ULP误差平均值小于64;<br>NPU ULP误差大于阈值占比小于0.05;<br>
NPU ULP误差大于阈值占比小于GPU ULP误差大于阈值占比。 | +| 双千指标 | 双千精度指标。是指NPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于千分之一的个数占总元素个数的比例。测试通过标准为相对误差大于千分之一的个数占总元素个数的比例小于千分之一。仅conv1d和conv2d使用该指标。双千指标法指标。 | +| 双千指标判定结果 | 双千指标判定结果。双千指标大于0.999标记为pass,否则标记为error。 | +| 比对结果 | 综合所有指标的最终结果。如果比对指标中有error,则标记为error;有warning,则标记为warning;否则标记为pass。 | +| 比对算法 | API使用的比对算法,为标杆比对法、二进制一致法、绝对阈值法和ULP误差比对法中的一种。 | +| Message | 提示信息。当前提示该API比对结果为error或warning时对应不符合标准的指标。 | + +# FAQ + +[FAQ](./FAQ.md) diff --git "a/debug/accuracy_tools/atat/pytorch/doc/atat\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" "b/debug/accuracy_tools/atat/pytorch/doc/atat\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" new file mode 100644 index 0000000000000000000000000000000000000000..ed175ff30172a54d8d4868097599ab8518b45e4f --- /dev/null +++ "b/debug/accuracy_tools/atat/pytorch/doc/atat\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" @@ -0,0 +1,182 @@ +# atat精度工具标准性能基线报告 + +## 环境信息 + +NPU:Atlas A2 训练系列产品 + +CPU: + +![输入图片说明](img/cpu_info.png) + +Torch:2.1.0 + +CANN:8.0.T2 + +除上述环境信息影响性能外,API的数量、种类以及Shape都会对性能产生影响,因此本次选取指定网络进行测试,为了避免算子编译耗时的影响,所有模型运行时都开启二进制,模型中添加torch.npu.set_compile_mode(jit_compile=False),所有模型都dump第二个step的数据。 + +## 模型信息和性能基线 + +大模型在使用atat工具dump数据时,建议先简化模型层数,减少dump数据量。 + +以下场景的性能基线测试数据均为多次测试后取平均值,因此实际运行时性能数据可能会根据环境状态稍有浮动。 + +### 工具配置信息 + +dump全部API级别输入输出数据以及相应堆栈信息,配置如下: + +```python +debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump") +debugger.configure_hook(mode="api_stack") +``` + +多卡指定rank0 dump,配置如下: + +```python +debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump",rank=0) +debugger.configure_hook(mode="api_stack") +``` + +dump保存API统计信息的pkl文件,配置如下: + +```python +debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump") +debugger.configure_hook(mode="api_stack", summary_only=True) +``` + +### YOLOV5s + +单卡 + +主要数据类型:FLOAT32 + +启动命令参数:python3 train_ptdbg.py --data ./data/coco.yaml --cfg yolov5s.yaml --weights '' --epochs 1 --batch-size 8 --device 1 + +dump保存API统计信息的pkl文件耗时:**7s** + +进行单卡dump全部API级别输入输出数据以及相应堆栈信息耗时:**11s** + +- dump存盘的API numpy文件大小:13G + + ![输入图片说明](img/YOLOV5S_1.png) + +- api numpy文件数量:3009个 + + ![输入图片说明](img/YOLOV5S_2.png) + + + + +### GPT-3 + +#### NUM_LAYER:1 + +8卡 + +主要数据类型:FLOAT16 + +启动命令参数: + +``` +python3 -m torch.distributed.launch $DISTRIBUTED_ARGS ../../pretrain_gpt_ptdbg.py --num-layers 1 --hidden-size 12288 --num-attention-heads 24 --micro-batch-size 2 --global-batch-size 2 --seq-length 1024 --max-position-embeddings 1024 --train-iters 10 --lr-decay-iters 320000 --save $CHECKPOINT_PATH --load $CHECKPOINT_PATH --data-path $DATA_PATH --tensor-model-parallel-size 8 --use-distributed-optimizer --pipeline-model-parallel-size 8 --vocab-file gpt2-vocab.json --merge-file gpt2-merges.txt --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.375e-5 --lr-decay-style cosine --min-lr 0.375e-6 --weight-decay 0.1 --clip-grad 1.0 --lr-warmup-fraction .01 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 +--recompute-granularity full --recompute-method uniform --no-gradient-accumulation-fusion --log-interval 1 --save-interval 10000 --eval-interval 1000 --eval-iters 10 --fp16 +``` + 
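+
+下文同时给出了8卡全量dump与仅指定rank0 dump两组耗时数据。其中指定rank0 dump沿用“工具配置信息”一节中的多卡配置,为便于对照在此重列(仅为示意,接口与参数以该节为准):
+
+```python
+# 多卡场景下仅dump rank0的数据,可显著减少落盘数据量
+debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", rank=0)
+debugger.configure_hook(mode="api_stack")
+```
+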
+dump保存API统计信息的pkl文件耗时:**3.3s** + +进行8卡dump全部API级别输入输出数据以及相应堆栈信息耗时:**53s** + +- dump存盘的api numpy文件大小:145G + + ![输入图片说明](img/GPT-3_1.png) + +- API numpy文件数量:5130个 + + ![输入图片说明](img/GPT-3_2.png) + + +**经测试8卡同时写入磁盘已达到磁盘I/O上限,工具的dump速度取决于磁盘性能,本机环境多进程写入磁盘上限为3GB/秒左右,理论上保存145GB的数据需要50秒左右,如果dump的数据中包含许多的小文件,那么耗时将会更久。** + +指定rank0 dump耗时:**9s** + +- dump存盘的api numpy文件大小:19G + + ![输入图片说明](img/GPT-3_3.png) + +- api numpy文件数量:643个 + + ![输入图片说明](img/GPT-3_4.png) + + +#### NUM_LAYER:8 + +8卡 + +主要数据类型:FLOAT16 + +启动命令参数: + +``` +python3 -m torch.distributed.launch $DISTRIBUTED_ARGS ../../pretrain_gpt_ptdbg.py --num-layers 8 --hidden-size 12288 --num-attention-heads 24 --micro-batch-size 2 --global-batch-size 2 --seq-length 1024 --max-position-embeddings 1024 --train-iters 10 --lr-decay-iters 320000 --save $CHECKPOINT_PATH --load $CHECKPOINT_PATH --data-path $DATA_PATH --tensor-model-parallel-size 8 --use-distributed-optimizer --pipeline-model-parallel-size 8 --vocab-file gpt2-vocab.json --merge-file gpt2-merges.txt --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.375e-5 --lr-decay-style cosine --min-lr 0.375e-6 --weight-decay 0.1 --clip-grad 1.0 --lr-warmup-fraction .01 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --recompute-granularity full --recompute-method uniform --no-gradient-accumulation-fusion --log-interval 1 --save-interval 10000 --eval-interval 1000 --eval-iters 10 --fp16 +``` + +dump保存API统计信息的pkl文件耗时:**6.7s** + +进行8卡dump全部API级别输入输出数据以及相应堆栈信息耗时:**323s** + +- dump存盘的API numpy文件大小:878G + + ![输入图片说明](img/GPT-3_5.png) + +- API numpy文件数量:24002个 + + ![输入图片说明](img/GPT-3_6.png) + + +指定rank0 dump耗时:**47s** + +- dump存盘的API numpy文件大小:110G + + ![输入图片说明](img/GPT-3_7.png) + +- API numpy文件数量:3002个 + + ![输入图片说明](img/GPT-3_8.png) + + +### BLOOM-7B + +8卡 + +NUM_LAYER:1 + +主要数据类型:BFLOAT16 + +启动命令参数: + +``` +python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_llama.py --DDP-impl local --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --sequence-parallel --num-layers 1 --hidden-size 12288 --position-embedding-type rope --normalization RMSNorm --ffn-hidden-size 11008 --num-attention-heads 24 --attention-dropout 0.0 --hidden-dropout 0.0 --init-method-std 0.01 --micro-batch-size 2 --global-batch-size 2 --seq-length 1024 --max-position-embeddings 1024 --data-path $DATA_PATH --tokenizer-name-or-path $TOKENIZER_PATH --tokenizer-not-use-fast --split 100,0,0 --distributed-backend nccl --lr 1.25e-5 --min-lr 1.25e-6 --lr-decay-style cosine --weight-decay 1e-1 --clip-grad 1.0 --initial-loss-scale 65536.0 --adam-beta1 0.9 --adam-beta2 0.95 --log-interval 1 --load ${LOAD_CHECKPOINT_PATH} --save ${SAVE_CHECKPOINT_PATH} --save-interval 10000 --eval-interval 10000 --eval-iters 0 --use-fused-rotary-pos-emb --no-masked-softmax-fusion --no-load-optim --no-load-rng --train-iters 20 --lr-warmup-fraction 0.01 --mlp-layer-fusion --use-flash-attn --use-fused-rmsnorm --bf16 +``` + +dump保存API统计信息的pkl文件耗时:**3s** + +进行8卡dump全部API级别输入输出数据以及相应堆栈信息耗时:**61s** + +- dump存盘的API numpy文件大小:160G + + ![输入图片说明](img/BLOOM-7B_1.png) + +- API numpy文件数量:4924个 + + ![输入图片说明](img/BLOOM-7B_2.png) + + +指定rank0 dump耗时:**17s** + +- dump存盘的API numpy文件大小:20G + + ![输入图片说明](img/BLOOM-7B_3.png) + +- API numpy文件数量:633个 + + ![输入图片说明](img/BLOOM-7B_4.png) + diff --git a/debug/accuracy_tools/atat/pytorch/doc/dump.md b/debug/accuracy_tools/atat/pytorch/doc/dump.md new file mode 100644 index 0000000000000000000000000000000000000000..44a4d09341dabef64306ee2c7ec7463a4fb367d4 --- /dev/null +++ 
b/debug/accuracy_tools/atat/pytorch/doc/dump.md @@ -0,0 +1,206 @@ +# **精度数据采集** + +atat工具主要通过在训练脚本内添加dump接口并启动训练的方式来采集精度数据。 + +执行dump操作需要安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +## dump接口介绍 + +### PrecisionDebugger + +**功能说明** + +通过加载dump配置文件的方式来确定dump操作的详细配置。 + +可以在from atat.pytorch import PrecisionDebugger和模型初始化之间的任意位置添加该接口。 + +**原型** + +```Python +PrecisionDebugger(config_path=None, task=None, dump_path=None, level=None, model=None, step=None) +``` + +说明:上述参数除config_path和model外,其他参数均在[config.json](../../config)文件中可配,此处的参数优先级高于config.json文件中的配置,而config.json文件可以配置更多参数,若需要进行更多场景的精度数据dump,建议配置[config.json](../../config)文件。 + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ----------- | ------------------------------------------------------------ | -------- | +| config_path | 指定dump配置文件路径,String类型。参数示例:"./config.json"。未配置该路径时,默认使用../../config目录下的config.json文件的默认配置。 | 否 | +| task | dump的任务类型,String类型。可取值"statistics"(仅dump API统计信息)、"tensor"(dump API统计信息和完全复刻整网的API运行情况的真实数据)、"overflow_check"(溢出检测),默认未配置,取"statistics",参数示例:task="tensor"。 | 否 | +| dump_path | 设置dump数据目录路径,String类型。参数示例:dump_path="./dump_path"。 | 是 | +| level | dump级别,根据不同级别dump不同数据,String类型。可取值:
"L0":dump module模块级精度数据,仅PyTorch场景支持”。
"L1":dump API级精度数据,默认值。
"L2":dump kernel级精度数据。
"mix":dump module模块级和API级精度数据。
配置示例:level="L1"。 | 否 |
+| model | 指定具体的torch.nn.Module,默认未配置,level配置为"L0"或"mix"时必须配置该参数。配置示例参见“**model配置代码示例**”。 | 否 |
+| step | 指定dump某个step的数据,list[int]类型。默认未配置,表示dump所有step数据。dump特定step时,须指定为训练脚本中存在的step。step为list格式,可配置逐个step,例如:step=[0,1,2]。 | 否 |
+
+#### model配置代码示例
+
+示例中定义了一个nn.Module类型的简单网络,在进行数据dump时使用原型函数PrecisionDebugger并传入config_path参数和model参数,其中model参数传入数据的类型为torch.nn.Module类型或torch.nn.Module子类型。
+
+```python
+# 根据需要import包
+import os
+import torch
+import torch.nn as nn
+import torch_npu
+import torch.nn.functional as F
+from atat.pytorch import PrecisionDebugger
+
+torch.npu.set_device("npu:0")
+
+
+# 定义一个简单的网络
+class ModuleOP(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.linear_1 = nn.Linear(in_features=8, out_features=4)
+        self.linear_2 = nn.Linear(in_features=4, out_features=2)
+
+    def forward(self, x):
+        x1 = self.linear_1(x)
+        x2 = self.linear_2(x1)
+        r1 = F.relu(x2)
+        return r1
+
+
+if __name__ == "__main__":
+    module = ModuleOP()
+
+    # 注册工具
+    debugger = PrecisionDebugger('./config.json', model=module)
+    debugger.start()
+
+    x = torch.randn(10, 8)
+    out = module(x)
+    loss = out.sum()
+    loss.backward()
+    debugger.stop()
+```
+
+### start函数
+
+**功能说明**
+
+启动函数。
+
+在模型初始化之后的任意位置添加。
+
+**原型**
+
+```Python
+debugger.start()
+```
+
+该函数为类函数,可以使用debugger.start()也可以使用PrecisionDebugger.start()。
+
+### stop函数
+
+**功能说明**
+
+停止函数。
+
+在**start**函数之后的任意位置添加。
+
+**原型**
+
+```Python
+debugger.stop()
+```
+
+该函数为类函数,可以使用debugger.stop()也可以使用PrecisionDebugger.stop()。
+
+### step函数
+
+**功能说明**
+
+结束标识。
+
+在最后一个**stop**函数后或一个step结束的位置添加。
+
+**原型**
+
+```Python
+debugger.step()
+```
+
+该函数为类函数,可以使用debugger.step()也可以使用PrecisionDebugger.step()。
+
+## 示例代码
+
+```Python
+from atat.pytorch import PrecisionDebugger
+debugger = PrecisionDebugger(config_path="./config.json", dump_path="./dump_path")
+# 请勿将以上初始化流程插入到循环代码中
+
+# 模型初始化
+# 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop()
+debugger.start()
+
+# 需要dump的代码片段1
+
+debugger.stop()
+debugger.start()
+
+# 需要dump的代码片段2
+
+debugger.stop()
+debugger.step()
+```
+
+## dump结果文件介绍
+
+训练结束后,工具将dump的数据保存在dump_path参数指定的目录下。
+
+dump结果目录结构示例如下:
+
+```Python
+├── dump_path
+│ ├── step0
+│ | ├── rank0
+│ | │ ├── dump_tensor_data
+| | | | ├── Tensor.permute.1.forward.pt
+| | | | ├── MyModule.0.forward.input.pt # 开启模块级精度数据dump时存在模块级的dump数据文件
+| | | | ...
+| | | | └── Functional.linear.5.backward.output.pt
+│ | | ├── dump.json # 保存前反向算子、算子的统计量信息或溢出算子信息。包含dump数据的API名称(命名格式为:`{api_type}_{api_name}_{API调用次数}_{前向反向}_{input/output}.{参数序号}`)、dtype、shape、各数据的max、min、mean、L2norm统计信息以及当配置summary_mode="md5"时的md5数据。其中,“参数序号”表示该API下的第n个参数,例如1,则为第一个参数,若该参数为list格式,则根据list继续排序,例如1.1,表示该API的第1个参数的第1个子参数;L2norm表示2范数(平方根)
+│ | | ├── stack.json # 算子调用栈信息
+│ | | └── construct.json # 分层分级结构
+│ | ├── rank1
+| | | ├── dump_tensor_data
+| | | | └── ...
+│ | | ├── dump.json
+│ | | ├── stack.json
+| | | └── construct.json
+│ | ├── ...
+│ | |
+| | └── rank7
+│ ├── step1
+│ | ├── ... 
+│ ├── step2 +``` + +dump过程中,pt文件在对应算子或者模块被执行后就会落盘,而json文件则需要在正常执行PrecisionDebugger.stop()或set_dump_switch("OFF")后才会被落盘保存,异常的程序终止会保存终止前被执行算子的相关pt文件,但是不会生成json文件。 + +其中`dump_{version}`为默认命名,debugger方式dump不支持修改该文件夹名称;rank为设备上各卡的ID,每张卡上dump的数据会生成对应dump目录。 + +pt文件保存的前缀和PyTorch对应关系如下: + +| 前缀 | Torch模块 | +| ----------- | ------------------- | +| Tensor | torch.Tensor | +| Torch | torch | +| Functional | torch.nn.functional | +| NPU | NPU亲和算子 | +| VF | torch._VF | +| Aten | torch.ops.aten | +| Distributed | torch.distributed | + +## 工具支持的API列表 + +atat工具维护固定的API支持列表,若需要删除或增加dump的API,可以在atat/pytorch/hook_module/support_wrap_ops.yaml文件内手动修改,如下示例: + +```Python +functional: # functional为算子类别,找到对应的类别,在该类别下按照下列格式删除或添加API + - conv1d + - conv2d + - conv3d +``` + +# FAQ + +[FAQ](./FAQ.md) diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_1.png b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_1.png new file mode 100644 index 0000000000000000000000000000000000000000..3853626d6fab127915425238c76f16836a4cef0b Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_1.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_2.png b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_2.png new file mode 100644 index 0000000000000000000000000000000000000000..732abb496a4e0b171be8e21e70c09ea492383fe8 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_2.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_3.png b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_3.png new file mode 100644 index 0000000000000000000000000000000000000000..7f06074887e9a9df9b601dba6d26a45d924245d5 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_3.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_4.png b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_4.png new file mode 100644 index 0000000000000000000000000000000000000000..b0bd9d40a5b9e414f7713c121fb2adb52d908bf6 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_4.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_1.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_1.png new file mode 100644 index 0000000000000000000000000000000000000000..d633249c2ccd9b365c825374fe5472f33dd812f4 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_1.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_2.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_2.png new file mode 100644 index 0000000000000000000000000000000000000000..c4748ca479a85fe1857e2b599569714f4f8dfbd3 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_2.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_3.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_3.png new file mode 100644 index 0000000000000000000000000000000000000000..b20c34d943e9765f640cebb3820370c9e99b5527 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_3.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_4.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_4.png new file mode 100644 index 0000000000000000000000000000000000000000..1dbf6600e742d49e9022fe3696c9ff2adcdbaf39 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_4.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_5.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_5.png new file mode 100644 index 
0000000000000000000000000000000000000000..5bf1c26d3d8734b14e003dcf1b26361ed300217d Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_5.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_6.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_6.png new file mode 100644 index 0000000000000000000000000000000000000000..1c326caa7f4e17c924456461622eedaabcd18362 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_6.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_7.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_7.png new file mode 100644 index 0000000000000000000000000000000000000000..2ee73472850e27e18da472aa3a42f747f7239d7e Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_7.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_8.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_8.png new file mode 100644 index 0000000000000000000000000000000000000000..6bedfab396f306478df4f6ce8869b3ab65c92b5e Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_8.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_1.png b/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_1.png new file mode 100644 index 0000000000000000000000000000000000000000..791e49c178ef3d88bdbc7ca4eae2e78ffc35e250 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_1.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_2.png b/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_2.png new file mode 100644 index 0000000000000000000000000000000000000000..61357f3feb95b5f70d6317dbfef7b516c0492449 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_2.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_details.png b/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_details.png new file mode 100644 index 0000000000000000000000000000000000000000..6c39840f633743b3b1dd54dfa5307c0e82ba20b7 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_details.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_result.png b/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_result.png new file mode 100644 index 0000000000000000000000000000000000000000..7e81606cb3a53364bb069393882ef162c8ce454c Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_result.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_details.png b/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_details.png new file mode 100644 index 0000000000000000000000000000000000000000..b31ba52350989488df1245f4840886f2a7db4944 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_details.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_result.png b/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_result.png new file mode 100644 index 0000000000000000000000000000000000000000..8087d83c8035d2939bebc4ecd9f213c0810d88c7 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_result.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/auto_analyze_log.png b/debug/accuracy_tools/atat/pytorch/doc/img/auto_analyze_log.png new file mode 100644 index 
0000000000000000000000000000000000000000..999b47f97ef5661316c7e61dbdc93c87996259f3 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/auto_analyze_log.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl.png b/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl.png new file mode 100644 index 0000000000000000000000000000000000000000..c64e9380c6d9c01bb2ad18c81e430ead0800bb7d Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl_md5.png.png b/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl_md5.png.png new file mode 100644 index 0000000000000000000000000000000000000000..81ba1935e69218467b006f05dfffbe54f3f04cb4 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl_md5.png.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/cpu_info.png b/debug/accuracy_tools/atat/pytorch/doc/img/cpu_info.png new file mode 100644 index 0000000000000000000000000000000000000000..744d237e975e555160ecdc391810a3681d05252a Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/cpu_info.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/module_compare.png b/debug/accuracy_tools/atat/pytorch/doc/img/module_compare.png new file mode 100644 index 0000000000000000000000000000000000000000..2e1ea564eb191807034afd8aceac92b29b62a086 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/module_compare.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/parse_tool.md b/debug/accuracy_tools/atat/pytorch/doc/parse_tool.md new file mode 100644 index 0000000000000000000000000000000000000000..23000912910e8f95b4cb74c7983961918bd9a513 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/parse_tool.md @@ -0,0 +1,286 @@ +# **数据解析工具** + +数据解析工具(parse_tool)提供命令行交互式界面,提供更多的数据解析功能并且展示结果。 + +使用场景:本工具主要用于精度比对前后两次NPU kernel层级dump数据的一致性。 + +## 进入parse交互式界面 + +安装atat工具后(详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节),可以通过使用命令 **atat -f pytorch parse** 进入交互式界面,如下所示: + +```bash +atat -f pytorch parse +Parse >>> +``` + +可在parse的界面中执行Shell命令,以及如下场景的相关解析命令: + +- 支持指定kernel层级算子数据比对。 +- 支持指定kernel层级算子数据转换及展示。 +- 支持交互式指定pkl文件中API对应dump数据查看。 +- 支持API进行可选层级比对和打印(统计级和像素级)。 + +Ctrl+C可以退出parse交互式界面。不退出parse交互式界面若需要执行非该界面下的内置Shell命令,且命令与parse交互式界面命令冲突时,非该界面命令需要使用run命令,在相关命令前加上run前缀,如下示例: + +```bash +atat -f pytorch parse +Parse >>> run vim cli.py +Parse >>> vim cli.py +``` + +以上各场景详细介绍请参见下文章节。 + +## kernel层级算子数据批量转换 + +本功能会将原有待比对dump数据目录下的dump数据按照算子名和时间戳进行梳理并分类,之后再将dump数据转为为npy文件。 + +依赖:CANN包中的msaccucmp工具,需要安装Ascend-CANN-toolkit,详见《[CANN 软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)》。 + +输入以下比对命令进行数据转换。 + +```bash +cad -m my_dump_path [-out output_path] [-asc msaccucmp_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| -------- | ------------------------------------------------------------ | -------- | +| -m | 待转换kernel dump数据目录。需要指定到kernel dump数据的deviceid级目录。 | 是 | +| -out | 结果输出目录,须指定已存在的目录,默认为./parse_data/acl_batch_convert。未指定时保存在默认路径下,比对结束后会打印log提示输出结果存放路径。 | 否 | +| -asc | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py。 | 否 | + +**示例** + +``` +# 传入待比对数据目录 +Parse >>> cad -m /home/xxx/my_dump_path/20000124003856/0 +# 转换结果打印 +...... 
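+# 每个被转换的dump文件会生成对应的input/output npy文件,转换结果示意如下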
+╭──────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +# 转换前的dump文件 +│ SrcFile: /home/xxx/my_dump_path/20000124003856/0/272/TransData.trans_TransData_22.112.21.948645536672764 │ +# 转换后的npy文件 +│ - TransData.trans_TransData_22.112.21.948645536672764.output.0.npy │ +│ - TransData.trans_TransData_22.112.21.948645536672764.input.0.npy │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +...... +[INFO] The comparison result have been written to "./parse_data/acl_batch_convert". +``` + +输出结果: + +原dump数据目录: + +``` +├── /home/xxx/my_dump_path/20000124003856/0/ +│ ├── 272 +│ │ ├── {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp} +│ │ ... +│ ├── 512 +│ ... +``` + +转换后: + +``` +├── ./parse_data/acl_batch_convert/{timestamp} +│ ├── {op_name1} +│ │ ├── {timestamp1} +│ │ | ├── {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input/output}.{参数序号}.npy +│ │ | │ ... +│ │ ├── {timestamp2} +│ │ | ... +│ ├── {op_name2} +│ ├── ... +``` + +## kernel层级算子数据比对 + +本功能主要用于比对前后两次NPU kernel层级dump数据的一致性。 + +本功能支持批量比对,若需要进行批量比对,需要先将两份待比对的NPU kernel层级dump数据进行“**kernel层级算子数据批量转换**”,可以使两份数据更好的匹配;若直接进行dump数据的比对,建议只比对单个dump数据文件。 + +输入以下比对命令进行数据比对。 + +```bash +vc -m my_dump_path -g golden_dump_path [-out output_path] [-cmp_path msaccucmp_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| --------- | ------------------------------------------------------------ | -------- | +| -m | 待比对kernel dump数据目录。如果比对单个算子,需要指定到kernel dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | +| -g | 标杆kernel dump数据目录。如果比对单个算子,需要指定到kernel dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | +| -out | 结果输出目录,须指定已存在的目录,默认为./parse_data/acl_batch_comapre。未指定时保存在默认路径下,比对结束后会打印log提示输出结果存放路径。 | 否 | +| -cmp_path | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py | 否 | + +输出结果:batch_compare_{timestamp}.csv文件。 + +**示例** + +```bash +# 传入待比对数据目录以及标杆数据目录 +Parse >>> vc -m ./my_dump_path -g ./golden_data_path +[INFO]Compare result is saved in : parse_data/acl_batch_comapre/batch_compare_1707271118.csv +``` + +## kernel算子数据的npy转换 + +依赖:CANN包中的msaccucmp工具,需要安装Ascend-CANN-toolkit,详见《[CANN 软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)》。 + +输入以下转换命令进行数据转换, 将kernel级别dump数据转为npy文件。 + +```bash +dc -n file_name/file_path [-f format] [-out output_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| --------- | ------------------------------------------------------------ | -------- | +| -n | 需转换的dump数据文件或dump数据文件目录。 | 是 | +| -f | 开启format转换,指定该参数时需要配置format格式。当前内置的Format转换支持如下类型: FRACTAL_NZ转换NCHW FRACTAL_NZ转换成NHWC FRACTAL_NZ转换ND HWCN转换FRACTAL_Z HWCN转换成NCHW HWCN转换成NHWC NC1HWC0转换成HWCN NC1HWC0转换成NCHW NC1HWC0转换成NHWC NCHW转换成FRACTAL_Z NCHW转换成NHWC NHWC转换成FRACTAL_Z NHWC转换成HWCN NHWC转换成NCHW NDC1HWC0转换成NCDHW | 否 | +| -out | 结果输出目录。 | 否 | +| -cmp_path | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py | 否 | + +- 输出结果:npy文件。 + +- 若指定-out参数需要用户传入输出路径,并且路径需要已存在。 + +- 若未指定输出目录, 则比对结束后将结果保存在默认目录 “./parse_data/convert_result”中,比对结束后会打印log提示输出结果存放路径及转换结果。 + +- 输入以下命令,展示npy数据统计信息。 + + ```bash + pt -n file_path + ``` + + | 参数名称 | 说明 | 是否必选 | + | -------- | ------------- | -------- | + | -n | npy文件路径。 | 是 | + + 打印统计信息:shape, dtype, max, min和mean。默认在npy文件路径下将该数据保存为txt文件。 + +**示例1** + +```bash +# 传入需转换的dump文件目录 +Parse >>> dc -n ./dump_data/ +...... 
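+# dc会调用msaccucmp将目录下的kernel dump文件逐个解析为npy文件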
+# 转换结果 +╭──────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ SrcFile: ./dump_data/ +│ - Add.fp32_vars_add_2fp32_vars_Relu_9.31.5.1636595794731103.input.0.npy │ +│ - Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.output.0.npy │ +│ - Add.fp32_vars_add_2fp32_vars_Relu_9.31.5.1636595794731103.input.1.npy │ +│ - Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.1.npy │ +│ - Add.fp32_vars_add_3fp32_vars_Relu_12.40.5.1636595794846124.input.1.npy │ +│ - Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.0.npy │ +│ - Add.fp32_vars_add_3fp32_vars_Relu_12.40.5.1636595794846124.input.0.npy │ +│ - Add.fp32_vars_add_2fp32_vars_Relu_9.31.5.1636595794731103.output.0.npy │ +│ - Add.fp32_vars_add_3fp32_vars_Relu_12.40.5.1636595794846124.output.0.npy │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + +**示例2** + +```bash +# 查看某个dump数据块的数据信息 +# 默认会将数据中的tensor保存成 txt +Parse >>> pt -n ./parse_data/dump_convert/Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.output.0.npy +...... +# 打印统计信息 +[Shape: (1, 16, 56, 56, 16)] [Dtype: float16] [Max: 452.0] [Min: -408.5] [Mean: -3.809] +Path: ./parse_data/dump_convert/Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.0.npy +TextFile:./parse_data/dump_convert/Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.0.npy.txt +``` + +## dump.json文件中指定API的dump数据信息查看(暂不支持) + +输入以下命令,解析并输出dump.json文件中指定API的统计信息。 + +```bash +pk -f pkl_path -n api_name +``` + +| 参数名称 | 说明 | 是否必选 | +| -------- | ----------------------- | -------- | +| -f | 指定dump.json文件路径。 | 是 | +| -n | 指定API名称。 | 是 | + +- 输出结果:打印统计信息(shape, dtype, max和min mean)。 +- 若pkl文件中存在相应的堆栈信息,则会打印堆栈信息。 + +**示例** + +```bash +# 传入pkl文件及api名称 +Parse >>> pk -f ./torch_dump/xxx/rank0/dump.json -n Functional_conv2d_0_forward +...... 
+# 打印统计信息及堆栈(pkl文件不包含堆栈则不会打印堆栈) + +Statistic Info: + [Functional_conv2d_0_forward_input.0][dtype: torch.float32][shape: [2, 1, 2, 2]][max: 1.576936960220337][min: -0.9757485389709473][mean: 0.4961632490158081] + [Functional_conv2d_0_forward_input.1][dtype: torch.float32][shape: [2, 1, 2, 2]][max: 0.20064473152160645][min: -0.47102075815200806][mean: -0.20796933770179749] + [Functional_conv2d_0_forward_input.2][dtype: torch.float32][shape: [2]][max: 0.17380613088607788][min: -0.16853803396224976][mean: 0.0026340484619140625] + [Functional_conv2d_0_forward_output][dtype: torch.float32][shape: [2, 2, 1, 1]][max: 0.02364911139011383][min: -1.762906551361084][mean: -0.6710853576660156] +``` + +## API可选层级比对 + +输入以下命令, 进行统计级和像素级比对。 + +```bash +cn -m my_data*.npy -g gloden*.npy [-p num] [-al atol] [-rl rtol] +``` + +- 统计级比对:对tensor整体进行余弦值及相对误差的计算。 +- 像素级比对:对输入的两个npy文件进行逐元素比对。若两个tensor对应元素的相对误差或绝对误差大于**误差阈值**(-al和-rl配置)则被标记为错误数据。 + +| 参数名称 | 说明 | 是否必选 | +| -------- | ----------------------------------------------- | -------- | +| -m | 待比对数据。 | 是 | +| -g | 标杆数据。 | 是 | +| -p | 设置比对结束后打印错误元素的个数,默认值20。 | 否 | +| -al | 判定数据存在精度问题的绝对误差阈值,默认0.001。 | 否 | +| -rl | 判定数据存在精度问题的相对误差阈值,默认0.001。 | 否 | +| -s | 将npy文件保存成txt文件,用于查看,默认开启。 | 否 | + +输出结果: + +- 统计级比对结果。 +- 两个文件的统计信息(shape, dtype, max, min和mean)。 +- 错误数据打印表格。 + +**示例** + +```bash +# 对比两个tensor的数据 +Parse >>> cn -m Add.InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.323.1619494134703053.output.0.npy -g InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.0.1619492699305998.npy -p 10 -s -al 0.002 -rl 0.005 + Error Item Table Top Item Table +┏━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┏━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ +┃ Index ┃ Left ┃ Right ┃ Diff ┃ ┃ Index ┃ Left ┃ Right ┃ Diff ┃ +┡━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ ┡━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ +│ 155 │ 0.024600908 │ 0.022271132 │ 0.002329776 │ │ 0 │ -0.9206961 │ -0.9222216 │ 0.0015255213 │ +│ 247 │ 0.015752593 │ 0.017937578 │ 0.0021849852 │ │ 1 │ -0.6416973 │ -0.64051837 │ 0.0011789203 │ +│ 282 │ -0.0101207765 │ -0.007852031 │ 0.0022687456 │ │ 2 │ -0.35383835 │ -0.35433492 │ 0.0004965663 │ +│ 292 │ 0.019581757 │ 0.02240482 │ 0.0028230622 │ │ 3 │ -0.18851271 │ -0.18883198 │ 0.00031927228 │ +│ 640 │ -0.06593232 │ -0.06874806 │ 0.0028157383 │ │ 4 │ -0.43508735 │ -0.43534422 │ 0.00025686622 │ +│ 1420 │ 0.09293677 │ 0.09586689 │ 0.0029301196 │ │ 5 │ 1.4447614 │ 1.4466647 │ 0.0019032955 │ +│ 1462 │ -0.085207745 │ -0.088047795 │ 0.0028400496 │ │ 6 │ -0.3455438 │ -0.3444429 │ 0.0011008978 │ +│ 1891 │ -0.03433288 │ -0.036525503 │ 0.002192624 │ │ 7 │ -0.6560242 │ -0.6564579 │ 0.0004336834 │ +│ 2033 │ 0.06828873 │ 0.07139922 │ 0.0031104907 │ │ 8 │ -2.6964858 │ -2.6975214 │ 0.0010356903 │ +│ 2246 │ -0.06376442 │ -0.06121233 │ 0.002552092 │ │ 9 │ -0.73746175 │ -0.73650354 │ 0.00095820427 │ +└───────┴───────────────┴──────────────┴──────────────┘ └───────┴─────────────┴─────────────┴───────────────┘ +╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ Left: | +│ |- NpyFile: ./dump/temp/decode/Add.InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.323.1619494134703053.output.0.npy | +│ |- TxtFile: ./dump/temp/decode/Add.InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.323.1619494134703053.output.0.npy.txt | +│ |- NpySpec: [Shape: (32, 8, 8, 320)] [Dtype: float32] [Max: 5.846897] [Min: -8.368301] [Mean: -0.72565556] | +│ DstFile: │ +│ |- NpyFile: 
./dump/cpu/InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.0.1619492699305998.npy | +│ |- TxtFile: ./dump/cpu/InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.0.1619492699305998.npy.txt | +│ |- NpySpec: [Shape: (32, 8, 8, 320)] [Dtype: float32] [Max: 5.8425903] [Min: -8.374472] [Mean: -0.7256237] │ +│ NumCnt: 655360 │ +│ AllClose: False │ +│ CosSim: 0.99999493 │ +│ ErrorPer: 0.023504638671875 (rl= 0.005, al= 0.002) │ +╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + diff --git a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_compare.md b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_compare.md new file mode 100644 index 0000000000000000000000000000000000000000..9beda3b02f2d72383a2bcaa4c20bcd9c5b8ba971 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_compare.md @@ -0,0 +1,153 @@ +# **精度比对工具** + +## CPU或GPU与NPU精度数据比对 + +### 总体说明 + +- 本节主要介绍CPU或GPU与NPU精度数据比对的函数以及示例,执行精度比对操作前需要先完成CPU或GPU与NPU的精度数据dump,详见《[精度数据采集](./dump.md)》。 + +- 比对函数均通过单独创建精度比对脚本执行,可支持单卡和多卡场景的精度数据比对。 + +- 工具性能:比对数据量较小时(参考值单份文件小于10GB),参考比对速度0.1GB/s;比对数据量较大时,参考比对速度0.3GB/s。 推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。 + + 用户环境性能弱于标准约束或非独占使用的比对速度酌情向下浮动。比对速度的计算方式:两份比对文件大小/比对耗时。 + +### 约束 + +- NPU自研API,在CPU或GPU若没有对应的API,该API的dump数据不比对。 +- NPU与CPU或GPU的计算结果误差可能会随着模型的执行不断累积,最终会出现同一个API因为输入的数据差异较大而无法比对的情况。 +- CPU或GPU与NPU中两个相同的API会因为调用次数不同导致无法比对或比对到错误的API,不影响整体运行,该API忽略。 + +### compare_distributed + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,支持单卡和多卡,可同时比对多卡的dump数据。多机场景需要每个设备单独执行比对操作。可自动检索和匹配对应卡和进程所dump的数据文件,再调用compare进行比对。单机单卡时与compare函数二选一。 + +**函数原型** + +```Python +compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| npu_dump_dir | 配置NPU环境下的dump目录。dump数据目录须指定到step级。参数示例:'./npu_dump/step0'。数据类型:str。 | 是 | +| bench_dump_dir | 配置CPU、GPU或NPU环境下的dump目录。参数示例:'./gpu_dump/step0'。数据类型:str。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。需要预先创建output_path目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_rank{npu_ID}-rank{cpu/gpu/npu_ID}_{timestamp}.csv`。数据类型:str。 | 是 | +| **kwargs | 支持compare的所有可选参数。 | 否 | + +**函数示例** + +创建比对脚本,例如compare_distributed.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```Python +from atat.pytorch import * +compare_distributed('./npu_dump/step0', './gpu_dump/step0', './output') +``` + +dump数据目录须指定到step级。 + +### compare + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,仅支持单机单卡。 + +**函数原型** + +```Python +compare(input_param, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------ | ------------------------------------------------------------ | -------- | +| input_param | 配置dump数据文件及目录。数据类型:dict。配置参数包括:
"npu_json_path":指定NPU dump目录下的dump.json文件。参数示例:"npu_json_path": "./npu_dump/dump.json"。必选。
"bench_json_path":指定CPU、GPU或NPU dump目录下的dump.json文件。参数示例:"bench_json_path": "./gpu_dump/dump.json"。必选。
"stack_json_path":指定NPU dump目录下的stack.json文件。参数示例:"stack_json_path": "./npu_dump/stack.json"。可选。
"is_print_compare_log":配置是否开启日志打屏。可取值True或False。可选。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.csv`。数据类型:str。 | 是 | +| stack_mode | 配置stack_mode的开关。仅当配置"stack_json_path"需要开启。可取值True或False,参数示例:stack_mode=True,默认为False。数据类型:bool。 | 否 | +| auto_analyze | 自动精度分析,开启后工具自动针对比对结果进行分析,识别到第一个精度不达标节点(在比对结果文件中的“Accuracy Reached or Not”列显示为No),并给出问题可能产生的原因(打屏展示并生成advisor_{timestamp}.txt文件)。可取值True或False,参数示例:auto_analyze=False,默认为True。数据类型:bool。 | 否 | +| fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。可取值True或False,参数示例:fuzzy_match=True,默认为False。数据类型:bool。 | 否 | + +**函数示例** + +单机单卡场景下创建比对脚本,例如compare.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```Python +from atat.pytorch import compare +dump_result_param={ +"npu_json_path": "./npu_dump/dump.json", +"bench_json_path": "./gpu_dump/dump.json", +"stack_json_path": "./npu_dump/stack.json", +"is_print_compare_log": True +} +compare(dump_result_param, output_path="./output", stack_mode=True) +``` + +### 统计量比对 + +若使用**compare**或**compare_distributed**函数创建的比对脚本中,在[config.json](../../config/config.json)文件中配置"task": "statistics"方式dump时,可以进行统计量比对,此时比对dump.json文件中的统计信息,开启后的比对结果文件生成Max diff、Min diff、Mean diff和L2norm diff,表示NPU dump数据中API的输入或输出与标杆数据输入或输出的最大值、最小值、平均值以及L2范数的差。可以通过该值判断API是否存在精度问题:当某个API的输入和输出的Max diff、Min diff、Mean diff和L2norm diff均为0或无限趋于0,那么可以判断该API无精度问题,反之则可能存在精度问题。 + +**比对脚本示例** + +以compare.py为例。 + +```Python +from atat.pytorch import compare +dump_result_param={ +"npu_json_path": "./npu_dump/dump.json", +"bench_json_path": "./gpu_dump/dump.json", +"stack_json_path": "./npu_dump/stack.json", +"is_print_compare_log": True +} +compare(dump_result_param, output_path="./output", stack_mode=True) +``` + +**比对结果** + +数据量比对同样生成`compare_result_{timestamp}.csv`和`advisor_{timestamp}.txt`文件。其中`advisor_{timestamp}.txt`主要对`compare_result_{timestamp}.csv`中可能存在精度问题(Result为Waring)的API提出定位建议;`compare_result_{timestamp}.csv`主要有如下两种情况: + +- "summary_mode": "statistics"时比对dump.json文件: + + ![compare_result_pkl](img/compare_result_pkl.png) + + 上图是对dump.json文件中NPU及标杆API的统计信息进行比对,判断可能存在精度问题的API,文件中记录NPU及标杆API的基本信息和统计信息,其中需要关注Result列,包含结果:Waring(NPU与标杆统计信息的比对中存在相对误差大于0.5,则需要重点检查该API);为空(相对误差小于等于0.5,可以不需要重点关注,但不代表不存在精度问题);Nan(表示统计信息数据没有匹配上)。 + +- "summary_mode": "md5"时比对dump.json文件: + + ![compare_result_pkl_md5.png](img/compare_result_pkl_md5.png.png) + + 上图是对dump.json文件中NPU及标杆API的MD5信息进行比对,判断API数据的完整性,文件中记录NPU及标杆API的基本信息和MD5信息,其中需要关注Result列,包含结果:Pass(表示NPU与标杆的MD5值一致,即API数据完整);Different(表示NPU与标杆的MD5值不一致,即API数据不完全一致,可以通过NPU_Stack_Info列API调用栈查询该API的详细信息);Nan(表示MD5信息数据没有匹配上)。 + +## 计算精度评价指标 + +通过计算精度评价指标可以直接从精度比对结果文件中找出不符合精度标准的算子。 + +PyTorch精度比对是以CPU或GPU的计算结果为标杆,计算Cosine(余弦相似度)、MaxAbsErr(最大绝对误差)和MaxRelativeErr(最大相对误差),根据这两个结果判断API在运行时是否存在精度问题。 + +计算精度评价指标: + +1. Cosine:通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0。 + +2. MaxAbsErr:当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001。 + +3. MaxRelativeErr:当最大相对误差越接近0表示其计算的误差越小。 + + 当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象。 + +4. One Thousandth Err Ratio(双千分之一)、Five Thousandths Err Ratio(双千分之五)精度指标:是指NPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差大于千分之一、千分之五的比例占总元素个数的比例小于千分之一、千分之五。该数据仅作为精度下降趋势的参考,并不参与计算精度是否通过的判定。 + +精度比对结果csv文件中只需要通过Accuracy Reached or Not来判断计算精度是否达标,判断标准如下: + +1. Cosine < 0.99 且 MaxAbsError > 0.001时,精度不达标,标记为“No”。 +2. Cosine < 0.9,精度不达标,标记为“No”。 +3. MaxAbsError > 1,精度不达标,标记为“No”。 +4. 
其余情况下记为精度达标,标记为“Yes”。 + +# FAQ + +[FAQ](./FAQ.md) + diff --git a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_overview.md b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_overview.md new file mode 100644 index 0000000000000000000000000000000000000000..708d90b3487c47249c5f6a8b0f37671e8918e7e2 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_overview.md @@ -0,0 +1,68 @@ +# **精度比对工具** + +## 简介 + +在PyTorch训练网络,对同一模型或API调试过程中,遇到API相关的计算精度问题,定位时费时费力。 + +atat的精度比对工具,用来进行PyTorch整网API粒度的数据dump、精度比对和溢出检测,从而定位PyTorch训练场景下的精度问题。 + +**使用场景** + +主要的使用场景包括: + +- 同一模型,从CPU或GPU移植到NPU中存在精度下降问题,对比NPU芯片中的API计算数值与CPU或GPU芯片中的API计算数值,进行问题定位。 +- 同一模型,进行迭代(模型、框架版本升级或设备硬件升级)时存在的精度下降问题,对比相同模型在迭代前后版本的API计算数值,进行问题定位。 + +## 原理介绍 + +精度对比工具,通过在PyTorch模型中注册hook,跟踪计算图中API的前向传播与反向传播时的输入与输出,排查存在计算精度误差,进行问题的精准定位。 + +**精度比对流程** + +1. 当模型在CPU或GPU上进行正向和反向传播时,分别dump每一层的数值输入与输出。 + +2. 当模型在NPU中进行计算时,采用相同的方式dump下相应的数据。 + +3. 通过对比dump出的数值,计算余弦相似度和最大绝对误差的方式,定位和排查NPU API存在的计算精度问题。如下图所示。 + + 精度比对逻辑图 + + ![module_compare](img/module_compare.png) + +**API匹配条件** + +进行精度比对时,需要判断CPU或GPU的API与NPU的API是否相同可比对,须满足以下匹配条件: + +- 两个API的名称相同,API命名规则:`{api_type}.{api_name}.{api调用次数}.{正反向}.{输入输出}.index`,如:Functional.conv2d.1.backward.input.0。 +- 两个API的输入输出Tensor数量和各个Tensor的Shape相同。 + +通常满足以上两个条件,工具就认为是同一个API,成功进行API的匹配,后续进行相应的计算精度比对。 + +## 精度比对总体流程 + +1. 准备CPU或GPU训练工程。 + +2. 在环境下安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +3. 在训练脚本内添加atat工具dump接口PrecisionDebugger采集标杆数据。详见《[精度数据采集](./dump.md)》。 + +4. 执行训练dump数据。 + +5. 将CPU或GPU训练工程迁移为NPU训练工程。详见《[PyTorch模型迁移调优指南](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html)》。 + +6. 在NPU环境下安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +7. 在NPU训练脚本内添加atat工具dump接口PrecisionDebugger采集标杆数据。详见《[精度数据采集](./dump.md)》。 + +8. NPU环境下执行训练dump数据。 + +9. 执行精度比对。 + + 1. 创建并配置精度比对脚本,例如compare.py。 + + 2. 执行CPU或GPU dump与NPU dump数据的精度比对。 + + 3. 比对结果分析。 + + 详见《[CPU或GPU与NPU精度数据比对](./ptdbg_ascend_compare.md)》。 + diff --git a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_quickstart.md b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_quickstart.md new file mode 100644 index 0000000000000000000000000000000000000000..ae6e3b0b4bbad4796b0332ee8a41b3ae14e5f94e --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_quickstart.md @@ -0,0 +1,381 @@ +# **精度比对工具** + +本文主要介绍atat的精度比对工具的快速入门和场景化示例。 + +本文介绍的操作需要安装atat工具,详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +本文介绍的操作主要是精度数据dump和精度比对,详细操作指导可参考《[精度数据采集](./dump.md)》和《[CPU或GPU与NPU精度数据比对](./ptdbg_ascend.md)》。 + +## 快速入门 + +### 单卡场景精度比对 + +**精度分析建议** + +PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: + +1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 + + 对于模型数据庞大(比如达到T级别)的场景,不推荐直接dump整网比对,整网dump可能导致磁盘不足,需要预留足够的存储空间或者分多次dump。 + +2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 + +3. 范围比对:对不符合精度标准的API重新dump详细信息。 + +4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 + +5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 + +6. 重复1~5步,直到不存在精度问题为止。 + +**精度分析示例** + +1. 修改dump配置文件config.json。 + + ```json + { + "task": "tensor", + "dump_path": "./npu_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "tensor": { + "scope": [], + "list": [], + "data_mode": ["all"], + "summary_mode": "statistics" + } + } + ``` + +2. 
在训练脚本内添加atat工具,dump整网数据。 + + 分别dump CPU或GPU以及NPU数据,在PyTorch训练脚本插入dump接口,示例代码如下(下面以NPU为例,CPU或GPU dump基本相同): + + ```python + from atat.pytorch import PrecisionDebugger + debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + +3. 比对整网数据。 + + 第1步中的NPU dump数据目录为npu_dump,假设GPU dump数据目录为gpu_dump;dump将生成dump.json、stack.json、construct.json文件以及dump数据目录。 + + 创建并配置精度比对脚本,以创建compare.py为例,示例代码如下: + + ```python + from atat.pytorch import compare + dump_result_param={ + "npu_json_path": "./npu_dump/dump.json", + "bench_json_path": "./gpu_dump/dump.json", + "stack_json_path": "./npu_dump/stack.json", + "is_print_compare_log": True + } + compare(dump_result_param, output_path="./output", stack_mode=True) + ``` + + 执行比对: + + ```bash +python3 compare.py + ``` + + 在output目录下生成结果文件,包括:`compare_result_{timestamp}.csv`和`advisor_{timestamp}.txt` + +4. 找出存在问题的API。 + + 1. 根据`advisor_{timestamp}.txt`或打屏信息的提示,可找到存在精度问题的算子(Suspect Nodes)和专家建议(Expert Advice)。 + + ![auto_analyze_log](img/auto_analyze_log.png) + + 2. 根据第2步结果文件`compare_result_{timestamp}.csv`中的Accuracy Reached or No字段显示为NO的API,针对该API执行后续比对操作,分析该API存在的精度问题。 + +5. (可选)重新比对。 + + 根据第3步的dump数据重新配置compare.py并执行比对,可以对单API模型进行问题复现。 + +**注意**:部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响kernel init初始化多次,导致功能异常。 + +### 溢出检测场景 + +溢出检测是针对NPU的PyTorch API,检测是否存在溢出的情况。当前仅支持识别aicore浮点溢出。 + +溢出检测原理:针对溢出阶段,开启acl dump模式,重新对溢出阶段执行,落盘数据。 + +建议按照如下步骤操作: + +1. 修改dump配置文件config.json。 + + ```json + { + "task": "overflow_check", + "dump_path": "./npu_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "overflow_check": { + "overflow_nums": 3 + } + } + ``` + +2. 在NPU训练脚本内添加atat工具,执行溢出检测dump。 + + ```python + from atat.pytorch import PrecisionDebugger + debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + 多卡使用时各卡单独计算溢出次数。 + +3. NPU环境下执行训练dump溢出数据。 + + 针对输入正常但输出存在溢出的API,会在训练执行目录下将溢出的API信息dump并保存为`dump.json`通过《[溢出解析工具](./run_overflow_check.md)》对json文件进行解析,输出溢出API为正常溢出还是非正常溢出,从而帮助用户快速判断。 + + 溢出解析工具执行命令如下: + + ```bash + atat -f pytorch run_overflow_check -api_info ./dump.json + ``` + + 反向过程溢出的API暂不支持精度预检功能。 + + +当重复执行溢出检测dump操作时,需要删除上一次dump目录下的溢出检测dump数据,否则将因重名而报错。 + +**注意事项** + +* (暂不支持)level为L2场景下,会增加npu的内存消耗,请谨慎开启。 +* (暂不支持)l部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致level为L2功能异常。 +* 混合精度动态loss scale场景下,正常训练会有"Gradient overflow. SKipping step"日志,添加溢出检测后日志消失,可以通过设置环境变量export OVERFLOW_DEBUG_MODE_ENABLE=1,并将register_hook位置调整amp.initialize之前解决。此功能需要cann包配套支持,不支持版本执行报错EZ3003。 + +## 场景化示例 + +### 多卡场景精度比对 + +精度工具支持多卡场景的精度比对,多卡场景的dump步骤与单卡场景完全一致,请参见“**单卡场景精度比对**”章节,不同的是多卡数据精度比对时需要使用“compare_distributed”函数进行比对。 + +如下示例: + +说明:多机多卡场景需要每个节点单独执行比对操作。 + +假设NPU dump 数据目录为npu_dump,GPU dump数据目录为gpu_dump。 + +1. 创建比对脚本,例如compare_distributed.py,拷贝如下代码。 + + ```python + from atat.pytorch import * + compare_distributed('./npu_dump/step0', './gpu_dump/step0', './output') + ``` + + dump数据目录须指定到step级。 + +2. 
执行比对: + + ```bash + python3 compare_distributed.py + ``` + +两次运行须用相同数量的卡,传入`compare_distributed`的两个文件夹下须有相同个数的rank文件夹,且不包含其他无关文件,否则将无法比对。 + +**多卡set_dump_path注意事项** + +多卡一般为多进程,须保证每个进程都正确调用PrecisionDebugger,或把PrecisionDebugger插入到import语句后,如: + +```python +from atat.pytorch import PrecisionDebugger +debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") +``` + +如此可保证set_dump_path在每个进程都被调用。 + +### NPU vs NPU精度比对 + +对于NPU vs NPU场景,是针对同一模型,进行迭代(模型、API版本升级或设备硬件升级)时存在的精度下降问题,对比相同模型在迭代前后版本的API计算数值,进行问题定位。 + +一般情况下迭代涉及NPU自定义算子,因此,可以仅dump NPU自定义算子进行比对。比对精度问题分析请参见“**单卡场景精度比对**”章节。 + +工具当前支持dump NPU自定义算子如下: + +| 序号 | NPU自定义算子 | +| :--- | ----------------------------------------------- | +| 1 | torch_npu.one_ | +| 2 | torch_npu.npu_sort_v2 | +| 3 | torch_npu.npu_transpose | +| 4 | torch_npu.npu_broadcast | +| 5 | torch_npu.npu_dtype_cast | +| 6 | torch_npu.empty_with_format | +| 7 | torch_npu.npu_one_hot | +| 8 | torch_npu.npu_stride_add | +| 9 | torch_npu.npu_ps_roi_pooling | +| 10 | torch_npu.npu_roi_align | +| 11 | torch_npu.npu_nms_v4 | +| 12 | torch_npu.npu_iou | +| 13 | torch_npu.npu_nms_with_mask | +| 14 | torch_npu.npu_pad | +| 15 | torch_npu.npu_bounding_box_encode | +| 16 | torch_npu.npu_bounding_box_decode | +| 17 | torch_npu.npu_batch_nms | +| 18 | torch_npu.npu_slice | +| 19 | torch_npu._npu_dropout | +| 20 | torch_npu.npu_indexing | +| 21 | torch_npu.npu_ifmr | +| 22 | torch_npu.npu_max | +| 23 | torch_npu.npu_scatter | +| 24 | torch_npu.npu_layer_norm_eval | +| 25 | torch_npu.npu_alloc_float_status | +| 26 | torch_npu.npu_confusion_transpose | +| 27 | torch_npu.npu_bmmV2 | +| 28 | torch_npu.fast_gelu | +| 29 | torch_npu.npu_sub_sample | +| 30 | torch_npu.npu_deformable_conv2d | +| 31 | torch_npu.npu_mish | +| 32 | torch_npu.npu_anchor_response_flags | +| 33 | torch_npu.npu_yolo_boxes_encode | +| 34 | torch_npu.npu_grid_assign_positive | +| 35 | torch_npu.npu_normalize_batch | +| 36 | torch_npu.npu_masked_fill_range | +| 37 | torch_npu.npu_linear | +| 38 | torch_npu.npu_bert_apply_adam | +| 39 | torch_npu.npu_giou | +| 40 | torch_npu.npu_ciou | +| 41 | torch_npu.npu_diou | +| 42 | torch_npu.npu_sign_bits_pack | +| 43 | torch_npu.npu_sign_bits_unpack | +| 44 | torch_npu.npu_flash_attention | +| 45 | torch_npu.npu_scaled_masked_softmax | +| 46 | torch_npu.npu_rotary_mul | +| 47 | torch_npu.npu_roi_align | +| 48 | torch_npu.npu_roi_alignbk | +| 49 | torch_npu.npu_ptiou | +| 50 | torch_npu.npu_fusion_attention | +| 51 | torch_npu.npu_dropout_with_add_softmax | +| 52 | torch_npu.npu_random_choice_with_mask | +| 53 | torch_npu.npu_rotated_iou | +| 54 | torch_npu.npu_conv2d | +| 55 | torch_npu.npu_conv3d | +| 56 | torch_npu.npu_softmax_cross_entropy_with_logits | +| 57 | torch_npu.npu_all_gather_base_mm | +| 58 | torch_npu.npu_swiglu | +| 59 | torch_npu.npu_rms_norm | +| 60 | torch_npu.npu_mm_reduce_scatter_base | +| 61 | torch_npu.npu_mm_all_reduce_base | +| 62 | torch_npu.npu_conv_transpose2d | +| 63 | torch_npu.npu_convolution | +| 64 | torch_npu.npu_convolution_transpose | +| 65 | torch_npu.npu_min | +| 66 | torch_npu.npu_nms_rotated | +| 67 | torch_npu.npu_reshape | +| 68 | torch_npu.npu_rotated_box_decode | +| 69 | torch_npu.npu_rotated_box_encode | +| 70 | torch_npu.npu_rotated_overlaps | +| 71 | torch_npu.npu_silu | +| 72 | torch_npu.npu_fused_attention_score | +| 73 | torch_npu.npu_multi_head_attention | +| 74 | torch_npu.npu_gru | +| 75 | torch_npu.npu_incre_flash_attention | +| 76 | torch_npu.npu_prompt_flash_attention | +| 77 | torch_npu.npu_lstm | +| 78 | 
torch_npu.npu_apply_adam | + +### 通信API的数据dump + +通信类API数据可以使用全量dump方式获取,若只dump通信类API数据,可以使用如下示例: + +1. 修改dump配置文件config.json。 + + ```json + { + "task": "tensor", + "dump_path": "./npu_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "tensor": { + "scope": [], + "list": ["distributed"], + "data_mode": ["all"], + "summary_mode": "statistics" + } + } + ``` + +2. 在训练脚本内添加atat工具,dump整网数据。 + + ```python + from atat.pytorch import PrecisionDebugger + debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + +通信类API支持列表: + +| 序号 | Distributed | +| :--- | -------------------- | +| 1 | send | +| 2 | recv | +| 3 | broadcast | +| 4 | all_reduce | +| 5 | reduce | +| 6 | all_gather | +| 7 | gather | +| 8 | isend | +| 9 | irecv | +| 10 | scatter | +| 11 | reduce_scatter | +| 12 | _reduce_scatter_base | +| 13 | _all_gather_base | + diff --git a/debug/accuracy_tools/atat/pytorch/doc/run_overflow_check.md b/debug/accuracy_tools/atat/pytorch/doc/run_overflow_check.md new file mode 100644 index 0000000000000000000000000000000000000000..1bdc4f354cfaf0bfbdf701baa7dfb05f3771e30b --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/run_overflow_check.md @@ -0,0 +1,25 @@ +# **溢出解析工具** + +针对训练过程中的溢出检测场景(当《[精度数据采集](./dump.md)》开启溢出检测dump时),对于输入正常但输出存在溢出的API,会在训练执行目录下将溢出的API信息按照前向和反向分类,dump并保存为`dump.json`,前向过程溢出的API可通过该工具对`dump.json`进行解析,输出溢出API为正常溢出还是非正常溢出,从而帮助用户快速判断。 + +工具支持PyTorch版本:1.11.0/2.0/2.1/2.2。 + +操作步骤如下: + +1. 安装工具。 + + 详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +2. 执行溢出API解析操作。 + + ```bash + atat -f pytorch run_overflow_check -api_info ./dump.json + ``` + +| 参数名称 | 说明 | 是否必选 | +| -------------------------- | -------------------------------------------------- | -------- | +| -api_info或--api_info_file | 指定API信息文件dump.json。 | 是 | +| -j或--jit_compile | 开启jit编译。 | 否 | +| -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0。 | 否 | + +反向过程溢出的API暂不支持该功能。 diff --git a/debug/accuracy_tools/atat/pytorch/dump/dump.py b/debug/accuracy_tools/atat/pytorch/dump/dump.py new file mode 100644 index 0000000000000000000000000000000000000000..64652bdaec5bc1d5de5d740c2c23de474a27d5fa --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/dump/dump.py @@ -0,0 +1,455 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
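+#
+# 模块说明:本文件实现atat的API级精度数据dump,负责采集张量统计量并按需将真实数据
+# 落盘为npy文件,主要入口为下方的dump_tensor等函数。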
+""" + +import inspect +import json +import os +import threading +from pathlib import Path + +import numpy as np +import torch + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +from atat.core.utils import (print_warn_log, Const, print_info_log, modify_dump_path, check_inplace_op, CompareConst, + print_error_log) +from atat.core.file_check_util import FileOpen, change_mode, FileCheckConst +from atat.pytorch.common.utils import get_md5_for_tensor +from ..dump.utils import check_writable +from .utils import (DumpUtil, check_if_in_api_list, make_dump_data_dir, get_tensor_rank, create_dirs_if_not_exist, + CompareException, check_single_rank_folder) + + +forward_init_status = False +backward_init_status = False + +thread_lock = threading.Lock() +pkl_name = "" +rank = os.getpid() + 100000 +multi_output_apis = ["_sort_", "npu_flash_attention"] +module_count = {} + + +class APIList(list): + threshold = 1000 + + def __init__(self, *args): + self.dump_count = 0 + self.pkl_mode_changed = False + super().__init__(*args) + + def flush(self): + pkl_path = get_pkl_file_path() + if len(self) == 0 or pkl_path == "": + return + with FileOpen(pkl_path, 'a') as f: + try: + f.write('\n'.join(json.dumps(item) for item in self)) + f.write('\n') + except IOError as ex: + raise Exception("write to disk failed") from ex + self.dump_count += 1 + print_info_log(f"write {len(self)} items to {pkl_path} the {self.dump_count} time") + if not self.pkl_mode_changed: + change_mode(pkl_path, FileCheckConst.DATA_FILE_AUTHORITY) + self.pkl_mode_changed = True + self.clear() + + def append(self, data): + list.append(self, data) + if len(self) >= APIList.threshold: + self.flush() + + +api_list = APIList() + + +class DataInfo(object): + def __init__(self, save_data, summary_data, dtype, shape, md5=None): + if md5 is None: + md5 = [] + self.save_data = save_data + self.summary_data = summary_data + self.dtype = dtype + self.shape = shape + self.md5 = md5 + + +def get_not_float_tensor_info(data): + if DumpUtil.summary_mode == "md5": + return DataInfo([], [], str(data.dtype), tuple(data.shape), get_md5_for_tensor(data)) + if data.numel() == 0 or data.dtype == torch.bool: + tensor_max = [] + tensor_min = [] + tensor_mean = [] + elif len(data.shape) == 0: + item = data.float().item() + tensor_max = item + tensor_min = item + tensor_mean = item + else: + tensor_max = torch._C._VariableFunctionsClass.max(data).float().item() + tensor_min = torch._C._VariableFunctionsClass.min(data).float().item() + tensor_mean = torch._C._VariableFunctionsClass.mean(data.float()).float().item() + return get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean, CompareConst.NAN) + + +def get_scalar_data_info(data): + summary_data = [data, data, data, data] + return DataInfo(data, summary_data, str(type(data)), str([])) + + +def get_float_tensor_info(data): + if DumpUtil.summary_mode == "md5": + return DataInfo([], [], str(data.dtype), tuple(data.shape), get_md5_for_tensor(data)) + tensor_max = torch._C._VariableFunctionsClass.max(data).float().item() + tensor_min = torch._C._VariableFunctionsClass.min(data).float().item() + tensor_mean = torch._C._VariableFunctionsClass.mean(data).float().item() + tensor_norm = torch._C._VariableFunctionsClass.norm(data).float().item() + return get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean, tensor_norm) + + +def get_tensor_data_info(data, *tensor_args): + summary_data = [] + summary_data.extend([*tensor_args]) + if DumpUtil.summary_mode == "all": + 
saved_tensor = data.contiguous().cpu().detach() + if data.dtype == torch.bfloat16: + saved_numpy = saved_tensor.to(torch.float32).numpy() + else: + saved_numpy = saved_tensor.numpy() + return DataInfo(saved_numpy, summary_data, str(data.dtype), tuple(data.shape)) + return DataInfo([], summary_data, str(data.dtype), tuple(data.shape)) + + +def dump_tensor(x, prefix, dump_step): + if isinstance(x, (tuple, list)) and x: + for i, item in enumerate(x): + dump_tensor(item, "{}.{}".format(prefix, i), dump_step) + return + elif isinstance(x, torch.Tensor): + if x.is_meta: + print_info_log(f"Meta tensor {prefix} is skipped.") + return + x_clone = x.clone().detach() + if x_clone.numel() == 0 or len(x_clone.shape) == 0 or not x_clone.is_floating_point(): + if DumpUtil.dump_filter_switch == Const.OFF: + data_info = get_not_float_tensor_info(x_clone) + dump_data_by_rank_count(dump_step, prefix, data_info) + else: + return + else: + data_info = get_float_tensor_info(x_clone) + dump_data_by_rank_count(dump_step, prefix, data_info) + + elif DumpUtil.dump_filter_switch == Const.OFF: + if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): + data_info = get_scalar_data_info(x) + dump_data_by_rank_count(dump_step, prefix, data_info) + + +def append_pkl_data(dump_step, prefix, data_info): + global api_list + thread_lock.acquire() + api_list.append([prefix, dump_step, data_info.md5, data_info.dtype, data_info.shape, data_info.summary_data]) + thread_lock.release() + + +def dump_data(prefix, data_info): + if DumpUtil.summary_mode != "all": + return + output_path = os.path.join(DumpUtil.dump_data_dir, f'{prefix}.npy') + try: + np.save(output_path, data_info.save_data) + change_mode(output_path, FileCheckConst.DATA_FILE_AUTHORITY) + except Exception as e: + print_warn_log("Dump data failed, error: {}".format(e)) + + +def thread_dump_data(prefix, data_info): + DumpUtil.dump_thread_pool.submit(dump_data, prefix, data_info) + + +def dump_data_by_rank_count(dump_step, prefix, data_info): + print_info_log(f"ptdbg is analyzing rank{rank} api: {prefix}" + " " * 10, end='\r') + if DumpUtil.is_single_rank and DumpUtil.dump_thread_pool: + thread_dump_data(prefix, data_info) + else: + dump_data(prefix, data_info) + append_pkl_data(dump_step, prefix, data_info) + + +def dump_stack_info(name_template): + if check_inplace_op(name_template) and Const.PRE_FORWARD in name_template: + return + + stack_str = [] + try: + for (_, path, line, func, code, _) in inspect.stack()[4:]: + if code: + stack_line = [path, str(line), func, code[0].strip() if code else code] + else: + stack_line = [path, str(line), func, code] + stack_str.append(stack_line) + except Exception as e: + print_warn_log("Dump stack info failed, error: {}".format(e)) + stack_str.append('') + + prefix = name_template.format("stack_info") + if DumpUtil.dump_switch_mode in Const.DUMP_MODE: + complement_set = set(['forward', 'backward', 'input', 'output']) - set(DumpUtil.dump_mode) + if not any(mode in prefix for mode in complement_set): + api_list.append([prefix, stack_str]) + else: + api_list.append([prefix, stack_str]) + + +def dump_api_tensor(dump_step, in_feat, name_template, out_feat): + if check_inplace_op(name_template): + if Const.PRE_FORWARD in name_template: + name_template = name_template.replace(Const.PRE_FORWARD, Const.FORWARD) + else: + if Const.BACKWARD in name_template and Const.BACKWARD in DumpUtil.dump_mode: + return + elif Const.BACKWARD not in name_template and Const.FORWARD in DumpUtil.dump_mode: + if "output" in DumpUtil.dump_mode: + 
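                    # For an in-place op the input buffer is overwritten by the op,
+                    # so in_feat already holds the result and is dumped as "output".
+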
dump_tensor(in_feat, name_template.format("output"), dump_step) + if "input" in DumpUtil.dump_mode: + return + + if Const.BACKWARD in name_template and Const.BACKWARD in DumpUtil.dump_mode: + if 'input' in DumpUtil.dump_mode: + dump_tensor(out_feat, name_template.format("input"), dump_step) + if 'output' in DumpUtil.dump_mode: + dump_tensor(in_feat, name_template.format("output"), dump_step) + elif Const.BACKWARD not in name_template and Const.FORWARD in DumpUtil.dump_mode: + if 'input' in DumpUtil.dump_mode: + dump_tensor(in_feat, name_template.format("input"), dump_step) + if 'output' in DumpUtil.dump_mode: + dump_tensor(out_feat, name_template.format("output"), dump_step) + + +def rename_(): + global rank + global pkl_name + if rank is not None and pkl_name is not None: + dir_name = os.path.join(DumpUtil.dump_root, "step{}".format(DumpUtil.iter_num), "rank{}".format(os.getpid() + 100000)) + new_name = os.path.join(DumpUtil.dump_root, "step{}".format(DumpUtil.iter_num), "rank{}".format(rank)) + if not os.path.exists(new_name) and os.path.exists(dir_name): + _, file_name = os.path.split(pkl_name) + os.rename(dir_name, new_name) + pkl_name = os.path.join(new_name, file_name) + + +def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): + if not DumpUtil.get_dump_switch(): + return + if DumpUtil.dump_switch_mode == Const.API_LIST and not check_if_in_api_list(name): + return + if DumpUtil.dump_switch_mode in [Const.LIST, Const.ACL, Const.RANGE, Const.STACK] and not DumpUtil.check_switch_scope(name): + return + dump_file = DumpUtil.get_dump_path() + dump_file = modify_dump_path(dump_file, DumpUtil.dump_switch_mode) + global rank + dump_dir, dump_filename = os.path.split(dump_file) + dump_dir = os.path.join(dump_dir, "step{}".format(DumpUtil.iter_num)) + if not os.path.exists(dump_dir): + Path(dump_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + dump_file = os.path.join(dump_dir, dump_filename) + rank_this = get_tensor_rank(in_feat, out_feat) + DumpUtil.dump_root = os.path.dirname(DumpUtil.dump_path) + if rank_this is not None and rank != rank_this: + rank = rank_this + rename_() + if not DumpUtil.dump_init_enable: + if '.pkl' in dump_filename: + npy_dir = dump_filename[:-4] + else: + npy_dir = dump_filename + DumpUtil.dump_data_dir = os.path.join(DumpUtil.dump_root, "step{}".format(DumpUtil.iter_num), "rank{}".format(rank), npy_dir) + if DumpUtil.target_rank is not None: + if rank != DumpUtil.target_rank: + return + dump_file = create_dirs_if_not_exist(rank, dump_file) + global pkl_name + pkl_name = dump_file + if DumpUtil.dump_init_enable: + DumpUtil.dump_init_enable = False + DumpUtil.dump_data_dir = make_dump_data_dir(dump_file) \ + if DumpUtil.dump_switch_mode not in [Const.STACK, Const.ACL] and DumpUtil.summary_mode == "all" else "" + if os.path.exists(dump_file) and not os.path.isdir(dump_file): + check_writable(dump_file) + try: + os.remove(dump_file) + except FileNotFoundError as e: + print_warn_log("The file does not exist, error: {}".format(e)) + + name_prefix = name + name_template = f"{name_prefix}" + "_{}" + if DumpUtil.is_single_rank is None: + DumpUtil.is_single_rank = check_single_rank_folder(dump_dir) + if DumpUtil.dump_switch_mode in [Const.ALL, Const.API_LIST]: + dump_api_tensor(dump_step, in_feat, name_template, out_feat) + elif DumpUtil.dump_switch_mode == Const.API_STACK: + dump_api_tensor(dump_step, in_feat, name_template, out_feat) + dump_stack_info(name_template) + else: + if DumpUtil.dump_switch_mode == Const.ACL: + acl_dump(module, name, 
name_prefix) + elif DumpUtil.dump_switch_mode != Const.STACK: + dump_api_tensor(dump_step, in_feat, name_template, out_feat) + dump_stack_info(name_template) + + +def acl_dump(module, module_name, name_prefix): + if name_prefix in DumpUtil.backward_input: + dump_mode_backward_acl_dump(module, module_name, DumpUtil.backward_input.get(name_prefix)) + else: + forward_acl_dump(module, module_name) + + +def Op_Need_Trigger(module_name): + if 'Tensor.__getitem__.' in module_name: + return True + return False + + +def forward_acl_dump(module, module_name): + global forward_init_status + global backward_init_status + if not forward_init_status and not backward_init_status: + forward_init_status = True + torch_npu.npu.synchronize() + torch_npu.npu.init_dump() + torch_npu.npu.set_dump(DumpUtil.dump_config) + torch_npu.npu.synchronize() + if Op_Need_Trigger(module_name): + module.forward(*module.input_args, **module.input_kwargs).cpu() + else: + module.forward(*module.input_args, **module.input_kwargs) + torch_npu.npu.synchronize() + torch_npu.npu.finalize_dump() + torch_npu.npu.synchronize() + del module.input_args + del module.input_kwargs + forward_init_status = False + print_info_log("Dump %s op file." % module_name) + + +def acl_backward_dump_status(output, grad, module_name): + if isinstance(output, torch.Tensor): + output.backward(grad, retain_graph=True) + return True + + for api_name in multi_output_apis: + if api_name in module_name: + output[0].backward(grad, retain_graph=True) + return True + return False + + +def dump_mode_backward_acl_dump(module, module_name, grad_path): + global forward_init_status + global backward_init_status + module_name = module_name.replace(Const.FORWARD, Const.BACKWARD) + if not forward_init_status and not backward_init_status: + forward_init_status = True + module.input_args = list(module.input_args) + for i, data in enumerate(module.input_args): + if isinstance(data, torch.Tensor) and data.grad_fn: + module.input_args[i] = data.detach().requires_grad_() + output = module.forward(*module.input_args, **module.input_kwargs) + grad = torch.tensor(np.load(grad_path)).to("npu").requires_grad_() + torch_npu.npu.init_dump() + torch_npu.npu.set_dump(DumpUtil.dump_config) + torch_npu.npu.synchronize() + if not acl_backward_dump_status(output, grad, module_name): + print_warn_log("The output of {} is not of tensor type and cannot be automatically derived. " + "you can manually construct a single API backward case for ACL dump.".format(module_name)) + torch_npu.npu.synchronize() + torch_npu.npu.finalize_dump() + del module.input_args + del module.input_kwargs + forward_init_status = False + print_info_log("Dump %s op file." 
% module_name)
+
+
+def module_count_func(name, name_template):
+    module_name = name.split("_")[-3]
+    if Const.FORWARD in name_template:
+        if module_name not in module_count:
+            module_count[module_name] = [0, [0]]
+        else:
+            if module_count[module_name][-1] and \
+                    module_count[module_name][0] != module_count[module_name][-1][-1]:
+                module_count[module_name][-1].pop()
+            module_count[module_name][0] += 1
+            module_count[module_name][-1].append(module_count[module_name][0])
+        index = module_count[module_name][0]
+    else:
+        backward_stack = module_count[module_name][-1] if module_name in module_count else []
+        if not backward_stack:
+            print_warn_log("The backward stack of {} is empty.".format(module_name))
+            index = "abnormal"
+        else:
+            index = backward_stack.pop()
+    return index
+
+
+def acc_cmp_dump(name, **kwargs):
+    dump_step = kwargs.get('dump_step', 1)
+    pid = kwargs.get('pid')
+    name_template = name
+    if not pid:
+        raise RuntimeError("Failed to get the pid of the specified process.")
+
+    def acc_cmp_hook(module, in_feat, out_feat=None):
+        nonlocal name, name_template
+        if "_{}_" in name_template:
+            try:
+                index = module_count_func(name, name_template)
+            except IndexError as e:
+                print_error_log(f"Get module {name_template} index failed.")
+                raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e
+            name = name_template.format(index)
+        if pid == os.getpid():
+            dump_acc_cmp(name, in_feat, out_feat, dump_step, module)
+        if hasattr(module, "input_args"):
+            del module.input_args
+        if hasattr(module, "input_kwargs"):
+            del module.input_kwargs
+
+    return acc_cmp_hook
+
+
+def write_to_disk():
+    api_list.flush()
+
+
+def get_pkl_file_path():
+    return pkl_name
+
+
+def reset_module_count():
+    global module_count
+    module_count = {}
diff --git a/debug/accuracy_tools/atat/pytorch/dump/utils.py b/debug/accuracy_tools/atat/pytorch/dump/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e58f35606a4a4f9cf9e7ae732beeedb7777cdef
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/dump/utils.py
@@ -0,0 +1,357 @@
+import os
+import re
+import shutil
+from pathlib import Path
+import torch
+import torch.distributed as dist
+
+from atat.core.utils import print_error_log, CompareException, DumpException, Const, get_time, print_info_log, \
+    check_mode_valid, check_switch_valid, check_dump_mode_valid, check_summary_only_valid, generate_compare_script, \
+    check_file_valid, make_dump_path_if_not_exists, check_path_before_create, check_summary_mode_valid
+from atat.core.file_check_util import FileChecker, FileCheckConst, check_path_length, check_path_pattern_vaild
+from atat.pytorch.common.utils import check_is_npu
+
+from ..dump import dump
+
+dump_count = 0
+range_begin_flag, range_end_flag = False, False
+
+
+def check_list_or_acl_mode(name_prefix):
+    global dump_count
+    for item in DumpUtil.dump_switch_scope:
+        if name_prefix.startswith(item):
+            dump_count = dump_count + 1
+            return True
+    return False
+
+
+def check_range_mode(name_prefix):
+    global range_begin_flag
+    global range_end_flag
+    if name_prefix.startswith(DumpUtil.dump_switch_scope[0]):
+        range_begin_flag = True
+        return True
+    if name_prefix.startswith(DumpUtil.dump_switch_scope[1]):
+        range_end_flag = True
+        return True
+    if range_begin_flag and not range_end_flag:
+        return True
+    return False
+
+
+def check_stack_mode(name_prefix):
+    if len(DumpUtil.dump_switch_scope) == 0:
+        return True
+    elif len(DumpUtil.dump_switch_scope) == 1:
+        return name_prefix.startswith(DumpUtil.dump_switch_scope[0])
+    elif 
len(DumpUtil.dump_switch_scope) == 2: + return check_range_mode(name_prefix) + else: + print_error_log("dump scope is invalid, Please set the scope mode in" + " set_dump_switch with 'all', 'list', 'range', 'stack', 'acl', 'api_list'!") + return False + + +class DumpConfig: + def __init__(self, mode=None, scope=None, api_list=None, filter_switch=None, dump_mode=None, summary_only=False, summary_mode="all"): + self.mode = mode + self.scope = scope + self.api_list = api_list + self.filter_switch = filter_switch + self.dump_mode = dump_mode + self.summary_only = summary_only + self.summary_mode = summary_mode + + +class DumpUtil(object): + dump_root = None + dump_data_dir = None + dump_path = None + dump_switch = None + dump_switch_mode = Const.ALL # all, api_stack, list, stack... + dump_switch_scope = [] + dump_init_enable = False + dump_api_list = [] + dump_filter_switch = None + dump_mode = ['forward', 'backward', 'input', 'output'] + backward_input = {} + dump_dir_tag = 'ptdbg_dump' + dump_config = None + dataloader_iter = 0 + target_iter = None + iter_num = 0 + target_rank = None + summary_only = False + need_replicate = False + summary_mode = "all" + is_single_rank = None + dump_thread_pool = None + + + @staticmethod + def set_dump_path(save_path): + DumpUtil.dump_path = save_path + DumpUtil.dump_init_enable = True + + @staticmethod + def set_acl_config(acl_config): + if not acl_config: + raise ValueError("acl_config must be configured when mode is 'acl'") + acl_config_checker = FileChecker(acl_config, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.JSON_SUFFIX) + acl_config = acl_config_checker.common_check() + DumpUtil.dump_config = acl_config + + @staticmethod + def set_dump_switch(switch, dump_config): + DumpUtil.dump_switch = switch + if dump_config.mode is not None: + DumpUtil.dump_switch_mode = dump_config.mode + DumpUtil.dump_init_enable = True + if dump_config.scope is not None: + DumpUtil.dump_switch_scope = dump_config.scope + if dump_config.api_list is not None: + DumpUtil.dump_api_list = [api.lower() for api in dump_config.api_list] + if dump_config.filter_switch is not None: + DumpUtil.dump_filter_switch = dump_config.filter_switch + if dump_config.dump_mode is not None: + DumpUtil.dump_mode = dump_config.dump_mode if isinstance(dump_config.dump_mode, list) else [dump_config.dump_mode] + + if dump_config.mode == Const.ACL: + DumpUtil.dump_switch_scope = [api_name.replace("backward", "forward") for api_name in dump_config.scope] + + DumpUtil.summary_only = dump_config.summary_only + DumpUtil.summary_mode = dump_config.summary_mode + + check_mapper = { + Const.LIST: check_list_or_acl_mode, + Const.ACL: check_list_or_acl_mode, + Const.RANGE: check_range_mode, + Const.STACK: check_stack_mode + } + + @staticmethod + def check_switch_scope(name_prefix): + if DumpUtil.dump_switch_mode in DumpUtil.check_mapper: + check_func = DumpUtil.check_mapper[DumpUtil.dump_switch_mode] + return check_func(name_prefix) + return False + + @staticmethod + def get_dump_path(): + if DumpUtil.dump_path: + return DumpUtil.dump_path + + if DumpUtil.dump_switch_mode == Const.ALL: + raise RuntimeError("get_dump_path: the file path is empty," + " you must use set_dump_path to set a valid dump path!!!") + else: + dir_path = os.path.realpath("./") + dump_file_name = "scope_dump_{}_{}_{}.pkl".format( + DumpUtil.dump_switch_mode, DumpUtil.dump_switch_scope[0], get_time()) + DumpUtil.dump_path = os.path.join(dir_path, dump_file_name) + return DumpUtil.dump_path + + @staticmethod + def 
get_dump_switch(): + return DumpUtil.dump_switch == "ON" + + +def set_dump_path(fpath=None, dump_tag='ptdbg_dump'): + fpath = load_env_dump_path(fpath) + check_file_valid(fpath) + if not re.match(Const.FILE_PATTERN, dump_tag): + print_error_log('The file path {} contains special characters.'.format(dump_tag)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + real_path = os.path.realpath(fpath) + make_dump_path_if_not_exists(real_path) + fpath_checker = FileChecker(real_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + fpath_checker.common_check() + DumpUtil.set_dump_path(real_path) + DumpUtil.dump_dir_tag = dump_tag + + +def get_tensor_rank(in_feat, out_feat): + if dist.is_initialized(): + return dist.get_rank() + + def get_tensor_rank_single(x): + if isinstance(x, (list, tuple)): + if len(x) > 0: + return get_tensor_rank_single(x[0]) + return None + elif isinstance(x, torch.Tensor): + device = x.device + if device.type == 'cpu': + return None + else: + return device.index + return None + in_rank = get_tensor_rank_single(in_feat) + if in_rank is None: + out_rank = get_tensor_rank_single(out_feat) + if out_rank is None: + return None + return out_rank + return in_rank + + +def create_dirs_if_not_exist(rank, dump_file): + dump_path, file_name = os.path.split(dump_file) + rank_dir = os.path.join(dump_path, f"rank{rank}") + dump_file = os.path.join(rank_dir, file_name) + if not os.path.isdir(rank_dir): + check_path_pattern_vaild(dump_file) + check_path_length(dump_file, name_length=200) + Path(rank_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + return dump_file + + +def generate_dump_path_str(): + if DumpUtil.dump_switch_mode == 'acl': + if DumpUtil.dump_config == '': + print_error_log("Please provide dump config for register hook before turning on dump switch!") + raise DumpException(DumpException.NONE_ERROR) + dump_path = f"according to dump config {DumpUtil.dump_config}" + else: + dump_dir, dump_file = os.path.split(DumpUtil.dump_path) + if not dump_file.endswith(".pkl"): + dump_dir = DumpUtil.dump_path + dump_path = f"to {dump_dir}" + return dump_path + + +def set_dump_switch(switch, mode=Const.ALL, scope=None, api_list=None, filter_switch=Const.OFF, dump_mode=None, + summary_only=False): + if scope is None: + scope = [] + if api_list is None: + api_list = [] + if dump_mode is None: + dump_mode = [Const.ALL] + check_switch_valid(switch) + if not DumpUtil.dump_path: + set_dump_path() + dump_config = DumpConfig(summary_only=summary_only) + DumpUtil.set_dump_switch(switch, dump_config) + dump_path_str = generate_dump_path_str() + if switch == "OFF": + dump.write_to_disk() + if check_is_npu() and DumpUtil.dump_switch_mode in [Const.ALL, Const.API_STACK, Const.LIST, Const.RANGE, Const.API_LIST]: + generate_compare_script(DumpUtil.dump_data_dir, dump.get_pkl_file_path(), DumpUtil.dump_switch_mode) + set_dump_switch_print_info(switch, mode, dump_path_str) + set_dump_switch_config(mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, dump_mode=dump_mode, + summary_only=summary_only) + + +def set_dump_switch_config(mode=Const.ALL, scope=None, api_list=None, filter_switch=Const.OFF, dump_mode=None, + summary_only=False, summary_mode="all"): + if scope is None: + scope = [] + if api_list is None: + api_list = [] + if dump_mode is None: + dump_mode = [Const.ALL] + try: + check_summary_mode_valid(summary_mode) + check_mode_valid(mode, scope, api_list) + check_switch_valid(filter_switch) + dump_mode = check_dump_mode_valid(dump_mode) + 
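    # Each knob is validated before any dump state is mutated; a failure in any
+        # check is converted below into CompareException(INVALID_PARAM_ERROR).
+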
summary_only = check_summary_only_valid(summary_only) + except (CompareException, AssertionError) as err: + print_error_log(str(err)) + raise CompareException(CompareException.INVALID_PARAM_ERROR) from err + switch = DumpUtil.dump_switch + dump_config = DumpConfig(mode, scope, api_list, filter_switch, dump_mode, summary_only, summary_mode) + DumpUtil.set_dump_switch("OFF", dump_config) + DumpUtil.dump_switch = switch + + +def set_dump_switch_print_info(switch, mode, dump_path_str): + global dump_count + if switch == "ON": + print_info_log(f"Dump switch is turned on. Dump data will be saved {dump_path_str}. ") + if mode == Const.LIST: + dump_count = 0 + else: + print_info_log(f"Dump switch is turned off. ") + if mode == Const.LIST: + print_info_log("The number of matched dump is {}".format(dump_count)) + + +def check_if_in_api_list(name): + if not DumpUtil.dump_api_list: + return False + for api in DumpUtil.dump_api_list: + if api.lower() in name.lower(): + return True + return False + + +def set_backward_input(backward_input): + for index, api_name in enumerate(DumpUtil.dump_switch_scope): + DumpUtil.backward_input[api_name] = backward_input[index] + + +def make_dump_data_dir(dump_file_name): + dump_path, file_name = os.path.split(os.path.realpath(dump_file_name)) + name_body, name_extension = os.path.splitext(file_name) + output_dir = os.path.join(dump_path, f"{name_body}") + check_path_before_create(output_dir) + if not os.path.exists(output_dir): + Path(output_dir).mkdir(mode=0o750, exist_ok=True) + else: + shutil.rmtree(output_dir, ignore_errors=True) + Path(output_dir).mkdir(mode=0o750, exist_ok=True) + return output_dir + + +def make_dump_dirs(): + dump_file_name, dump_file_name_body = "dump.pkl", "dump" + dump_root_dir = load_env_dump_path(DumpUtil.dump_path) + tag_dir = os.path.join(dump_root_dir, DumpUtil.dump_dir_tag) + check_path_length(tag_dir) + check_path_pattern_vaild(tag_dir) + Path(tag_dir).mkdir(mode=0o750, parents=True, exist_ok=True) + DumpUtil.dump_dir = tag_dir + dump_file_path = os.path.join(tag_dir, dump_file_name) + DumpUtil.set_dump_path(dump_file_path) + + +def check_writable(dump_file): + if not os.access(dump_file, os.W_OK): + print_error_log( + 'The path {} does not have permission to write. Please check the path permission'.format( + dump_file)) + raise DumpException(DumpException.INVALID_PATH_ERROR) + + +def load_env_dump_path(dump_path): + if not dump_path: + dump_path = os.getenv(Const.ASCEND_WORK_PATH) + if dump_path: + try: + dump_path = os.path.join(str(dump_path), Const.DUMP_DIR) + except TypeError as err: + print_error_log("Generating dump path from environment variables ASCEND_WORK_PATH failed.") + raise DumpException(DumpException.INVALID_PATH_ERROR) from err + else: + print_error_log("Dump path is None, you can configure it in the following ways:\n" + "1. Configure set_dump_path function.\n" + "2. Configure the dump_path parameter of PrecisionDebugger.\n" + "3. 
Set environment variables ASCEND_WORK_PATH.") + raise DumpException(DumpException.INVALID_PATH_ERROR) + return dump_path + + +def check_single_rank_folder(dump_path): + rank_folder_pattern = re.compile(r'^rank\d+$') + rank_folder_count = 0 + for item in os.listdir(dump_path): + full_path = os.path.join(dump_path, item) + if os.path.isdir(full_path) and rank_folder_pattern.match(item): + rank_folder_count += 1 + if rank_folder_count > 1: + return False + return rank_folder_count == 1 diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/__init__.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3ffe161cba432405e2dc8d98f9be89053b58849d --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/__init__.py @@ -0,0 +1,8 @@ +from atat.pytorch.common import print_warn_log_rank_0, print_info_log_rank_0 +from atat.pytorch.common.exceptions import FreeBenchmarkException +from atat.pytorch.common.utils import Const + +from .main import FreeBenchmarkCheck +from .common.params import UnequalRow + +__all__ = [FreeBenchmarkCheck, UnequalRow] diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/__init__.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/constant.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..9b72437f2280ca44a20fc5e370f1cfd9b9ea3ac4 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/constant.py @@ -0,0 +1,66 @@ +from typing import Dict + +import numpy as np +import torch +from atat.pytorch.free_benchmark.common.enums import FuzzThreshold +from atat.pytorch.free_benchmark.common.params import BenchmarkThd + + +class CommonField: + DEVICE = "device" + META = "meta" + FUZZ_TENSOR = "fuzz_tensor" + REQUIRES_GRAD = "requires_grad" + HOLD_PLACE = "hold_place" + DISTRIBUTED_OP = "torch.distributed" + + +class ThresholdConfig: + PERTURBATION_VALUE_DICT: Dict = { + torch.bfloat16: FuzzThreshold.BF16_THD, + torch.float16: FuzzThreshold.F16_THD, + torch.float32: FuzzThreshold.F32_THD, + torch.float64: FuzzThreshold.F64_THD, + } + + ABS_TOL_VALUE_DICT: Dict = { + torch.bfloat16: FuzzThreshold.BF16_THD, + torch.float16: FuzzThreshold.F16_THD, + torch.float32: FuzzThreshold.F32_THD, + torch.float64: FuzzThreshold.F64_THD, + } + + # bit翻转需要匹配到等长或更长的整型 + PERTURBATION_BIT_DICT = { + torch.bfloat16: torch.int16, + torch.float16: torch.int16, + torch.float32: torch.int32, + torch.float64: torch.int64, + } + + # 输入噪声下界 + NOISE_INPUT_LOWER_BOUND = 1e-8 + COMP_CONSISTENT = 1.0 + COMP_NAN = np.nan + SYMBOL_FLIPPING = "symbol_flipping" + BACKWARD_OUTPUT_LOWER_BOUND = 1e-3 + SMALL_VALUE = 1.0 + # 预热初始阈值 + PREHEAT_INITIAL_THD = 2.05 + API_THD_STEP = 2.0 + + DTYPE_PER_THD = { + torch.float16: 1.002, + torch.float32: 1.0002, + } + BENCHMARK_THD_DICT = { + torch.float32: BenchmarkThd(2**-14, 1.0, 2**-14, 1e-4), + torch.float16: BenchmarkThd(2**-11, 1.0, 2**-11, 1e-4), + torch.bfloat16: BenchmarkThd(2**-8, 1.0, 2**-8, 1e-4), + } + + +class PreheatConfig: + IF_PREHEAT = "if_preheat" + PREHEAT_STEP = "preheat_step" + MAX_SAMPLE = "max_sample" diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/counter.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/counter.py 
new file mode 100644 index 0000000000000000000000000000000000000000..186b75c71aeaf71fc2adab7ec38c7f00f6b7fdb7 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/counter.py @@ -0,0 +1,72 @@ +from collections import defaultdict +from atat.pytorch.free_benchmark.common.constant import ThresholdConfig + + +class PreheatCounter: + def __init__(self) -> None: + self.api_called_time: dict = defaultdict(int) + self.api_sample_time: dict = defaultdict(int) + self.one_step_used_api: dict = defaultdict(int) + self.api_thd: dict = defaultdict(dict) + self.preheat_record: dict = defaultdict(dict) + self.dtype_map: dict = {} + self.if_preheat: dict = defaultdict(dict) + self.step = 0 + + def clear_step(self): + self.preheat_record.clear() + self.api_called_time.clear() + self.api_sample_time.clear() + + def check_step(self, current_step): + if current_step != self.step: + self.clear_step() + self.step = current_step + + def add_api_called_time(self, api_name: str): + self.api_called_time[api_name] += 1 + + def get_api_called_time(self, api_name: str) -> int: + return self.api_called_time[api_name] + + def add_api_sample_time(self, api_name: str): + self.api_sample_time[api_name] += 1 + + def get_api_sample_time(self, api_name: str) -> int: + return self.api_sample_time[api_name] + + def add_one_step_used_api(self, api_name: str): + self.one_step_used_api[api_name] += 1 + + def get_one_step_used_api(self, api_name: str): + return self.one_step_used_api[api_name] + + def update_preheat_record(self, api_name, dtype, cmp_result): + # 记录预热阶段CPU标杆比对的结果 + if str(dtype) not in self.preheat_record[api_name].keys(): + self.preheat_record[api_name][str(dtype)] = list() + self.preheat_record[api_name][str(dtype)].append(cmp_result) + self.dtype_map[str(dtype)] = dtype + + def update_api_thd(self, api_name, dtype, threshold, dthreshold): + self.api_thd[api_name][str(dtype)] = ( + threshold if threshold > dthreshold else dthreshold + ) + + def get_api_thd(self, api_name, dtype): + if not str(dtype) in self.api_thd[api_name]: + self.api_thd[api_name][str(dtype)] = ThresholdConfig.PREHEAT_INITIAL_THD + self.dtype_map[str(dtype)] = dtype + return self.api_thd[api_name][str(dtype)] + + def set_api_preheat(self, api_name, dtype_str, is_preheat=True): + # 标记cpu不一致的dtype 不再进行预热 + self.if_preheat[api_name][dtype_str] = is_preheat + + def get_api_preheat(self, api_name, dtype): + # 标记cpu不一致的dtype 不再进行预热 + if str(dtype) not in self.if_preheat[api_name]: + return True + return self.if_preheat[api_name][str(dtype)] + +preheat_counter = PreheatCounter() \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/enums.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/enums.py new file mode 100644 index 0000000000000000000000000000000000000000..bfb1bbaa40dc2a535a02aa914f823906b0a374ab --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/enums.py @@ -0,0 +1,37 @@ +class PerturbationMode: + ADD_NOISE = "add_noise" + CHANGE_VALUE = "change_value" + IMPROVE_PRECISION = "improve_precision" + NO_CHANGE = "no_change" + BIT_NOISE = "bit_noise" + TO_CPU = "to_cpu" + + +class DeviceType: + NPU = "npu" + CPU = "cpu" + + +class FuzzThreshold: + BF16_THD = 1e-4 + F16_THD = 1e-6 + F32_THD = 1e-8 + F64_THD = 1e-16 + + +class NormType: + ONE_NORM = (1, "one_norm") + TWO_NORM = (2, "two_norm") + ENDLESS_NORM = (3, "endless_norm") + + +class HandlerType: + CHECK = "check" + PREHEAT = "preheat" + FIX = "fix" + + +class FuzzLevel: + BASE_LEVEL = "L1" + 
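# Only BASE_LEVEL is exercised today; make_unequal_row treats ADV_LEVEL and
+    # REAL_LEVEL as placeholders (see params.py).
+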
ADV_LEVEL = "L2" + REAL_LEVEL = "L3" diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/params.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/params.py new file mode 100644 index 0000000000000000000000000000000000000000..c5dfefb43f856383af93068840e9e48e1590c431 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/params.py @@ -0,0 +1,130 @@ +from abc import ABC +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple + +import torch +from atat.pytorch.free_benchmark import Const, print_warn_log_rank_0 +from atat.pytorch.free_benchmark.common.enums import ( + DeviceType, + FuzzLevel, + PerturbationMode, +) +from atat.pytorch.free_benchmark.common.utils import Tools + + +@dataclass +class DataParams: + args: Optional[Tuple] = None + kwargs: Optional[Dict] = None + valid_input_index: Optional[int] = None + original_result: Optional[Any] = None + perturbed_result: Optional[Any] = None + is_consistent: Optional[bool] = True + perturbed_value: Optional[Any] = None + origin_func: Optional[Callable] = None + api_type: Optional[str] = None + fuzz_stage: Optional[str] = None + grad_unequal_flag: Optional[bool] = True + + +@dataclass +class HandlerParams: + handler_type: Optional[str] = None + api_name: Optional[str] = None + pert_mode: Optional[PerturbationMode] = None + step: Optional[int] = None + fuzz_stage: Optional[str] = None + fuzz_device: Optional[DeviceType] = None + preheat_config: Optional[Dict] = None + fuzz_level: Optional[str] = None + + +@dataclass +class UnequalRow: + rank: Optional[int] = None + pert_mode: Optional[PerturbationMode] = None + stage: Optional[str] = None + step: Optional[int] = None + api_name: Optional[str] = None + max_rel: Optional[float] = None + dtype: Optional[str] = None + shape: Optional[str] = None + output_index: Optional[int] = None + + +@dataclass +class BenchmarkThd: + rtol: Optional[float] = None # 相对误差阈值 + small_value: Optional[float] = None # 小值域 + small_value_atol: Optional[float] = None # 小值域绝对阈值 + err_balance: Optional[float] = None # 误差均衡性 + + +def check_args_type(args: Tuple) -> int: + for i, arg in enumerate(args): + if torch.is_tensor(arg): + if arg.is_meta: + continue + if not torch.is_floating_point(arg): + continue + return i + if isinstance(arg, (List, Tuple, Dict)): + return i + return -1 + + +def data_pre_deal(name, func, args, kwargs): + data_params = DataParams(args=args, kwargs=kwargs, origin_func=func) + index = check_args_type(args) + data_params.valid_input_index = index + if index == -1: + print_warn_log_rank_0( + f"[atat] Free benchmark: 无标杆工具不支持当前算子的输入类型 {name}." 
+ ) + return data_params + + +def make_handler_params(name, config, step): + handler_params = HandlerParams() + handler_params.api_name = name + handler_params.step = step + handler_params.handler_type = config.handler_type + handler_params.fuzz_stage = config.fuzz_stage + handler_params.fuzz_device = config.fuzz_device + handler_params.preheat_config = config.preheat_config + handler_params.fuzz_level = config.fuzz_level + handler_params.pert_mode = config.pert_mode + return handler_params + + +def make_unequal_row( + data_params: DataParams, + handle_params: HandlerParams, + ratio: float = None, + index: int = None, +): + row = UnequalRow( + api_name=handle_params.api_name, + pert_mode=handle_params.pert_mode, + output_index=index, + stage=handle_params.fuzz_stage, + step=handle_params.step, + ) + if isinstance(ratio, float): + row.max_rel = ratio - 1 + origin_tensor = data_params.original_result + perturbed_tensor = data_params.perturbed_result + if index: + origin_tensor = origin_tensor[index] + perturbed_tensor = perturbed_tensor[index] + row.output_index = index + if isinstance(origin_tensor, torch.Tensor): + row.dtype = origin_tensor.dtype + row.shape = origin_tensor.shape + row.rank = Tools.get_dist_rank() + # 以下暂不支持 + if handle_params.fuzz_level == FuzzLevel.ADV_LEVEL: + pass + if handle_params.fuzz_level == FuzzLevel.REAL_LEVEL: + pass + return row diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/utils.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..24d25967635b3dcfd1da89e1f54d3282fa1181ed --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/utils.py @@ -0,0 +1,98 @@ +import torch +from atat.pytorch.free_benchmark.common.enums import DeviceType + + +class Tools: + + @staticmethod + def is_float_tensor(tensor) -> bool: + if isinstance(tensor, torch.Tensor) and torch.is_floating_point(tensor): + return True + if isinstance(tensor, (list, tuple)): + for value in tensor: + if isinstance(value, torch.Tensor) and torch.is_floating_point(value): + return True + return False + + @staticmethod + def get_dist_rank(): + try: + return torch.distributed.get_rank() + except RuntimeError: + return 0 + + @staticmethod + def get_first_tensor_dtype(tensor_seq): + if isinstance(tensor_seq, torch.Tensor): + return tensor_seq.dtype + if isinstance(tensor_seq, (list, tuple)): + for object_ in tensor_seq: + if isinstance(object_, torch.Tensor): + return object_.dtype + raise RuntimeError("The sequence does not contain tensors.") + + @staticmethod + def get_pure_api_name(api_name: str): + return api_name.rsplit(".", 2)[0] + + @staticmethod + def convert_device_and_dtype( + tensor_seq, device: str = DeviceType.CPU, change_dtype: bool = False + ): + if isinstance(tensor_seq, torch.Tensor): + if change_dtype and tensor_seq.dtype in [torch.float16, torch.bfloat16]: + return tensor_seq.detach().to(device).to(torch.float32) + return tensor_seq.detach().to(device) + if isinstance(tensor_seq, dict): + return { + key: Tools.convert_device_and_dtype(value, device, change_dtype) + for key, value in tensor_seq.items() + } + if isinstance(tensor_seq, (tuple, list)): + return type(tensor_seq)( + [ + Tools.convert_device_and_dtype(value, device, change_dtype) + for value in tensor_seq + ] + ) + return tensor_seq + + @staticmethod + def convert_fuzz_output_to_origin(origin, perturbed): + if isinstance(origin, torch.Tensor): + origin.data = perturbed.to(origin.dtype).to(origin.device) + 
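        # Assigning through .data swaps in the perturbed values while keeping the
+            # original tensor object (and its autograd graph) intact for the caller.
+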
return origin + if isinstance(origin, dict): + output = dict() + for key, value in origin.items(): + output[key] = Tools.convert_fuzz_output_to_origin(value, perturbed[key]) + return output + if isinstance(origin, (tuple, list)): + result = list() + for index_, value in enumerate(origin): + result.append( + Tools.convert_fuzz_output_to_origin(value, perturbed[index_]) + ) + return type(origin)(result) + return origin + +class TorchC: + sum = torch._C._VariableFunctionsClass.sum + isinf = torch._C._VariableFunctionsClass.isinf + isfinite = torch._C._VariableFunctionsClass.isfinite + isnan = torch._C._VariableFunctionsClass.isnan + logical_not = torch._C._VariableFunctionsClass.logical_not + subtract = torch._C._VariableFunctionsClass.subtract + abs = torch._C._VariableFunctionsClass.abs + where = torch._C._VariableFunctionsClass.where + div = torch._C._VariableFunctionsClass.div + max = torch._C._VariableFunctionsClass.max + min = torch._C._VariableFunctionsClass.min + gt = torch._C._VariableFunctionsClass.gt + ge = torch._C._VariableFunctionsClass.ge + lt = torch._C._VariableFunctionsClass.lt + mean = torch._C._VariableFunctionsClass.mean + full = torch._C._VariableFunctionsClass.full + add = torch._C._VariableFunctionsClass.add + bitwise_xor = torch._C._VariableFunctionsClass.bitwise_xor + clone = torch._C._VariableFunctionsClass.clone diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/compare/grad_saver.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/compare/grad_saver.py new file mode 100644 index 0000000000000000000000000000000000000000..a8752656ed72bc21773aca2bb06d4e69d96a5c4b --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/compare/grad_saver.py @@ -0,0 +1,172 @@ +import torch +from atat.pytorch.free_benchmark import print_info_log_rank_0, print_warn_log_rank_0 +from atat.pytorch.free_benchmark.common.params import DataParams, HandlerParams +from atat.pytorch.free_benchmark.common.constant import CommonField +from atat.pytorch.free_benchmark.common.utils import Tools +from atat.pytorch.free_benchmark.result_handlers.handler_factory import ( + FuzzHandlerFactory, +) +from atat.pytorch.free_benchmark.perturbed_layers.layer_factory import LayerFactory + + +class GradSaver: + + def __init__(self, origin_func, handler_params: HandlerParams): + + self.handler_params = handler_params + self.api_name = handler_params.api_name + self.origin_func = origin_func + self.data_params = DataParams() + self.is_compare = True + self.kwargs = dict() + self.perturbed_grad_input = tuple() + self.origin_grad_input = tuple() + self.need_grad_flag = list() + self.backward_input = tuple() + + def register_compare_func_for_inputs(self, inputs, data_processor): + _index = 0 + for j, obj in enumerate(inputs): + if torch.is_tensor(obj) and obj.requires_grad: + + def compare_func(grad, new_grad_index=_index, input_index=j): + if not self.is_compare: + return grad + try: + perturbed_grad = self.check_grad_input(grad, new_grad_index) + handler = FuzzHandlerFactory.create(self.handler_params) + self.compare_grad_results( + handler, grad, perturbed_grad, index=input_index + ) + data_processor.update_unequal_rows(handler.get_unequal_rows()) + except Exception as e: + print_warn_log_rank_0( + f"[atat] Free benchmark: grad compara error: {e}" + ) + return grad + return grad + + obj.register_hook(compare_func) + _index += 1 + + def compare_grad_results(self, handler, origin_grad, perturbed_grad, index): + # TODO get dtype? 
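+        # Two-stage check: first this input's grad is compared with its perturbed
+        # counterpart; if they diverge, comparison is disabled for this API
+        # (is_compare = False) and the handler is re-run on the full cached
+        # grad-input tuples so the divergence is reported with complete context.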
+        self.data_params.original_result = origin_grad
+        self.data_params.perturbed_result = perturbed_grad
+        self.data_params.grad_unequal_flag = False
+        self.data_params.valid_input_index = index
+        try:
+            handler.handle(self.data_params)
+            if not self.data_params.is_consistent:
+                self.is_compare = False
+                self.data_params.grad_unequal_flag = True
+                self.data_params.is_consistent = True
+                self.data_params.perturbed_result = self.perturbed_grad_input
+                self.data_params.original_result = self.origin_grad_input
+                handler.handle(self.data_params)
+        except Exception as e:
+            print_warn_log_rank_0(
+                f"[atat] Free benchmark: comparing two vjp results failed: api:{self.handler_params.api_name}."
+                f"{e}"
+            )
+
+    def check_grad_input(self, origin_grad, new_grad_index):
+        if self.perturbed_grad_input is None:
+            print_info_log_rank_0(
+                f"[atat] Free benchmark: grad does not exist: {self.api_name}."
+            )
+            return None
+        try:
+            with torch.no_grad():
+                perturbed_grad = self.perturbed_grad_input[new_grad_index].to(
+                    origin_grad.device
+                )
+        except IndexError:
+            print_warn_log_rank_0(
+                f"[atat] Free benchmark: grad index out of range. api:{self.handler_params.api_name}."
+                f"index:{new_grad_index}, perturbation grad len {len(self.perturbed_grad_input)}"
+            )
+            return None
+        if origin_grad.shape != perturbed_grad.shape:
+            print_warn_log_rank_0(
+                f"[atat] Free benchmark: grad shapes are inconsistent. api:{self.handler_params.api_name}."
+                f"origin:{origin_grad.shape}, perturbation: {perturbed_grad.shape}"
+            )
+            return None
+        return perturbed_grad
+
+    def cache_backward_input(self, backward_input_list):
+        _inputs = []
+        with torch.no_grad():
+            for backward_input in backward_input_list:
+                if torch.is_tensor(backward_input):
+                    _inputs.append(
+                        {
+                            CommonField.DEVICE: backward_input.device,
+                            CommonField.FUZZ_TENSOR: backward_input.cpu(),
+                            CommonField.REQUIRES_GRAD: backward_input.requires_grad,
+                        }
+                    )
+                else:
+                    _inputs.append(backward_input)
+        self.backward_input = _inputs
+
+    def get_vjp_input(self):
+        inner_args_tmp = []
+        need_grad_tensors = []
+        for object_ in self.backward_input:
+            if isinstance(object_, dict) and CommonField.FUZZ_TENSOR in object_.keys():
+                tensor_ = torch.tensor(
+                    object_.get(CommonField.FUZZ_TENSOR).data,
+                    dtype=object_.get(CommonField.FUZZ_TENSOR).dtype,
+                    device=object_.get(CommonField.DEVICE),
+                    requires_grad=object_.get(CommonField.REQUIRES_GRAD),
+                )
+
+                if tensor_.requires_grad:
+                    inner_args_tmp.append(CommonField.HOLD_PLACE)
+                    need_grad_tensors.append(tensor_)
+                    self.need_grad_flag.append(True)
+                else:
+                    self.need_grad_flag.append(False)
+                    inner_args_tmp.append(tensor_)
+            else:
+                self.need_grad_flag.append(False)
+                inner_args_tmp.append(object_)
+
+        return need_grad_tensors, tuple(inner_args_tmp)
+
+    def get_grad_input_from_vjp(self, need_grad_tensors, grad_output, inner_args):
+        def vjp_func(*inputs):
+            _real_input = []
+            index_ = 0
+            for object_ in inner_args:
+                if object_ is CommonField.HOLD_PLACE:
+                    _real_input.append(inputs[index_])
+                    index_ += 1
+                else:
+                    _real_input.append(object_)
+            kwargs = self.kwargs.copy()
+            if 'inplace' in kwargs:
+                kwargs['inplace'] = False
+            return self.origin_func(*_real_input, **kwargs)
+
+        _, grad_input = torch.autograd.functional.vjp(
+            vjp_func, tuple(need_grad_tensors), grad_output
+        )
+        return grad_input
+
+    def calculate_perturbed_grad_input(self, grad_output, need_grad_tensors, inner_args):
+        self.data_params.args = [need_grad_tensors, grad_output, inner_args]
+        self.data_params.kwargs = {}
+        self.data_params.valid_input_index = 0
+
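        # A perturbation layer treats the whole VJP computation as the function
+        # under test: it perturbs the cached inputs, re-runs the vjp through
+        # origin_func, and leaves the result in data_params.perturbed_result.
+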
self.data_params.origin_func = self.get_grad_input_from_vjp + layer = LayerFactory.create( + self.handler_params.api_name, + self.handler_params.fuzz_device, + self.handler_params.pert_mode, + ) + layer.handle(self.data_params) + self.perturbed_grad_input = tuple( + [x.cpu() for x in self.data_params.perturbed_result] + ) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/compare/single_benchmark.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/compare/single_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..ed834c468ba6f15437da4479a3e2b3257fd7b6c1 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/compare/single_benchmark.py @@ -0,0 +1,103 @@ +import torch +import math + +from atat.pytorch.free_benchmark import print_warn_log_rank_0 +from atat.pytorch.free_benchmark.common.utils import TorchC +from atat.pytorch.free_benchmark.common.constant import ThresholdConfig + + +class SingleCompare: + def __init__(self) -> None: + self.relative_err = None + self.absolute_err = None + self.eb = None + self.threshold = None + + def compare_seq(self, actual, golden): + if isinstance(golden, torch.Tensor): + return self.compare_tensor_seq(actual, golden) + elif isinstance(golden, dict): + return self.compare_dict_seq(actual, golden) + elif isinstance(golden, (tuple, list)): + return self.compare_list_seq(actual, golden) + elif isinstance(golden, float): + return self.compare_float_seq(actual, golden) + else: + return self.compare_other_seq(actual, golden) + + def compare_tensor_seq(self, actual, golden): + self.threshold = ThresholdConfig.BENCHMARK_THD_DICT.get( + actual.dtype, ThresholdConfig.BENCHMARK_THD_DICT.get(torch.float32) + ) + if self.filter_overflow(golden) > 0: + print_warn_log_rank_0("[atat] Free Benchmark: inf and nan" + "in golden tensor is not supported.") + return True + actual = self.replace_inf_or_nan(actual) + actual = actual.to(torch.float64) + golden = golden.to(torch.float64).to(actual.device) + self._cal_compare_metrics(actual, golden) + if self.absolute_err > self.threshold.small_value_atol: + return False + if self.relative_err > self.threshold.rtol: + return False + if self.eb > self.threshold.err_balance: + return False + return True + + + def _cal_compare_metrics(self, actual, golden): + diff_value = TorchC.subtract(actual, golden) + diff_abs = TorchC.abs(diff_value) + golden_abs = TorchC.abs(golden) + # 使用绝对误差的元素 + self.absolute_err = TorchC.max(TorchC.where( + TorchC.lt(TorchC.abs(actual), self.threshold.small_value), diff_abs, 0 + )) + diff_rel = TorchC.div(diff_abs, golden_abs) + # 使用相对误差的元素 + self.relative_err = TorchC.max(TorchC.where( + TorchC.ge(TorchC.abs(actual), self.threshold.small_value), diff_rel, 0 + )) + # 获取误差均衡性 + divided = TorchC.where( + TorchC.ge(TorchC.abs(golden), self.threshold.small_value), golden_abs, 1 + ) + self.eb = TorchC.mean(TorchC.div(diff_value, divided)) + + def compare_dict_seq(self, actual, golden): + if len(actual) != len(golden): + return False + for key, value in golden.items(): + if not self.compare_seq(value, actual.get(key)): + return False + return True + + def compare_list_seq(self, actual, golden): + if len(actual) != len(golden): + return False + for index_, value in enumerate(golden): + if not self.compare_seq(value, actual[index_]): + return False + return True + + def compare_float_seq(self, actual, golden): + return math.isclose(actual, golden) + + def compare_other_seq(self, actual, golden): + return actual == golden + + @staticmethod + def 
filter_overflow(tensor) -> int: + inf_num = TorchC.sum(TorchC.isinf(tensor)) + nan_num = TorchC.sum(TorchC.isnan(tensor)) + return inf_num + nan_num + + @staticmethod + def replace_inf_or_nan(tensor): + finite_mask = TorchC.isfinite(tensor) + inf_or_nan_mask = TorchC.logical_not(finite_mask) + inf_or_nan_num = TorchC.sum(inf_or_nan_mask).item() + if inf_or_nan_num > 0: + tensor[inf_or_nan_mask] = 1 + return tensor diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/main.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/main.py new file mode 100644 index 0000000000000000000000000000000000000000..c2e0005181d967ed8437e3047f9d967b1370d4e3 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/main.py @@ -0,0 +1,102 @@ +import importlib +from abc import ABC + +import torch +from atat.pytorch.free_benchmark import Const, print_warn_log_rank_0 + +from atat.pytorch.free_benchmark.common.params import data_pre_deal, make_handler_params +from atat.pytorch.free_benchmark.common.enums import ( + PerturbationMode, + FuzzLevel, + DeviceType, + HandlerType +) +from atat.pytorch.free_benchmark.compare.grad_saver import GradSaver +from atat.pytorch.free_benchmark.perturbed_layers.layer_factory import LayerFactory +from atat.pytorch.free_benchmark.result_handlers.handler_factory import ( + FuzzHandlerFactory, +) + + +class FreeBenchmarkCheck(ABC): + + def __init__(self, config) -> None: + super().__init__() + self.config = config + if self.config.pert_mode is None: + self.config.pert_mode = PerturbationMode.IMPROVE_PRECISION + if self.config.fuzz_level is None: + self.config.fuzz_level = FuzzLevel.BASE_LEVEL + if self.config.fuzz_device is None: + self.config.fuzz_device = DeviceType.NPU + self.current_iter = 0 + + def update_iter(self, update_iter): + self.current_iter = update_iter + + def if_fix(self): + if self.config.handler_type==HandlerType.FIX: + return True + return False + + def pre_forward(self, name, module, data_processor, args, kwargs): + if not self.config.fuzz_stage == Const.BACKWARD: + return + origin_func = ( + module._slow_forward if torch._C._get_tracing_state() else module.forward + ) + handler_params = make_handler_params(name, self.config, self.current_iter) + grad_saver = GradSaver(origin_func, handler_params) + grad_saver.kwargs = kwargs + grad_saver.register_compare_func_for_inputs(args, data_processor) + grad_saver.cache_backward_input(args) + setattr(module, "grad_saver", grad_saver) + + def forward(self, name, module, args, kwargs, output): + if not self.config.fuzz_stage == Const.FORWARD: + return output, [] + origin_func = ( + module._slow_forward if torch._C._get_tracing_state() else module.forward + ) + data_params = data_pre_deal(name, origin_func, args, kwargs) + if data_params.valid_input_index == -1: + return output, [] + data_params.original_result = output + data_params.fuzz_stage = self.config.fuzz_stage + + layer = LayerFactory.create( + name, self.config.fuzz_device, self.config.pert_mode + ) + layer.handle(data_params) + handler_params = make_handler_params(name, self.config, self.current_iter) + handler = FuzzHandlerFactory.create(handler_params) + handler.handle(data_params) + return output, handler.get_unequal_rows() + + def backward(self, name, module, grad_output): + + if not self.config.fuzz_stage == Const.BACKWARD: + return + try: + grad_saver = getattr(module, "grad_saver") + except AttributeError: + print_warn_log_rank_0( + f"[atat] Free benchmark: get grad saver failed. 
api_name:{name}" + ) + return + + _new_grad_output = grad_output + try: + need_grad_tensors, _inner_args = grad_saver.get_vjp_input() + origin_grad_input = grad_saver.get_grad_input_from_vjp( + tuple(need_grad_tensors), _new_grad_output, _inner_args + ) + grad_saver.origin_grad_input = tuple([x.cpu() for x in origin_grad_input]) + grad_saver.calculate_perturbed_grad_input( + _new_grad_output, need_grad_tensors, _inner_args + ) + except Exception as e: + print_warn_log_rank_0( + f"[atat] Free benchmark: grad vjp calculate failed. api_name:{name} error: {e}" + ) + return diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/__init__.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/base_layer.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/base_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..aa572fd8e8dc8b62493dfa1fecc587b934c83a99 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/base_layer.py @@ -0,0 +1,13 @@ +from abc import ABC, abstractmethod +from typing import Any + +from atat.pytorch.free_benchmark.common.params import DataParams + + +class BaseLayer(ABC): + def __init__(self, api_name: str) -> None: + self.api_name = api_name + + @abstractmethod + def handle(self, params: DataParams) -> Any: + pass diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/layer_factory.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/layer_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..0d09438ce04132c9c5c301d758dc06818805082e --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/layer_factory.py @@ -0,0 +1,41 @@ +from atat.pytorch.free_benchmark import FreeBenchmarkException +from atat.pytorch.free_benchmark.common.enums import DeviceType, PerturbationMode +from atat.pytorch.free_benchmark.perturbed_layers.npu.improve_precision import ( + ImprovePrecisionLayer, +) +from atat.pytorch.free_benchmark.perturbed_layers.npu.add_noise import AddNoiseLayer +from atat.pytorch.free_benchmark.perturbed_layers.npu.bit_noise import BitNoiseLayer +from atat.pytorch.free_benchmark.perturbed_layers.npu.no_change import NoChangeLayer +from atat.pytorch.free_benchmark.perturbed_layers.npu.change_value import ( + ChangeValueLayer, +) +from atat.pytorch.free_benchmark.perturbed_layers.run_cpu import CpuLayer + + +class LayerFactory: + layers = { + DeviceType.NPU: { + PerturbationMode.ADD_NOISE: AddNoiseLayer, + PerturbationMode.CHANGE_VALUE: ChangeValueLayer, + PerturbationMode.NO_CHANGE: NoChangeLayer, + PerturbationMode.BIT_NOISE: BitNoiseLayer, + PerturbationMode.IMPROVE_PRECISION: ImprovePrecisionLayer, + }, + DeviceType.CPU: {PerturbationMode.TO_CPU: CpuLayer}, + } + + @staticmethod + def create(api_name: str, device_type: str, mode: str): + layer = LayerFactory.layers.get(device_type) + if not layer: + raise FreeBenchmarkException( + FreeBenchmarkException.UnsupportedType, + f"无标杆工具不支持当前设备 {device_type}", + ) + layer = layer.get(mode) + if not layer: + raise FreeBenchmarkException( + FreeBenchmarkException.UnsupportedType, + f"无标杆工具无法识别该扰动因子 {mode} on {device_type}", + ) + return layer(api_name) diff --git 
a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/__init__.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py new file mode 100644 index 0000000000000000000000000000000000000000..d03dbe931d91e5ed91b70c7b2b8fe1fb8f1342fa --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/add_noise.py @@ -0,0 +1,93 @@ +import torch +from atat.pytorch.free_benchmark import ( + print_info_log_rank_0, + print_warn_log_rank_0, +) +from atat.pytorch.free_benchmark.common.constant import ThresholdConfig +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.common.utils import TorchC +from atat.pytorch.free_benchmark.common.enums import PerturbationMode +from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( + NpuBaseLayer, +) + + +class AddNoiseLayer(NpuBaseLayer): + + def _get_noise(self, tensor_obj): + dtype = tensor_obj.dtype + device = str(tensor_obj.device) + noise = TorchC.full( + tensor_obj.shape, + self.perturbed_value, + device=device, + dtype=dtype, + ) + return noise + + def _check_details(self, tensor_obj): + """ + 判断是否需要添加扰动 + """ + if not self.perturbed_value: + print_warn_log_rank_0( + f"[atat] Free Benchmark: For {self.api_name}, " + f"dtype unsupported. Cancel perturbation." + ) + return False + if tensor_obj.numel() == 0: + print_warn_log_rank_0( + f"[atat] Free benchmark: For {self.api_name}, tensor shape must > 0." + f" Cancel adding noise." + ) + return False + abs_tol = ThresholdConfig.ABS_TOL_VALUE_DICT.get( + tensor_obj.dtype, ThresholdConfig.NOISE_INPUT_LOWER_BOUND + ) + try: + max_val = TorchC.max(TorchC.abs(tensor_obj)).item() + except Exception: + print_warn_log_rank_0( + f"[atat] Free Benchmark: For {self.api_name}, " + f"when calculate maximun value, tensor is changed to float32." + ) + max_val = TorchC.max(TorchC.abs(tensor_obj.to(torch.float32))).item() + if max_val < abs_tol: + print_warn_log_rank_0( + f"[atat] Free Benchmark: For {self.api_name}, " + f"Maximun value is less than the minimun threshold. Cancel add noise." + ) + return False + return True + + def add_noise(self, tensor_obj): + if isinstance(tensor_obj, torch.Tensor): + self.perturbed_value = ThresholdConfig.PERTURBATION_VALUE_DICT.get( + tensor_obj.dtype + ) + if not self.pre_check(tensor_obj): + return tensor_obj + noise = self._get_noise(tensor_obj) + result = TorchC.where( + TorchC.gt(TorchC.abs(tensor_obj), self.perturbed_value**0.5), + TorchC.add(noise, tensor_obj), + tensor_obj, + ).to(tensor_obj.dtype) + self.is_added = True + return result + if isinstance(tensor_obj, dict): + return {key: self.add_noise(value) for key, value in tensor_obj.items()} + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)([self.add_noise(value) for value in tensor_obj]) + return tensor_obj + + def handle(self, params: DataParams) -> torch.Any: + """ + 对输入添加扰动并返回 + """ + print_info_log_rank_0( + f"[atat] Free benchmark: Perturbation is " + f"{PerturbationMode.ADD_NOISE} of {self.api_name}." 
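What add_noise computes, in isolation: a small dtype-specific constant is added only where |x| exceeds the square root of that constant, so near-zero values stay untouched. A plain-torch sketch, with eps assumed to be 1e-4 in place of the lookup in ThresholdConfig.PERTURBATION_VALUE_DICT:

```python
import torch

def add_noise_sketch(x: torch.Tensor, eps: float = 1e-4) -> torch.Tensor:
    # only elements with |x| > sqrt(eps) receive the offset
    noise = torch.full(x.shape, eps, dtype=x.dtype, device=x.device)
    return torch.where(x.abs() > eps ** 0.5, x + noise, x).to(x.dtype)

x = torch.tensor([0.0, 0.001, 1.0])
print(add_noise_sketch(x))  # tensor([0.0000, 0.0010, 1.0001])
```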
+ ) + params.perturbed_value = self.add_noise(params.args[params.valid_input_index]) + return self.perturbed_result(params) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py new file mode 100644 index 0000000000000000000000000000000000000000..72d04af412067882826ea402ed6fa00490bce348 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/bit_noise.py @@ -0,0 +1,107 @@ +import torch +from atat.pytorch.free_benchmark import ( + print_info_log_rank_0, + print_warn_log_rank_0, +) +from atat.pytorch.free_benchmark.common.constant import ThresholdConfig +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.common.utils import TorchC +from atat.pytorch.free_benchmark.common.enums import PerturbationMode +from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( + NpuBaseLayer, +) + + +class BitNoiseLayer(NpuBaseLayer): + def __init__(self, api_name): + super().__init__(api_name) + self.bit_mode = TorchC.bitwise_xor + self.bit_tail: int = 1 + self.bit_type = None + + def _check_details(self, tensor_obj): + """ + 判断是否需要添加扰动, bit翻转 + """ + if not self.bit_type: + print_warn_log_rank_0( + f"[atat] Free Benchmark: For {self.api_name}, " + f"dtype unsupported. Cancel perturbation." + ) + return False + if tensor_obj.numel() == 0: + print_warn_log_rank_0( + f"[atat] Free benchmark: For {self.api_name}, tensor shape must > 0" + f" Cancel adding noise." + ) + return False + abs_tol = ThresholdConfig.ABS_TOL_VALUE_DICT.get( + tensor_obj.dtype, ThresholdConfig.NOISE_INPUT_LOWER_BOUND + ) + try: + max_val = TorchC.max(TorchC.abs(tensor_obj)).item() + except Exception: + print_warn_log_rank_0( + f"[atat] Free Benchmark: For {self.api_name}, " + f"when calculate maximun value, tensor is changed to float32." + ) + max_val = TorchC.max(TorchC.abs(tensor_obj.to(torch.float32))).item() + if max_val < abs_tol: + print_warn_log_rank_0( + f"[atat] Free Benchmark: For {self.api_name}, " + f"Maximun value is less than the minimun threshold. Cancel add noise." + ) + return False + return True + + def _set_perturbation_bit(self, tensor_obj): + """ + 根据不同浮点数确定不同位数扰动值 + """ + bit_len_type = ThresholdConfig.PERTURBATION_BIT_DICT.get(tensor_obj.dtype) + if bit_len_type: + self.bit_tail = 1 + self.bit_type = bit_len_type + + def add_bit_noise(self, tensor_obj): + """ + 对输入添加噪声 + """ + # finfo应该列入黑名单 + + if isinstance(tensor_obj, torch.Tensor): + self._set_perturbation_bit(tensor_obj) + if not self.pre_check(tensor_obj): + return tensor_obj + sub_normal = torch.finfo(tensor_obj.dtype).smallest_normal + noise = TorchC.full( + tensor_obj.shape, + self.bit_tail, + device=tensor_obj.device, + dtype=self.bit_type, + ) + result = tensor_obj.view(self.bit_type) + result = TorchC.where( + TorchC.gt(TorchC.abs(tensor_obj), sub_normal), + self.bit_mode(result, noise), + result, + ).view(tensor_obj.dtype) + + self.is_added = True + return result + if isinstance(tensor_obj, dict): + return {key: self.add_bit_noise(value) for key, value in tensor_obj.items()} + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)([self.add_bit_noise(value) for value in tensor_obj]) + return tensor_obj + + def handle(self, params: DataParams) -> torch.Any: + """ + 对输入添加扰动并返回 + """ + print_info_log_rank_0( + f"[atat] Free benchmark: Perturbation is " + f"{PerturbationMode.BIT_NOISE} of {self.api_name}." 
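Bit noise takes a different route: the tensor is reinterpreted at an integer dtype of the same width and the lowest mantissa bit is XOR-flipped, skipping subnormals. A float16 sketch, with int16 as the matching bit width (the patch resolves the pairing through ThresholdConfig.PERTURBATION_BIT_DICT):

```python
import torch

def flip_low_bit(x: torch.Tensor) -> torch.Tensor:
    assert x.dtype == torch.float16      # int16 matches the 16-bit layout
    sub_normal = torch.finfo(x.dtype).tiny  # smallest normal value
    bits = x.view(torch.int16)
    noise = torch.ones_like(bits)
    flipped = torch.where(x.abs() > sub_normal, torch.bitwise_xor(bits, noise), bits)
    return flipped.view(x.dtype)

x = torch.tensor([1.0, 0.0], dtype=torch.float16)
print(flip_low_bit(x))  # 1.0 moves by exactly one ULP, 0.0 is left alone
```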
+ ) + params.perturbed_value = self.add_bit_noise(params.args[params.valid_input_index]) + return self.perturbed_result(params) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/change_value.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/change_value.py new file mode 100644 index 0000000000000000000000000000000000000000..ab91bcb7eeea00085318a21c20bb9f03d69b8908 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/change_value.py @@ -0,0 +1,63 @@ +import torch +from atat.pytorch.free_benchmark import print_warn_log_rank_0, print_info_log_rank_0 +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.common.utils import TorchC +from atat.pytorch.free_benchmark.common.enums import PerturbationMode +from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( + NpuBaseLayer, +) + + +class ChangeValueLayer(NpuBaseLayer): + def __init__(self, api_name): + super().__init__(api_name) + self.head: int = 0 + self.tail: int = -1 + + def _check_details(self, tensor_obj): + """ + 判断是否需要添加扰动, 首尾值交换 + """ + if tensor_obj.size(0) < 2: + print_warn_log_rank_0( + f"[atat] Free Benchmark: For {self.api_name}, " + f"size 0 must greater than 1. Cancel change value." + ) + return False + return True + + def change_value(self, tensor_obj): + """ + 交换张量首尾 + """ + if isinstance(tensor_obj, torch.Tensor) and self.pre_check(tensor_obj): + new_tensor = TorchC.clone(tensor_obj) + if new_tensor.ndim == 1: + temp_first = TorchC.clone(new_tensor[self.head]) + temp_last = TorchC.clone(new_tensor[self.tail]) + new_tensor[self.head] = temp_last + new_tensor[self.tail] = temp_first + else: + temp_first = TorchC.clone(new_tensor[self.head][self.head]) + temp_last = TorchC.clone(new_tensor[self.tail][self.tail]) + new_tensor[self.head][self.head] = temp_last + new_tensor[self.tail][self.tail] = temp_first + + self.is_added = True + return new_tensor + if isinstance(tensor_obj, dict): + return {key: self.change_value(value) for key, value in tensor_obj.items()} + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)([self.change_value(value) for value in tensor_obj]) + return tensor_obj + + def handle(self, params: DataParams) -> torch.Any: + """ + 对输入添加扰动并返回 + """ + print_info_log_rank_0( + f"[atat] Free benchmark: Perturbation is " + f"{PerturbationMode.CHANGE_VALUE} of {self.api_name}." 
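ChangeValueLayer is the cheapest perturbation: it swaps the first and last element (or the two corner elements for higher ranks) and reruns the op, which mostly probes for layout and reduction-order sensitivity. A standalone sketch:

```python
import torch

def swap_ends(x: torch.Tensor) -> torch.Tensor:
    y = x.clone()
    if y.ndim == 1:
        y[0], y[-1] = x[-1].item(), x[0].item()
    else:
        y[0][0], y[-1][-1] = x[-1][-1].item(), x[0][0].item()
    return y

print(swap_ends(torch.tensor([1.0, 2.0, 3.0])))  # tensor([3., 2., 1.])
```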
+ ) + params.perturbed_value = self.change_value(params.args[params.valid_input_index]) + return self.perturbed_result(params) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py new file mode 100644 index 0000000000000000000000000000000000000000..fb126972c6853b81d24db8138880601f9a3af21a --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py @@ -0,0 +1,64 @@ +import torch +from atat.pytorch.free_benchmark import Const, print_info_log_rank_0 +from atat.pytorch.free_benchmark.common.constant import CommonField +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.common.enums import PerturbationMode +from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( + NpuBaseLayer, +) + + +class ImprovePrecisionLayer(NpuBaseLayer): + + def _set_improve_valus(self, inputs): + # TODO why + if inputs.dtype in [torch.float16, torch.bfloat16]: + self.perturbed_value = torch.float32 + + def _change_dtype(self, inputs): + if hasattr(inputs, CommonField.DEVICE): + device = inputs.device + if device is CommonField.META: + new_inputs = inputs.to( + device=CommonField.META, dtype=self.perturbed_value + ) + else: + new_inputs = inputs.to(dtype=self.perturbed_value).to(device) + else: + new_inputs = inputs.to(dtype=self.perturbed_value) + return new_inputs + + def improve_tensor_precision(self, tensor_obj): + if ( + isinstance(tensor_obj, torch.Tensor) + and torch.is_floating_point(tensor_obj) + and tensor_obj.dtype not in [torch.float32, torch.float64] + ): + self._set_improve_valus(tensor_obj) + tensor_obj = self._change_dtype(tensor_obj) + return tensor_obj + if isinstance(tensor_obj, dict): + return { + key: self.improve_tensor_precision(value) + for key, value in tensor_obj.items() + } + if isinstance(tensor_obj, (tuple, list)): + return type(tensor_obj)( + [self.improve_tensor_precision(value) for value in tensor_obj] + ) + return tensor_obj + + def handle(self, params: DataParams) -> torch.Any: + print_info_log_rank_0( + f"[atat] Free benchmark: Perturbation is " + f"{PerturbationMode.IMPROVE_PRECISION} of {self.api_name}." 
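ImprovePrecisionLayer recasts half-precision inputs to float32 and reruns the op, so a large gap between the two results points at precision loss in the low-precision kernel. A simplified recursive sketch of the upcast (device juggling and the meta-device special case are omitted):

```python
import torch

def upcast(obj):
    # promote fp16/bf16 tensors to fp32; leave all other leaves untouched
    if isinstance(obj, torch.Tensor) and obj.dtype in (torch.float16, torch.bfloat16):
        return obj.to(torch.float32)
    if isinstance(obj, dict):
        return {k: upcast(v) for k, v in obj.items()}
    if isinstance(obj, (tuple, list)):
        return type(obj)(upcast(v) for v in obj)
    return obj

args = (torch.ones(2, dtype=torch.float16), {"bias": torch.ones(2, dtype=torch.bfloat16)})
new_args = upcast(args)
print(new_args[0].dtype, new_args[1]["bias"].dtype)  # torch.float32 torch.float32
```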
+ ) + new_args = self.improve_tensor_precision(params.args) + if params.fuzz_stage == Const.BACKWARD: + new_kwargs = {} + else: + new_kwargs = self.improve_tensor_precision(params.kwargs) + if "inplace" in new_kwargs: + new_kwargs["inplace"] = False + params.perturbed_result = params.origin_func(*new_args, **new_kwargs) + return params.perturbed_result diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/no_change.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/no_change.py new file mode 100644 index 0000000000000000000000000000000000000000..7ec5870fb72db30101f41a8ec057bf95d94da9b3 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/no_change.py @@ -0,0 +1,29 @@ +import torch +from atat.pytorch.free_benchmark import print_info_log_rank_0 +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.common.enums import PerturbationMode +from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( + NpuBaseLayer, +) + + +class NoChangeLayer(NpuBaseLayer): + + def no_change(self, tensor_obj): + """ + 不对输入做任何改变、直接二次执行 + """ + self.is_added = True + return tensor_obj + + + def handle(self, params: DataParams) -> torch.Any: + """ + 对输入添加扰动并返回 + """ + print_info_log_rank_0( + f"[atat] Free benchmark: Perturbation is " + f"{PerturbationMode.NO_CHANGE} of {self.api_name}." + ) + params.perturbed_value = self.no_change(params.args[params.valid_input_index]) + return self.perturbed_result(params) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py new file mode 100644 index 0000000000000000000000000000000000000000..ca502365e1b1b4ae0b37e2ecc48bff3b203f765c --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py @@ -0,0 +1,46 @@ +from abc import abstractmethod +from typing import Any +import torch +from atat.pytorch.free_benchmark.common.constant import CommonField, ThresholdConfig +from atat.pytorch.free_benchmark.common.utils import TorchC +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.perturbed_layers.base_layer import BaseLayer + + +class NpuBaseLayer(BaseLayer): + def __init__(self, api_name: str) -> None: + super().__init__(api_name) + self.perturbed_value = None # 扰动的元素 + self.is_added = False # 标记当前算子输入是否调整 + + @abstractmethod + def handle(self, params: DataParams) -> Any: + pass + + def _check_details(self, tensor_obj): + return True + + def pre_check(self, tensor_obj): + """ + 检查张量是否符合标准(float类型且最大值大于对应精度最小值) + """ + # 只针对第一个满足要求的添加扰动 + if self.is_added: + return False + if not torch.is_floating_point(tensor_obj): + return False + if not self._check_details(tensor_obj): + return False + return True + + @staticmethod + def perturbed_result(params: DataParams) -> Any: + args_front = params.args[: params.valid_input_index] + args_rear = params.args[params.valid_input_index + 1 :] + # 此处会将有inplace属性的算子换为非inplace + if "inplace" in params.kwargs: + params.kwargs["inplace"] = False + params.perturbed_result = params.origin_func( + *args_front, params.perturbed_value, *args_rear, **params.kwargs + ) + return params.perturbed_result diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/run_cpu.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/run_cpu.py new file 
mode 100644 index 0000000000000000000000000000000000000000..387f9447fd29276e3c43bcdabf0e8a3a05b8ecec --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/run_cpu.py @@ -0,0 +1,19 @@ +import torch +from atat.pytorch.free_benchmark import print_info_log_rank_0 +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.common.utils import Tools +from atat.pytorch.free_benchmark.common.enums import DeviceType +from atat.pytorch.free_benchmark.perturbed_layers.base_layer import BaseLayer + + +class CpuLayer(BaseLayer): + + def handle(self, params: DataParams) -> torch.Any: + + print_info_log_rank_0( + f"[atat] Free benchmark: Perturbation is to_cpu of {self.api_name}." + ) + new_args = Tools.convert_device_and_dtype(params.args, DeviceType.CPU, change_dtype=True) + new_kwargs = Tools.convert_device_and_dtype(params.kwargs, DeviceType.CPU, change_dtype=True) + params.perturbed_result = params.origin_func(*new_args, **new_kwargs) + return params.perturbed_result diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/__init__.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/base_handler.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/base_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..1d59ef9fc3adc2f90a7145d825ce597e209758e4 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/base_handler.py @@ -0,0 +1,213 @@ +import math +from abc import ABC, abstractmethod +from typing import Any, Optional, Tuple + +import torch +from atat.pytorch.free_benchmark import ( + Const, + print_warn_log_rank_0, +) +from atat.pytorch.free_benchmark.common.utils import TorchC +from atat.pytorch.free_benchmark.common.constant import ThresholdConfig +from atat.pytorch.free_benchmark.common.enums import ( + FuzzThreshold, + NormType, + PerturbationMode, +) +from atat.pytorch.free_benchmark.common.params import DataParams, HandlerParams, make_unequal_row + + +class FuzzHandler(ABC): + def __init__(self, params: HandlerParams) -> None: + self.params = params + self.unequal_rows = [] + + @staticmethod + def pre_process(origin_ouput, perturbed_output): + if ( + isinstance(origin_ouput, tuple) + and hasattr(origin_ouput, "values") + and hasattr(origin_ouput, "indices") + ): + origin_ouput = origin_ouput.values + perturbed_output = perturbed_output.values + if hasattr(perturbed_output, "dtype"): + abs_tol = ThresholdConfig.ABS_TOL_VALUE_DICT.get(perturbed_output.dtype) + else: + abs_tol = FuzzThreshold.F32_THD.value + return ( + origin_ouput.to(perturbed_output.dtype).to(perturbed_output.device), + perturbed_output, + abs_tol, + ) + + def get_ratio_from_specific_norm( + self, origin_output, perturbed_output, norm_type, abs_tol + ): + if norm_type == NormType.ENDLESS_NORM: + return self.get_endless_norm(origin_output, perturbed_output, abs_tol) + return ThresholdConfig.COMP_CONSISTENT + + @staticmethod + def convert_overflow_ratio_to_consistent(ratio): + if math.isnan(ratio) or math.isinf(ratio): + return ThresholdConfig.COMP_CONSISTENT + return ratio + + def get_endless_norm(self, origin_output, perturbed_output, abs_tol): + try: + ratio_tensor1 = TorchC.where( + TorchC.gt(TorchC.abs(perturbed_output), abs_tol), + TorchC.div( + 
TorchC.abs(origin_output),
+                    TorchC.add(TorchC.abs(perturbed_output), abs_tol),
+                ),
+                1,
+            )
+            ratio_tensor2 = TorchC.where(
+                TorchC.gt(TorchC.abs(origin_output), abs_tol),
+                TorchC.div(
+                    TorchC.abs(perturbed_output),
+                    TorchC.add(TorchC.abs(origin_output), abs_tol),
+                ),
+                1,
+            )
+        except Exception:
+            ratio_tensor1 = TorchC.where(
+                TorchC.gt(TorchC.abs(perturbed_output.to(torch.float32)), abs_tol),
+                TorchC.div(
+                    origin_output.to(torch.float32), perturbed_output.to(torch.float32)
+                ),
+                1,
+            )
+            ratio_tensor2 = TorchC.where(
+                TorchC.gt(TorchC.abs(origin_output.to(torch.float32)), abs_tol),
+                TorchC.div(
+                    perturbed_output.to(torch.float32), origin_output.to(torch.float32)
+                ),
+                1,
+            )
+        norm1 = self.convert_overflow_ratio_to_consistent(
+            TorchC.max(ratio_tensor1).item()
+        )
+        norm2 = self.convert_overflow_ratio_to_consistent(
+            TorchC.max(ratio_tensor2).item()
+        )
+        norm3 = self.convert_overflow_ratio_to_consistent(
+            TorchC.min(ratio_tensor1).item()
+        )
+        if norm3 < 0:
+            ratio = ThresholdConfig.SYMBOL_FLIPPING
+        else:
+            ratio = max(norm1, norm2)
+        return ratio
+
+    def ratio_calculate(self, origin_output, perturbed_output, norm_type) -> float:
+        try:
+            origin_output, perturbed_output, abs_tol = self.pre_process(
+                origin_output, perturbed_output
+            )
+        except Exception as e:
+            print_warn_log_rank_0(
+                f"[atat] Free Benchmark: For {self.params.api_name}, "
+                f"when computing ratio,"
+                f" y1 or y2 dtype is not supported {e}"
+            )
+            return ThresholdConfig.COMP_NAN
+        if self.params.fuzz_stage == Const.BACKWARD:
+            abs_tol = ThresholdConfig.BACKWARD_OUTPUT_LOWER_BOUND
+        else:
+            abs_tol = abs_tol**0.5
+        return self.get_ratio_from_specific_norm(
+            origin_output, perturbed_output, norm_type, abs_tol
+        )
+
+    @abstractmethod
+    def get_threshold(self, dtype):
+        pass
+
+    def _get_default_threshold(self, dtype):
+        if self.params.pert_mode == PerturbationMode.NO_CHANGE:
+            threshold = ThresholdConfig.COMP_CONSISTENT
+        else:
+            threshold = ThresholdConfig.DTYPE_PER_THD.get(
+                dtype, ThresholdConfig.DTYPE_PER_THD.get(torch.float32)
+            )
+        return threshold
+
+    def npu_compare(
+        self, origin_output, perturbed_output
+    ) -> Tuple[bool, Optional[float]]:
+        if isinstance(perturbed_output, int):
+            return origin_output == perturbed_output, None
+        elif isinstance(perturbed_output, float):
+            return (
+                math.isclose(origin_output, perturbed_output),
+                origin_output / perturbed_output,
+            )
+        elif not isinstance(perturbed_output, torch.Tensor):
+            print_warn_log_rank_0(
+                f"[atat] Free Benchmark: For {self.params.api_name} "
+                f"The compare for output type {type(perturbed_output)} is not supported"
+            )
+
+        threshold = self.get_threshold(origin_output.dtype)
+        ratio = self.ratio_calculate(
+            origin_output, perturbed_output, norm_type=NormType.ENDLESS_NORM
+        )
+        if ratio == ThresholdConfig.SYMBOL_FLIPPING:
+            is_consistent = False
+        else:
+            is_consistent = threshold >= ratio >= 1 / threshold
+        return is_consistent, ratio
+
+    def cmp_output_npu(self, data_params: DataParams):
+        npu_consistent = True
+        max_fuzz_ratio = 0
+        try:
+            if isinstance(data_params.original_result, torch.Tensor):
+                is_consistent, ratio = self.npu_compare(
+                    data_params.original_result, data_params.perturbed_result
+                )
+                npu_consistent = is_consistent
+                max_fuzz_ratio = (
+                    max_fuzz_ratio if ratio is None else max(max_fuzz_ratio, ratio)
+                )
+                data_params.is_consistent = is_consistent and data_params.is_consistent
+                if not is_consistent and data_params.grad_unequal_flag:
+                    self.unequal_rows.append(
+                        make_unequal_row(data_params, self.params, ratio=ratio)
+                    )
+            elif isinstance(data_params.original_result, (list, tuple)):
+                for index_, origin_item in enumerate(data_params.original_result):
+                    is_consistent, ratio = self.npu_compare(
+                        origin_item, data_params.perturbed_result[index_]
+                    )
+                    npu_consistent = npu_consistent and is_consistent
+                    max_fuzz_ratio = (
+                        max_fuzz_ratio if ratio is None else max(max_fuzz_ratio, ratio)
+                    )
+                    data_params.is_consistent = (
+                        is_consistent and data_params.is_consistent
+                    )
+                    if not is_consistent and data_params.grad_unequal_flag:
+                        self.unequal_rows.append(
+                            make_unequal_row(
+                                data_params, self.params, ratio=ratio, index=index_
+                            )
+                        )
+        except Exception as e:
+            print_warn_log_rank_0(
+                f"[atat] Free Benchmark: For {self.params.api_name}, "
+                f"comparing the results raised an exception: {e}"
+            )
+        return npu_consistent, max_fuzz_ratio
+
+    @abstractmethod
+    def handle(self, data_params: DataParams) -> Any:
+        pass
+
+    def get_unequal_rows(self):
+        return self.unequal_rows
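The consistency test above boils down to a two-sided relative ratio with an absolute-tolerance floor, accepted when it falls inside [1/thd, thd]. A plain-torch sketch; the threshold and abs_tol values are assumptions (the real ones come from ThresholdConfig), and the extra sign-flip verdict for negative minimum ratios is omitted:

```python
import torch

def endless_norm_ratio(y1: torch.Tensor, y2: torch.Tensor, abs_tol: float) -> float:
    ones = torch.ones_like(y1)
    r1 = torch.where(y2.abs() > abs_tol, y1.abs() / (y2.abs() + abs_tol), ones)
    r2 = torch.where(y1.abs() > abs_tol, y2.abs() / (y1.abs() + abs_tol), ones)
    return max(r1.max().item(), r2.max().item())

y1 = torch.tensor([1.0, 2.0])     # original output
y2 = torch.tensor([1.0, 2.002])   # perturbed output
ratio = endless_norm_ratio(y1, y2, abs_tol=1e-8)
thd = 1.004                       # assumed dtype threshold
print(ratio, thd >= ratio >= 1 / thd)  # ~1.001 True
```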
diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/check_handler.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/check_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f590855f1b96e0a6475c87c9b3dfdafd0288332
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/check_handler.py
@@ -0,0 +1,41 @@
+from typing import Any
+
+import torch
+from atat.pytorch.free_benchmark import print_warn_log_rank_0
+from atat.pytorch.free_benchmark.common.enums import DeviceType
+from atat.pytorch.free_benchmark.compare.single_benchmark import SingleCompare
+from atat.pytorch.free_benchmark.common.params import DataParams, make_unequal_row
+from atat.pytorch.free_benchmark.common.utils import Tools
+from atat.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler
+
+
+class CheckerHandler(FuzzHandler):
+    def other_compare(self, data_params: DataParams) -> None:
+        is_consistent = SingleCompare().compare_seq(
+            data_params.original_result, data_params.perturbed_result
+        )
+        if not is_consistent:
+            self.unequal_rows.append(
+                make_unequal_row(data_params, self.params)
+            )
+
+    def get_threshold(self, dtype):
+        return self._get_default_threshold(dtype)
+
+    def handle(self, data_params: DataParams) -> Any:
+        if isinstance(data_params.perturbed_result, bool) or not Tools.is_float_tensor(
+            data_params.perturbed_result
+        ):
+            return data_params.original_result
+        try:
+            if self.params.fuzz_device == DeviceType.NPU:
+                self.cmp_output_npu(data_params)
+            else:
+                self.other_compare(data_params)
+        except Exception as e:
+            print_warn_log_rank_0(
+                f"[atat] Free Benchmark: For {self.params.api_name}, "
+                f"comparing the results raised an exception: {e}"
+            )
+        return data_params.original_result
diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/fix_handler.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/fix_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..789e2653aa0eafc3619fbe3bd192b49dee643a1d
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/fix_handler.py
@@ -0,0 +1,24 @@
+from typing import Any
+
+from atat.pytorch.free_benchmark.common.params import DataParams
+from atat.pytorch.free_benchmark.common.utils import Tools
+from atat.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler
+from atat.pytorch.free_benchmark import print_warn_log_rank_0
+
+
+class FixHandler(FuzzHandler):
+
+    def
get_threshold(self, dtype): + return self._get_default_threshold(dtype) + + def handle(self, data_params: DataParams) -> Any: + try: + return Tools.convert_fuzz_output_to_origin( + data_params.original_result, data_params.perturbed_result + ) + except Exception as e: + print_warn_log_rank_0( + f"[atat] Free Benchmark: For {self.params.api_name} " + f"Fix output failed. " + ) + return data_params.original_result \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/handler_factory.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/handler_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..50f791d81eeb25f8a50a6b4044dbc8e6e09e6a1e --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/handler_factory.py @@ -0,0 +1,32 @@ +from atat.pytorch.free_benchmark import FreeBenchmarkException +from atat.pytorch.free_benchmark.common.constant import PreheatConfig +from atat.pytorch.free_benchmark.common.utils import Tools +from atat.pytorch.free_benchmark.common.enums import HandlerType +from atat.pytorch.free_benchmark.common.params import HandlerParams +from atat.pytorch.free_benchmark.result_handlers.check_handler import CheckerHandler +from atat.pytorch.free_benchmark.result_handlers.preheat_handler import PreheatHandler +from atat.pytorch.free_benchmark.result_handlers.fix_handler import FixHandler + + +class FuzzHandlerFactory: + + result_handlers = { + HandlerType.CHECK: CheckerHandler, + HandlerType.FIX: FixHandler, + HandlerType.PREHEAT: PreheatHandler, + } + + @staticmethod + def create(params: HandlerParams): + if_preheat = params.preheat_config.get(PreheatConfig.IF_PREHEAT) + if not if_preheat: + handler = FuzzHandlerFactory.result_handlers.get(params.handler_type) + else: + handler = FuzzHandlerFactory.result_handlers.get(HandlerType.PREHEAT) + # TODO + if not handler: + raise FreeBenchmarkException( + FreeBenchmarkException.UnsupportedType, + f"无标杆工具支持 [ {HandlerType.CHECK}、{HandlerType.FIX}] 形式", + ) + return handler(params) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/preheat_handler.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/preheat_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..b8ff3bccf00c2dbe699159b4f77da86c75ae4062 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/preheat_handler.py @@ -0,0 +1,174 @@ +from typing import Any + +import torch +import math +from atat.pytorch.free_benchmark import print_info_log_rank_0, print_warn_log_rank_0 +from atat.pytorch.free_benchmark.common.constant import ThresholdConfig +from atat.pytorch.free_benchmark.common.enums import DeviceType +from atat.pytorch.free_benchmark.common.params import DataParams, make_unequal_row +from atat.pytorch.free_benchmark.common.utils import Tools +from atat.pytorch.free_benchmark.compare.single_benchmark import SingleCompare +from atat.pytorch.free_benchmark.common.counter import preheat_counter +from atat.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler +from atat.pytorch.free_benchmark.common.params import HandlerParams + + +class PreheatHandler(FuzzHandler): + + def __init__(self, params: HandlerParams) -> None: + super().__init__(params) + self.pure_name = Tools.get_pure_api_name(self.params.api_name) + + def get_threshold(self, dtype): + return preheat_counter.get_api_thd(self.pure_name, dtype) + + def _is_take_a_sample(self) -> bool: + 
need_sample_set = self._get_need_sample_set()
+        curr_called_seq = preheat_counter.get_api_called_time(self.pure_name)
+        res = curr_called_seq in need_sample_set
+        if res:
+            total_count = preheat_counter.get_one_step_used_api(self.pure_name)
+            print_info_log_rank_0(
+                f"[atat] Free benchmark: preheat sample in step{self.params.step} "
+                f"api_name {self.params.api_name}, "
+                f"curr_called_seq: {curr_called_seq}/{total_count}"
+            )
+            preheat_counter.add_api_sample_time(self.pure_name)
+        return res
+
+    def _get_sample_count_per_step(self) -> int:
+        """
+        Number of samples that should be collected in each step.
+        """
+        total_count = preheat_counter.get_one_step_used_api(self.pure_name)
+        preheat_step = self.params.preheat_config.get("preheat_step")
+        max_sample = self.params.preheat_config.get("max_sample")
+        return min(math.ceil(total_count / preheat_step), max_sample)
+
+    def _get_need_sample_set(self):
+        """
+        Call indices of this API that should be sampled in the current step.
+        """
+        # samples per step
+        total_count = preheat_counter.get_one_step_used_api(self.pure_name)
+        sample_count_per_step = self._get_sample_count_per_step()
+        need_sample_set = set()
+        preheat_step = self.params.preheat_config.get("preheat_step")
+        for i in range(1, sample_count_per_step + 1):
+            count = (preheat_step * (i - 1) + self.params.step) % total_count
+            if count == 0:
+                count = total_count
+            need_sample_set.add(count)
+        return need_sample_set
+
+    def compare_npu_and_cpu(self, data_params: DataParams):
+        args = Tools.convert_device_and_dtype(
+            data_params.args, DeviceType.CPU, change_dtype=True
+        )
+        kwargs = Tools.convert_device_and_dtype(
+            data_params.kwargs, DeviceType.CPU, change_dtype=True
+        )
+        cpu_result = data_params.origin_func(*args, **kwargs)
+        return SingleCompare().compare_seq(data_params.original_result, cpu_result)
+
+    def _need_adjust_threshold(self) -> bool:
+        sample_count_per_step = self._get_sample_count_per_step()
+        sampled_time = preheat_counter.get_api_sample_time(self.pure_name)
+        return sampled_time >= sample_count_per_step
+
+    def _adjust_threshold_for_dtype(self, dtype_str, compare_result):
+        con_ratio = [ratio for ratio, is_consistent in compare_result if is_consistent]
+        incon_ratio = [
+            ratio for ratio, is_consistent in compare_result if not is_consistent
+        ]
+        old_thd = preheat_counter.get_api_thd(self.pure_name, dtype_str)
+        new_thd = old_thd
+        # both consistent and inconsistent samples exist
+        if con_ratio and incon_ratio:
+            if min(incon_ratio) > max(con_ratio):
+                new_thd = min(min(incon_ratio), old_thd)
+                preheat_counter.set_api_preheat(self.pure_name, dtype_str, is_preheat=False)
+        elif con_ratio:
+            # a consistent sample exceeded the band: possible missed detection
+            if max(con_ratio) > old_thd:
+                new_thd = 1 + ((old_thd - 1) * ThresholdConfig.API_THD_STEP)
+            else:
+                new_thd = 1 + ((old_thd - 1) / ThresholdConfig.API_THD_STEP)
+        else:
+            new_thd = min(min(incon_ratio), old_thd)
+            preheat_counter.set_api_preheat(self.pure_name, dtype_str, is_preheat=False)
+        return new_thd
+
+    def _adjust_threshold(self):
+        for dtype_str, compare_result in preheat_counter.preheat_record[
+            self.pure_name
+        ].items():
+            new_thd = self._adjust_threshold_for_dtype(dtype_str, compare_result)
+            threshold = self._get_default_threshold(
+                preheat_counter.dtype_map.get(dtype_str)
+            )
+            preheat_counter.update_api_thd(
+                self.pure_name, dtype_str, new_thd, threshold
+            )
+
+    def preheat(self, max_fuzz_ratio, cpu_consistent, first_dtype):
+        # record every output ratio of this step with its NPU/CPU comparison verdict
+        preheat_counter.update_preheat_record(
+            self.pure_name,
+            first_dtype,
+            (max_fuzz_ratio, cpu_consistent),
+        )
+        if self._need_adjust_threshold():
+            self._adjust_threshold()
+
+    def handle(self, data_params: DataParams) -> Any:
+        if isinstance(data_params.perturbed_result, bool) or not Tools.is_float_tensor(
+            data_params.perturbed_result
+        ):
+            return data_params.original_result
+
+        if self.params.step == 0:
+            preheat_counter.add_one_step_used_api(self.pure_name)
+            return data_params.original_result
+
+        # preheat this api in the current step if required
+        npu_consistent, max_fuzz_ratio = self.cmp_output_npu(data_params)
+        data_params.is_consistent = npu_consistent
+
+        preheat_counter.check_step(self.params.step)
+
+        if self.params.preheat_config.get("preheat_step") <= self.params.step:
+            return data_params.original_result
+
+        if not data_params.grad_unequal_flag:
+            data_params.grad_unequal_flag = True
+            data_params.is_consistent = False
+            return data_params.original_result
+        preheat_counter.add_api_called_time(self.pure_name)
+
+        if not self._is_take_a_sample():
+            return data_params.original_result
+
+        cpu_consistent = True
+        try:
+            cpu_consistent = self.compare_npu_and_cpu(data_params)
+        except Exception as e:
+            print_warn_log_rank_0(
+                f"[atat] Free Benchmark: For {self.params.api_name}, "
+                f"comparing to cpu raised an exception: {e}"
+            )
+        try:
+            first_dtype = Tools.get_first_tensor_dtype(data_params.perturbed_result)
+        except RuntimeError:
+            print_warn_log_rank_0(
+                f"[atat] Free Benchmark: For {self.params.api_name}, "
+                f"the output sequence does not contain tensors."
+            )
+            return data_params.original_result
+        if preheat_counter.get_api_preheat(self.pure_name, str(first_dtype)):
+            self.preheat(max_fuzz_ratio, cpu_consistent, first_dtype)
+
+        return data_params.original_result
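The threshold update implemented by _adjust_threshold_for_dtype has three cases: when consistent and inconsistent ratios separate cleanly, the threshold clamps just below the smallest inconsistent ratio; with consistent samples only, it widens or tightens geometrically around 1; with inconsistent samples only, it clamps directly. A pure-Python sketch, with the step factor assumed to be 2.0 in place of ThresholdConfig.API_THD_STEP:

```python
API_THD_STEP = 2.0  # assumed; the real value lives in ThresholdConfig

def adjust_threshold(old_thd, con_ratio, incon_ratio):
    if con_ratio and incon_ratio:
        if min(incon_ratio) > max(con_ratio):     # cleanly separable
            return min(min(incon_ratio), old_thd)
        return old_thd
    if con_ratio:
        if max(con_ratio) > old_thd:              # band too tight: widen
            return 1 + (old_thd - 1) * API_THD_STEP
        return 1 + (old_thd - 1) / API_THD_STEP   # band too loose: tighten
    return min(min(incon_ratio), old_thd)

print(adjust_threshold(1.004, con_ratio=[1.001, 1.002], incon_ratio=[]))  # 1.002
```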
diff --git a/debug/accuracy_tools/atat/pytorch/functional/__init__.py b/debug/accuracy_tools/atat/pytorch/functional/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..12e530d4c950f6bab9d6fe48861954ca0061e33d
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/functional/__init__.py
@@ -0,0 +1,4 @@
+from .repair import build_repair
+from .scope import build_scope
+from .step_post_process import build_step_post_process
+from .data_collector import build_data_collector
diff --git a/debug/accuracy_tools/atat/pytorch/functional/data_collector.py b/debug/accuracy_tools/atat/pytorch/functional/data_collector.py
new file mode 100644
index 0000000000000000000000000000000000000000..a217207161b3848dcea671c3ea903f2b7c351c80
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/functional/data_collector.py
@@ -0,0 +1,210 @@
+
+import os
+import torch
+from ..module_processer import ModuleProcesser
+from .scope import BaseScope, build_scope, ListScope
+from .json_writer import DataWriter
+from ..common.log import print_info_log, print_warn_log, print_info_log_rank_0, print_error_log_rank_0
+from ..common.utils import Const
+from ..common.file_check import FileOpen
+from .data_processor import build_data_processor, DataProcessor
+
+try:
+    import torch_npu
+except ImportError:
+    pass
+
+forward_init_status = False
+
+def build_data_collector(config):
+    return DataCollector(config)
+
+
+class DataCollector:
+    overflow_task = "overflow_check"
+    tensor_task = "tensor"
+    freebenchmark_task = "free_benchmark"
+    multi_output_apis = ["_sort_", "npu_flash_attention"]
+    tasks_need_tensor_data = [overflow_task, tensor_task, freebenchmark_task]
+    level_without_construct = ["L1", "L2"]
+
+    def __init__(self, config):
+        self.config = config
+        self.data_writer = DataWriter()
+        self.data_processor = build_data_processor(config, self.data_writer)
+        self.module_count = {}
+        if config.task == DataCollector.freebenchmark_task:
+            self.scope = build_scope(ListScope, self.config.scope,
self.config.list) + else: + self.scope = build_scope(None, self.config.scope, self.config.list) + + def if_return_forward_new_output(self): + return self.data_processor.if_return_forward_new_output() + + def get_forward_new_output(self): + return self.data_processor.get_forward_new_output() + + @property + def dump_data_dir(self): + return self.data_writer.dump_tensor_data_dir + + @property + def dump_file_path(self): + return self.data_writer.dump_file_path + + def visit_and_clear_overflow_status(self, api_or_module_name): + self.data_processor.visit_and_clear_overflow_status(api_or_module_name) + + def write_json(self): + self.data_writer.write_json() + + + def update_data(self, data_info, msg=''): + if self.config.task == DataProcessor.overflow: + if self.data_processor.has_overflow: + self.data_writer.update_data(data_info) + msg += "Overflow detected." + else: + msg += "No Overflow, OK." + else: + self.data_writer.update_data(data_info) + return msg + + @staticmethod + def check_scope_and_pid(scope, name, pid): + return (not scope or scope.check(name)) and pid == os.getpid() + + @staticmethod + def is_inplace(module): + return getattr(module, "op_is_inplace", False) + + def pre_forward_data_collect(self, name, module, pid, module_input_output): + backward_name = name.replace("forward", "backward") + if self.check_scope_and_pid(self.scope, backward_name, pid): + self.data_processor.analyze_pre_forward(backward_name, module, module_input_output) + if not self.is_inplace(module): + return + print_info_log(f"API {name} is inplace.") + if self.check_scope_and_pid(self.scope, name, pid): + data_info = self.data_processor.analyze_pre_forward_inplace(name, module_input_output) + self.update_data(data_info) + + def forward_data_collect(self, name, module, pid, module_input_output): + self.collect_data(name, module, pid, module_input_output, Const.FORWARD) + + def backward_data_collect(self, name, module, pid, module_input_output): + self.collect_data(name, module, pid, module_input_output, Const.BACKWARD) + + def collect_data(self, name, module, pid, module_input_output, forward_or_backward): + if self.config.level not in DataCollector.level_without_construct: + self.data_writer.update_construct({name: ModuleProcesser.api_parent_node}) + self.data_writer.update_construct(ModuleProcesser.module_node) + if not self.check_scope_and_pid(self.scope, name, pid): + return + + if self.config.level == "L2": + self.acl_dump(module, module_input_output, name) + return + + msg = f"msProbe is collecting data on {name}. 
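check_scope_and_pid above is the gate every hook passes through: the API name must survive the optional scope filter, and the hook must fire in the process that registered it, so dataloader worker processes are ignored. A small self-contained illustration with a hypothetical minimal scope class:

```python
import os

class AllowListScope:  # hypothetical stand-in for the project's scope objects
    def __init__(self, names):
        self.names = set(names)

    def check(self, name: str) -> bool:
        return name in self.names

def check_scope_and_pid(scope, name, pid):
    return (not scope or scope.check(name)) and pid == os.getpid()

scope = AllowListScope({"Torch.matmul.0.forward"})
print(check_scope_and_pid(scope, "Torch.matmul.0.forward", os.getpid()))  # True
print(check_scope_and_pid(None, "anything", os.getpid()))  # True: no filter set
```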
" + if forward_or_backward == Const.FORWARD: + if not self.is_inplace(module): + data_info = self.data_processor.analyze_forward(name, module, module_input_output) + else: + data_info = self.data_processor.analyze_forward_inplace(name, module_input_output) + self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name)) + elif forward_or_backward == Const.BACKWARD: + data_info = self.data_processor.analyze_backward(name, module, module_input_output) + else: + raise ValueError(f"Unsupported forward_or_backward value: {forward_or_backward}") + + if data_info: + msg = self.update_data(data_info, msg) + print_info_log(msg) + self.data_writer.flush_data_when_buffer_is_full() + + def module_count_func(self, name, name_template): + module_name = name.split(Const.SEP)[-3] + if "forward" in name_template: + if module_name not in self.module_count: + self.module_count[module_name] = [0, [0]] + else: + if self.module_count[module_name][-1] and \ + self.module_count[module_name][0] != self.module_count[module_name][-1][-1]: + self.module_count[module_name][-1].pop() + self.module_count[module_name][0] += 1 + self.module_count[module_name][-1].append(self.module_count[module_name][0]) + index = self.module_count[module_name][0] + else: + backward_stack = self.module_count[module_name][-1] if module_name in self.module_count else [] + if not backward_stack: + index = "abnormal" + else: + index = backward_stack.pop() + return index + + def update_dump_paths(self, *args): + self.data_writer.update_dump_paths(*args) + self.data_writer.initialize_json_file(task=self.config.task, level=self.config.level) + + def update_iter(self, current_iter): + self.data_processor.update_iter(current_iter) + + def acl_dump(self, module, module_input_output, module_name): + if self.config.is_forward_acl_dump: + self.forward_acl_dump(module, module_input_output, module_name) + else: + self.dump_mode_backward_acl_dump(module, module_input_output, module_name) + + def op_need_trigger(self, module_name): + if 'Tensor___getitem___' in module_name: + return True + return False + + def forward_acl_dump(self, module, module_input_output, module_name): + global forward_init_status + if not forward_init_status: + forward_init_status = True + torch_npu.npu.synchronize() + torch_npu.npu.init_dump() + torch_npu.npu.set_dump(self.config.acl_config) + torch_npu.npu.synchronize() + if self.op_need_trigger(module_name): + module.forward(*module_input_output.args, **module_input_output.kwargs).cpu() + else: + module.forward(*module_input_output.args, **module_input_output.kwargs) + torch_npu.npu.synchronize() + torch_npu.npu.finalize_dump() + torch_npu.npu.synchronize() + forward_init_status = False + print_info_log("Dump %s op file." 
% module_name) + + def acl_backward_dump_status(self, output, grad, module_name): + if isinstance(output, torch.Tensor): + output.backward(grad, retain_graph=True) + return True + + for api_name in DataCollector.multi_output_apis: + if api_name in module_name: + output[0].backward(grad, retain_graph=True) + return True + return False + + def dump_mode_backward_acl_dump(self, module, module_input_output, module_name): + global forward_init_status + grad_path = self.config.backward_input.get(module_name) + if not forward_init_status: + forward_init_status = True + output = module.forward(*module_input_output.args, **module_input_output.kwargs) + grad = torch.load(grad_path).to("npu").requires_grad_() + torch_npu.npu.init_dump() + torch_npu.npu.set_dump(self.config.acl_config) + torch_npu.npu.synchronize() + if not self.acl_backward_dump_status(output, grad, module_name): + print_warn_log("The output of {} is not of tensor type and cannot be automatically derived. " + "you can manually construct a single API backward case for ACL dump.".format( + module_name)) + torch_npu.npu.synchronize() + torch_npu.npu.finalize_dump() + forward_init_status = False + print_info_log("Dump %s op file." % module_name) diff --git a/debug/accuracy_tools/atat/pytorch/functional/data_processor.py b/debug/accuracy_tools/atat/pytorch/functional/data_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..1ef1b79acb2172daa3bc85d11ffe4049d1bca942 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/functional/data_processor.py @@ -0,0 +1,539 @@ +import torch +import zlib +import numpy as np +import os +import inspect +from dataclasses import dataclass, asdict +import torch_npu +from typing import Tuple, List, Dict, Optional, Union +from ..common.exceptions import MsaccException +from ..common.file_check import path_len_exceeds_limit, change_mode, FileCheckConst +from ..common.log import print_warn_log +from ..common.utils import Const +from ..common import recursive_apply_transform +from ..functional. 
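The backward ACL dump above replays a saved upstream gradient through the op so the NPU dump hooks can observe the backward kernels. Stripped of the torch_npu plumbing, the trigger is an ordinary autograd call; a CPU-only sketch where the forward output and loaded gradient are stand-ins:

```python
import torch

x = torch.ones(2, requires_grad=True)
output = x * 3                            # stands in for module.forward(*args)
grad = torch.ones_like(output)            # stands in for torch.load(grad_path)
output.backward(grad, retain_graph=True)  # retain_graph permits repeated replays
print(x.grad)  # tensor([3., 3.])
```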
json_writer import DataWriter +from ..free_benchmark import FreeBenchmarkCheck, UnequalRow + +bits_for_overflow = 8 + +def build_data_processor(config, data_writer): + if config.task == DataProcessor.full: + return FullTensorDataProcessor(config, data_writer) + elif config.task == DataProcessor.summary: + return DataProcessor(config, data_writer) + elif config.task == DataProcessor.overflow: + return OverflowTensorDataProcessor(config, data_writer) + elif config.task == DataProcessor.free_benchmark: + return FreeBenchmarkDataProcessor(config, data_writer) + else: + raise MsaccException(MsaccException.INVALID_PARAM_ERROR, + "task should be in [{}, {}, {}, {}]".format( + DataProcessor.full, + DataProcessor.summary, + DataProcessor.overflow, + DataProcessor.free_benchmark + )) + + +@dataclass +class ModuleForwardInputsOutputs: + args: Optional[Tuple] + kwargs: Optional[Dict] + output: Union[Tuple, torch.Tensor] + + @property + def args_tuple(self): + if not isinstance(self.args, tuple): + return (self.args, ) + else: + return self.args + + @property + def output_tuple(self): + if not isinstance(self.output, tuple): + return (self.output, ) + else: + return self.output + + def concat_args_and_kwargs(self): + args = self.args + tuple(self.kwargs.values()) + return args + + +@dataclass +class ModuleBackwardInputsOutputs: + grad_output: Optional[Tuple] + grad_input: Optional[Tuple] + + @property + def grad_input_tuple(self): + if not isinstance(self.grad_input, tuple): + return (self.grad_input, ) + else: + return self.grad_input + + @property + def grad_output_tuple(self): + if not isinstance(self.grad_output, tuple): + return (self.grad_output, ) + else: + return self.grad_output + + +class DataProcessor: + full = "tensor" + summary = "statistics" + overflow = "overflow_check" + free_benchmark = "free_benchmark" + + def __init__(self, config, data_writer): + self.data_writer = data_writer + self.api_info_struct = {} + self.stack_info_struct = {} + self.torch_object_key = { + "device": self.analyze_device_in_kwargs, + "dtype": self.analyze_dtype_in_kwargs + } + self.current_api_or_module_name = None + self.config = config + self.api_data_category = None + self.has_overflow = False + self.current_iter = 0 + + # 需要对forward的output进行更改 + self._return_forward_new_output = False + self._forward_new_output = None + + def if_return_forward_new_output(self): + return self._return_forward_new_output + + def get_forward_new_output(self): + self._return_forward_new_output = False + return self._forward_new_output + + @staticmethod + def get_md5_for_tensor(x): + if x.dtype == torch.bfloat16: + x = x.float() + tensor_bytes = x.cpu().detach().numpy().tobytes() + crc32_hash = zlib.crc32(tensor_bytes) + return f"{crc32_hash:08x}" + + @staticmethod + def analyze_device_in_kwargs(element): + single_arg = {} + single_arg.update({'type': "torch.device"}) + if not isinstance(element, str): + if hasattr(element, "index"): + device_value = element.type + ":" + str(element.index) + else: + device_value = element.type + single_arg.update({"value": device_value}) + else: + single_arg.update({"value": element}) + return single_arg + + @staticmethod + def analyze_dtype_in_kwargs(element): + single_arg = {} + single_arg.update({"type": "torch.dtype"}) + single_arg.update({"value": str(element)}) + return single_arg + + @staticmethod + def _convert_numpy_to_builtin(arg): + type_mapping = { + np.integer: int, + np.floating: float, + np.bool_: bool, + np.complexfloating: complex, + np.str_: str, + np.byte: bytes, + np.unicode_: 
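The two dataclasses above mainly exist to normalize hook payloads: single tensors are wrapped into one-element tuples so downstream code can always iterate. A hypothetical miniature of the same idea:

```python
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch

@dataclass
class FwdIO:  # miniature of ModuleForwardInputsOutputs
    args: Optional[Tuple]
    kwargs: Optional[dict]
    output: Union[Tuple, torch.Tensor]

    @property
    def output_tuple(self) -> Tuple:
        return self.output if isinstance(self.output, tuple) else (self.output,)

io = FwdIO(args=(torch.ones(1),), kwargs={}, output=torch.zeros(1))
print(len(io.output_tuple))  # 1, even though output was a bare tensor
```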
str + } + for numpy_type, builtin_type in type_mapping.items(): + if isinstance(arg, numpy_type): + return builtin_type(arg), type(arg).__name__ + return arg, '' + + def update_iter(self, current_iter): + self.current_iter = current_iter + + def visit_and_clear_overflow_status(self, api_or_module_name): + if self.current_api_or_module_name != api_or_module_name: + self.current_api_or_module_name = api_or_module_name + self.has_overflow = False + + def _analyze_numpy(self, value, numpy_type): + single_arg = {} + single_arg.update({"type": numpy_type}) + single_arg.update({"value": value}) + return single_arg + + def get_stat_info(self, data): + if data.is_meta: + return + data_clone = data.detach() + if data_clone.numel() == 0: + tensor_max = None + tensor_min = None + tensor_mean = None + tensor_norm = None + elif data_clone.dtype == torch.bool: + tensor_max = True in data_clone + tensor_min = False not in data_clone + tensor_mean = None + tensor_norm = None + elif not len(data_clone.shape): + tensor_max = data_clone.item() + tensor_min = tensor_max + tensor_mean = tensor_max + tensor_norm = tensor_max + else: + if not data_clone.is_floating_point(): + data_clone = data_clone.float() + tensor_max = torch._C._VariableFunctionsClass.max(data_clone).item() + tensor_min = torch._C._VariableFunctionsClass.min(data_clone).item() + tensor_mean = torch._C._VariableFunctionsClass.mean(data_clone).item() + tensor_norm = torch._C._VariableFunctionsClass.norm(data_clone).item() + + return tensor_max, tensor_min, tensor_mean, tensor_norm + + def _analyze_builtin(self, arg): + single_arg = {} + if isinstance(arg, slice): + single_arg.update({"type": "slice"}) + # slice参数中可能存在tensor类型,json序列化,需要转换为python数值类型 + values = [ + value if not isinstance(value, torch.Tensor) else value.item() + for value in [arg.start, arg.stop, arg.step] + ] + single_arg.update({"value": values}) + else: + single_arg.update({"type": type(arg).__name__}) + single_arg.update({"value": arg}) + return single_arg + + def _analyze_torch_size(self, arg): + single_arg = {} + single_arg.update({"type": "torch.Size"}) + single_arg.update({"value": list(arg)}) + return single_arg + + def is_dump_for_data_mode(self, forward_backward, input_output): + """ + Compare the parameters with data_mode to determine whether to dump. + + Args: + forward_backward(str): The forward or backward mode to check. + input_output(str): The input or output mode to check. + + Return: + bool: True if the parameters are in data_mode or data_mode is all, False otherwise. 
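get_stat_info above special-cases empty, boolean, scalar and integer tensors before reducing. A simplified sketch of the same decision ladder (the 0-dim scalar branch and the private torch._C calls are omitted; the plain public reductions are assumed to be equivalent here):

```python
import torch

def stat_info(data: torch.Tensor):
    d = data.detach()
    if d.numel() == 0:
        return None, None, None, None        # nothing to summarize
    if d.dtype == torch.bool:
        return bool(d.any()), bool(d.all()), None, None
    if not d.is_floating_point():
        d = d.float()                        # ints: cast so mean/norm are defined
    return d.max().item(), d.min().item(), d.mean().item(), d.norm().item()

print(stat_info(torch.tensor([3, 4])))  # (4.0, 3.0, 3.5, 5.0)
```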
+ """ + return (Const.ALL in self.config.data_mode or + forward_backward in self.config.data_mode or + input_output in self.config.data_mode) + + @staticmethod + def handle_tensor_extremum_nan_inf(data_clone, operator): + data_nan = torch._C._VariableFunctionsClass.isnan(data_clone) + if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel(): + return float('nan') + finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone) + if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0: + finite_values = data_clone[finite_mask] + return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \ + torch._C._VariableFunctionsClass.min(finite_values).item() + else: + data_no_nan = data_clone[~data_nan] + return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \ + torch._C._VariableFunctionsClass.min(data_no_nan).item() + + def _analyze_maybe_overflow_tensor(self, tensor_json, tensor): + data_clone = tensor.detach() + if hasattr(torch_npu._C, '_npu_is_support_inf_nan') and torch_npu._C._npu_is_support_inf_nan(): + if tensor_json['Max'] is None: + return + if np.isinf(tensor_json['Max']) or np.isnan(tensor_json['Max']): + tensor_json['Max_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(data_clone, "max") + self.has_overflow = True + if np.isinf(tensor_json['Min']) or np.isnan(tensor_json['Min']): + tensor_json['Min_except_inf_nan'] = self.handle_tensor_extremum_nan_inf(data_clone, "min") + self.has_overflow = True + else: + self.has_overflow = check_overflow_npu() + if self.has_overflow: + clear_overflow_npu() + + def _analyze_tensor(self, tensor, suffix): + tensor_max, tensor_min, tensor_mean, tensor_norm = self.get_stat_info(tensor) + + tensor_json = {} + tensor_json.update({'type': 'torch.Tensor'}) + tensor_json.update({'dtype': str(tensor.dtype)}) + tensor_json.update({"shape": tensor.shape}) + tensor_json.update({"Max": tensor_max}) + tensor_json.update({"Min": tensor_min}) + self._analyze_maybe_overflow_tensor(tensor_json, tensor) + tensor_json.update({"Mean": tensor_mean}) + tensor_json.update({"Norm": tensor_norm}) + tensor_json.update({"requires_grad": tensor.requires_grad}) + if self.config.summary_mode == "md5": + tensor_md5 = self.get_md5_for_tensor(tensor) + tensor_json.update({"md5": tensor_md5}) + + return tensor_json + + def analyze_single_element(self, element, suffix_stack): + if suffix_stack and suffix_stack[-1] in self.torch_object_key: + return self.torch_object_key[suffix_stack[-1]](element) + + if isinstance(element, torch.Size): + return self._analyze_torch_size(element) + + converted_numpy, numpy_type = self._convert_numpy_to_builtin(element) + if converted_numpy is not element: + return self._analyze_numpy(converted_numpy, numpy_type) + + if isinstance(element, torch.Tensor): + return self._analyze_tensor(element, Const.SEP.join(suffix_stack)) + + if isinstance(element, (bool, int, float, str, slice)): + return self._analyze_builtin(element) + + def analyze_element(self, element): + return recursive_apply_transform(element, self.analyze_single_element) + + @staticmethod + def analyze_api_call_stack(name): + stack_str = [] + for (_, path, line, func, code, _) in inspect.stack()[5:]: + if not code: + continue + stack_line = " ".join([ + "File", ", ".join([ + path, + " ".join(["line", str(line)]), + " ".join(["in", func]), + " ".join(["\n", code[0].strip()]) + ]) + ]) + stack_str.append(stack_line) + stack_info_struct = {name: stack_str} + return stack_info_struct + + def 
analyze_pre_forward(self, name, module, + module_input_output: ModuleForwardInputsOutputs): + pass + + def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs): + api_info_struct = {} + if self.is_dump_for_data_mode(Const.FORWARD, Const.INPUT): # check whether data_mode contains forward or input + api_info_struct[name] = {} + self.api_data_category = Const.INPUT + args_info_list = self.analyze_element(module_input_output.args_tuple) + api_info_struct[name][Const.INPUT_ARGS] = args_info_list + + self.api_data_category = Const.KWARGS + kwargs_info_list = self.analyze_element(module_input_output.kwargs) + api_info_struct[name][Const.INPUT_KWARGS] = kwargs_info_list + + if self.is_dump_for_data_mode(Const.FORWARD, Const.OUTPUT): # check whether data_mode contains forward or output + api_info_struct[name] = api_info_struct.get(name, {}) + self.api_data_category = Const.OUTPUT + output_info_list = self.analyze_element(module_input_output.output_tuple) + api_info_struct[name][Const.OUTPUT] = output_info_list + + return api_info_struct + + def analyze_pre_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs): + api_info_struct = {} + if self.is_dump_for_data_mode(Const.FORWARD, Const.INPUT): + api_info_struct[name] = {} + self.api_data_category = Const.INPUT + args_info_list = self.analyze_element(module_input_output.args_tuple) + api_info_struct[name][Const.INPUT_ARGS] = args_info_list + + self.api_data_category = Const.KWARGS + kwargs_info_list = self.analyze_element(module_input_output.kwargs) + api_info_struct[name][Const.INPUT_KWARGS] = kwargs_info_list + + return api_info_struct + + def analyze_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs): + concat_args = module_input_output.concat_args_and_kwargs() + api_info_struct = {} + if self.is_dump_for_data_mode(Const.FORWARD, Const.OUTPUT): + api_info_struct[name] = {} + self.api_data_category = Const.OUTPUT + output_info_list = self.analyze_element(concat_args) + api_info_struct[name][Const.OUTPUT] = output_info_list + + return api_info_struct + + + def analyze_backward(self, name, module, module_input_output: ModuleBackwardInputsOutputs): + api_info_struct = {} + if self.is_dump_for_data_mode(Const.BACKWARD, Const.OUTPUT): + api_info_struct[name] = {} + self.api_data_category = Const.OUTPUT + input_info_list = self.analyze_element(module_input_output.grad_input_tuple) + api_info_struct[name][Const.GRAD_INPUT] = input_info_list + + if self.is_dump_for_data_mode(Const.BACKWARD, Const.INPUT): + api_info_struct[name] = api_info_struct.get(name, {}) + self.api_data_category = Const.INPUT + output_info_list = self.analyze_element(module_input_output.grad_output_tuple) + api_info_struct[name][Const.GRAD_OUTPUT] = output_info_list + + return api_info_struct + + +class FullTensorDataProcessor(DataProcessor): + + def _analyze_tensor(self, tensor, suffix): + self.data_path = self.data_writer.dump_tensor_data_dir + dump_data_name = (self.current_api_or_module_name + Const.SEP + self.api_data_category + Const.SEP + + suffix + ".pt") + file_path = os.path.join(self.data_writer.dump_tensor_data_dir, dump_data_name) + if not path_len_exceeds_limit(file_path): + torch.save(tensor, file_path) + change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) + else: + print_warn_log(f'The file path {file_path} length exceeds limit.') + single_arg = super()._analyze_tensor(tensor, suffix) + single_arg.update({"data_name": dump_data_name}) + return single_arg + + +class 
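handle_tensor_extremum_nan_inf earlier in this file reports an extremum over finite values when inf/NaN are present, falling back to the non-NaN values (that is, +/-inf) when nothing finite remains. A plain-torch sketch of the max case:

```python
import torch

def finite_max(t: torch.Tensor) -> float:
    nan_mask = torch.isnan(t)
    if int(nan_mask.sum()) == t.numel():
        return float("nan")               # all-NaN: nothing better to report
    finite = t[torch.isfinite(t)]
    if finite.numel() > 0:
        return finite.max().item()
    return t[~nan_mask].max().item()      # only +/-inf and NaN remain

print(finite_max(torch.tensor([1.0, float("inf"), float("nan")])))  # 1.0
```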
OverflowTensorDataProcessor(DataProcessor): + __slots__ = ["cached_tensors_and_file_paths"] + + def __init__(self, config, data_writer): + super().__init__(config, data_writer) + self.cached_tensors_and_file_paths = {} + self.real_overflow_dump_times = 0 + self.overflow_nums = config.overflow_num + + def _analyze_tensor(self, tensor, suffix): + self.data_path = self.data_writer.dump_tensor_data_dir + dump_data_name = (self.current_api_or_module_name + Const.SEP + self.api_data_category + Const.SEP + + suffix + ".pt") + file_path = os.path.join(self.data_writer.dump_tensor_data_dir, dump_data_name) + if not path_len_exceeds_limit(file_path): + self.cached_tensors_and_file_paths.update({file_path: tensor}) + else: + print_warn_log(f'The file path {file_path} length exceeds limit.') + single_arg = super()._analyze_tensor(tensor, suffix) + single_arg.update({"data_name": dump_data_name}) + return single_arg + + def analyze_forward(self, name, module, + module_input_output: ModuleForwardInputsOutputs): + self.has_overflow = False + api_info_struct = super().analyze_forward(name, module, module_input_output) + self.maybe_save_overflow_data_and_check_overflow_times() + return api_info_struct if self.has_overflow else None + + def analyze_backward(self, name, module, + module_input_output: ModuleBackwardInputsOutputs): + self.has_overflow = False + api_info_struct = super().analyze_backward(name, module, module_input_output) + self.maybe_save_overflow_data_and_check_overflow_times() + return api_info_struct if self.has_overflow else None + + def maybe_save_overflow_data_and_check_overflow_times(self): + if self.has_overflow: + for file_path, tensor in self.cached_tensors_and_file_paths.items(): + torch.save(tensor, file_path) + change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) + self.inc_and_check_overflow_times() + self.cached_tensors_and_file_paths = {} + + def inc_and_check_overflow_times(self): + self.real_overflow_dump_times += 1 + if self.overflow_nums == -1: + return + if self.real_overflow_dump_times >= self.overflow_nums: + raise MsaccException(MsaccException.OVERFLOW_NUMS_ERROR, + str(self.real_overflow_dump_times)) + + +class FreeBenchmarkDataProcessor(DataProcessor): + + def __init__(self, config, data_writer): + super().__init__(config, data_writer) + self.checker = FreeBenchmarkCheck(config=config) + + def update_iter(self, current_iter): + self.current_iter = current_iter + self.checker.update_iter(current_iter) + + def update_unequal_rows(self, unequal_rows: List[UnequalRow]): + if len(unequal_rows) == 0: + return + for row in unequal_rows: + data_dict = asdict(row) + self.data_writer.write_data_to_csv( + data_dict.values(), + data_dict.keys(), + self.data_writer.free_benchmark_file_path + ) + return + + def analyze_pre_forward(self, name, module, + module_input_output: ModuleForwardInputsOutputs): + args = module_input_output.args + kwargs = module_input_output.kwargs + self.checker.pre_forward(name, module, self, args, kwargs) + + def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs): + new_output, unequal_rows = self.checker.forward( + name, + module, + module_input_output.args, + module_input_output.kwargs, + module_input_output.output, + ) + self.update_unequal_rows(unequal_rows) + if self.checker.if_fix(): + self._return_forward_new_output = True + self._forward_new_output = new_output + return None + + def analyze_backward(self, name, module, module_input_output: ModuleBackwardInputsOutputs): + self.checker.backward(name, module, 
module_input_output.grad_output) + return None + + + +def overflow_debug_mode_enable(): + overflow_mode = os.getenv(OverflowConst.OVERFLOW_DEBUG_MODE_ENABLE, Const.ENV_DISABLE) + return overflow_mode == Const.ENV_ENABLE + +def check_overflow_npu(): + if overflow_debug_mode_enable(): + float_status = torch.zeros(bits_for_overflow).npu() + result = torch_npu.npu_get_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE) + if (result.cpu()[0] != 0): + return True + else: + return False + else: + return torch_npu._C._check_overflow_npu() + +def clear_overflow_npu(): + if overflow_debug_mode_enable(): + float_status = torch.zeros(bits_for_overflow).npu() + torch_npu.npu_clear_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE) + else: + torch_npu._C._clear_overflow_npu() + +class OverflowConst: + """ + Class for Overflow + """ + OVERFLOW_DEBUG_MODE_ENABLE = "OVERFLOW_DEBUG_MODE_ENABLE" + OVERFLOW_ORIGINAL_MODE = 0 + OVERFLOW_DEBUG_MODE = 1 diff --git a/debug/accuracy_tools/atat/pytorch/functional/dump_module.py b/debug/accuracy_tools/atat/pytorch/functional/dump_module.py new file mode 100644 index 0000000000000000000000000000000000000000..fed73ad5374178fac01180bb905468b9e7c747fa --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/functional/dump_module.py @@ -0,0 +1,38 @@ +import torch.nn as nn +from atat.core.utils import print_error_log, DumpException +from .scope import BaseScope +from ..common.utils import Const +from ..hook_module.api_registry import api_register +from ..debugger.precision_debugger import PrecisionDebugger + +module_count = {} + + +def module_dump(module, dump_name): + if not isinstance(module, nn.Module): + print_error_log("The parameter:module in module_dump is not a Module subclass.") + raise DumpException(DumpException.INVALID_PARAM_ERROR) + if not isinstance(dump_name, str): + print_error_log("The parameter:dump_name in module_dump is not a str type.") + raise DumpException(DumpException.INVALID_PARAM_ERROR) + api_register.api_originality() + if dump_name not in module_count: + module_count[dump_name] = 0 + else: + module_count[dump_name] += 1 + dump_name = dump_name + Const.SEP + str(module_count.get(dump_name)) + Const.SEP + + pdg = PrecisionDebugger() + _, forward_hook, backward_hook = pdg.service.build_hook(BaseScope.Module_Type_Module, dump_name) + module.register_forward_hook(forward_hook, with_kwargs=True) + module.register_full_backward_hook(backward_hook) + + module.register_forward_pre_hook(pdg.service.module_processor.node_hook(dump_name + Const.FORWARD, Const.START)) + module.register_forward_hook(pdg.service.module_processor.node_hook(dump_name + Const.FORWARD, Const.STOP)) + module.register_full_backward_pre_hook( + pdg.service.module_processor.node_hook(dump_name + Const.BACKWARD, Const.START)) + module.register_full_backward_hook(pdg.service.module_processor.node_hook(dump_name + Const.BACKWARD, Const.STOP)) + + +def module_dump_end(): + api_register.api_modularity() diff --git a/debug/accuracy_tools/atat/pytorch/functional/json_writer.py b/debug/accuracy_tools/atat/pytorch/functional/json_writer.py new file mode 100644 index 0000000000000000000000000000000000000000..0fee3aa9731aa79c2f0e5857fb8596a86e86b6d7 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/functional/json_writer.py @@ -0,0 +1,120 @@ +import os +import csv +from pathlib import Path +import json +from ..common.file_check import FileCheckConst, change_mode +from ..common.log import print_info_log_rank_0 +from ..common.utils import Const + + +class DataWriter: # 
TODO: UT + # dump_json_name = "dump.json" + # stack_json_name = "stack.json" + # construct_json_name = "construct.json" + + def __init__(self, init_json=None) -> None: + self.dump_count = 0 + self.init_json = init_json + self.dump_file_path = None # os.path.join(dump_dir, DataWriter.dump_json_name) + self.stack_file_path = None # os.path.join(dump_dir, DataWriter.stack_json_name) + self.construct_file_path = None # os.path.join(dump_dir, DataWriter.construct_json_name) + self.free_benchmark_file_path = None + self.dump_tensor_data_dir = None + self.buffer_size = 1000 + self.cache_data = {"data": {}} + self.cache_stack = {} + self.cache_construct = {} + + def initialize_json_file(self, **kwargs): + kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, "data": {}}) + with os.fdopen( + os.open(self.dump_file_path, Const.OVERWRITE_FLAGS, FileCheckConst.DATA_FILE_AUTHORITY), 'w' + ) as f: + json.dump(kwargs, f) + + if os.path.exists(self.stack_file_path): + os.remove(self.stack_file_path) + Path(self.stack_file_path).touch() + change_mode(self.stack_file_path, FileCheckConst.DATA_FILE_AUTHORITY) + + if os.path.exists(self.construct_file_path): + os.remove(self.construct_file_path) + Path(self.construct_file_path).touch() + change_mode(self.construct_file_path, FileCheckConst.DATA_FILE_AUTHORITY) + + def update_dump_paths(self, dump_file_path, stack_file_path, construct_file_path, dump_data_dir, free_benchmark_file_path): + self.dump_file_path = dump_file_path + self.stack_file_path = stack_file_path + self.construct_file_path = construct_file_path + self.dump_tensor_data_dir = dump_data_dir + self.free_benchmark_file_path = free_benchmark_file_path + + def update_data(self, new_data): + key = next(iter(new_data.keys())) # assert len(new_data.keys()) == 1 + if key in self.cache_data["data"]: + self.cache_data["data"][key].update(new_data[key]) + else: + self.cache_data["data"].update(new_data) + + def flush_data_when_buffer_is_full(self): + if len(self.cache_data["data"]) >= self.buffer_size: + self.write_data_json(self.dump_file_path) + + def update_stack(self, new_data): + self.cache_stack.update(new_data) + + def update_construct(self, new_data): + self.cache_construct.update(new_data) + + def write_data_json(self, file_path): + import fcntl + print_info_log_rank_0(f"dump.json is at {os.path.dirname(os.path.dirname(file_path))}. 
") + if Path(file_path).exists() and os.path.getsize(file_path) > 0: + with open(file_path, "r+") as f: + fcntl.flock(f, fcntl.LOCK_EX) + data_to_write = json.load(f) + fcntl.flock(f, fcntl.LOCK_UN) + else: + self.init_json['data_path'] = self.dump_tensor_data_dir + data_to_write = self.init_json + data_to_write['data'].update(self.cache_data['data']) + with open(file_path, 'w+') as f: + fcntl.flock(f, fcntl.LOCK_EX) + json.dump(data_to_write, f, indent=1) + fcntl.flock(f, fcntl.LOCK_UN) + + self.cache_data["data"].clear() + + def write_stack_info_json(self, file_path): + import fcntl + with open(file_path, 'w+') as f: + fcntl.flock(f, fcntl.LOCK_EX) + json.dump(self.cache_stack, f, indent=1) + fcntl.flock(f, fcntl.LOCK_UN) + + def write_construct_info_json(self, file_path): + import fcntl + with open(file_path, 'w+') as f: + fcntl.flock(f, fcntl.LOCK_EX) + json.dump(self.cache_construct, f, indent=1) + fcntl.flock(f, fcntl.LOCK_UN) + + def write_json(self): + self.write_data_json(self.dump_file_path) + self.write_stack_info_json(self.stack_file_path) + self.write_construct_info_json(self.construct_file_path) + + @staticmethod + def write_data_to_csv(result: list, result_header: tuple, file_path: str): + if len(result) == 0: + return + is_exists = os.path.exists(file_path) + append = "a+" if is_exists else "w+" + with os.fdopen( + os.open(file_path, Const.WRITE_FLAGS, FileCheckConst.DATA_FILE_AUTHORITY), append, newline="" + ) as csv_file: + spawn_writer = csv.writer(csv_file) + if not is_exists: + spawn_writer.writerow(result_header) + spawn_writer.writerows([result,]) + \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/functional/repair.py b/debug/accuracy_tools/atat/pytorch/functional/repair.py new file mode 100644 index 0000000000000000000000000000000000000000..aed8326424f5e171a9a71d21cfeb48db6fb26fb3 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/functional/repair.py @@ -0,0 +1,90 @@ +from abc import ABC, abstractmethod + +import torch + +from .scope import build_scope, ListScope, BaseScope +from ..common.exceptions import RepairException +from ..common import recursive_apply_transform, print_info_log_rank_0 + + +def build_repair(config): + if config.repair_type is None: + return None + elif config.repair_type == RepairAPI.ToCPU: + return RepairAPI_toCPU(config) + elif config.repair_type == RepairAPI.RaisePrecision: + return RepairAPI_raise(config) + else: + raise RepairException(RepairException.InvalidRepairType, f"精度修复类型" + f"须配置为'{RepairAPI.ToCPU}'或'{RepairAPI.RaisePrecision}," + f"实际配置为{config.repair_type}") + + +class RepairAPI(ABC): + ToCPU = "cpu" + RaisePrecision = "raise" + + def __init__(self, config): + self.config = config + self.scope = build_scope(ListScope, config.repair_scope, config.repair_api_str) + self.saved, self.towards = "None", "None" + + def check_name_and_module_type(self, name, module_type): + if module_type == BaseScope.Module_Type_Module: + return False + if not self.scope.check(name): + return False + return True + + def convert(self, name, module_type, args, kwargs): + is_target = self.check_name_and_module_type(name, module_type) + if is_target: + args = recursive_apply_transform(args, self.fx) + kwargs = recursive_apply_transform(kwargs, self.fx) + print_info_log_rank_0(f"[msProbe] convert inputs of {name} to " + f"{self.towards}.") + return args, kwargs + + def invert(self, name, module_type, out_feat): + is_target = self.check_name_and_module_type(name, module_type) + if is_target: + out_feat = 
+
+
+class RepairAPI_toCPU(RepairAPI):
+    def fx(self, arg, _):
+        if isinstance(arg, torch.Tensor):
+            self.saved = arg.device
+            self.towards = torch.device("cpu")
+            return arg.cpu()
+        return arg
+
+    def inv_fx(self, arg, _):
+        if isinstance(arg, torch.Tensor):
+            return arg.to(self.saved)
+        return arg
+
+
+class RepairAPI_raise(RepairAPI):
+    raise_dtype_map = {
+        torch.bfloat16: torch.float32,
+        torch.float16: torch.float32
+    }
+
+    def fx(self, arg, _):
+        if isinstance(arg, torch.Tensor):
+            self.saved = arg.dtype
+            self.towards = RepairAPI_raise.raise_dtype_map.get(self.saved)
+            # known issue: nested inputs may mix dtypes; only the last dtype seen is saved and inverted
+            return arg.to(self.towards)
+        return arg
+
+    def inv_fx(self, arg, _):
+        if isinstance(arg, torch.Tensor):
+            return arg.to(self.saved)
+        return arg
diff --git a/debug/accuracy_tools/atat/pytorch/functional/scope.py b/debug/accuracy_tools/atat/pytorch/functional/scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..e557b876b1b00beef60dd623175374ad20d6a287
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/functional/scope.py
@@ -0,0 +1,174 @@
+from abc import ABC, abstractmethod
+from ..common.exceptions import ScopeException
+from ..common.utils import Const
+
+
+def build_scope(scope_class, scope=None, api_list=None):
+    scope = scope if scope is not None else []
+    api_list = api_list if api_list is not None else []
+    if not scope and not api_list:
+        return None
+    if scope_class:
+        return scope_class(scope, api_list)
+    return build_range_scope_according_to_scope_name(scope, api_list)
+
+
+def build_range_scope_according_to_scope_name(scope, api_list):
+    api_range_scope = APIRangeScope(scope, api_list)
+    module_range_scope = ModuleRangeScope(scope, api_list)
+    if not scope:  # without a scope argument, either scope class behaves the same
+        return api_range_scope
+    if api_range_scope.is_valid and module_range_scope.is_valid:
+        raise ScopeException(ScopeException.InvalidScope, f"scope={scope}.")
+    elif api_range_scope.is_valid:
+        return api_range_scope
+    elif module_range_scope.is_valid:
+        return module_range_scope
+    else:
+        raise ScopeException(ScopeException.InvalidScope, f"scope={scope}")
+
+
+class BaseScope(ABC):
+    Module_Type_Module = "Module"
+    Module_Type_API = "api"
+
+    @staticmethod
+    def rectify_args(scope, api_list):
+        if not isinstance(api_list, list):
+            raise ScopeException(ScopeException.InvalidApiStr,
+                                 f"api_list must be a list, got {type(api_list)}.")
+        for api in api_list:
+            if not isinstance(api, str):
+                raise ScopeException(ScopeException.InvalidApiStr,
+                                     f"api_list elements must be strings, got {type(api)}.")
+        if isinstance(scope, str):
+            scope = [scope]
+            return scope, api_list
+        if not isinstance(scope, list):
+            raise ScopeException(ScopeException.InvalidScope,
+                                 f"scope must be a string or a list, got {type(scope)}.")
+        for s in scope:
+            if not isinstance(s, str):
+                raise ScopeException(ScopeException.InvalidScope,
+                                     f"scope list elements must be strings, got {type(s)}.")
+        return scope, api_list
+
+    def __init__(self, scope, api_list):
+        scope, api_list = self.rectify_args(scope, api_list)
+        self.scope = scope
+        self.api_list = api_list
+
+    def check_api_list(self, api_name):
+        if not self.api_list:
+            return True
+        for api_str in self.api_list:
+            if api_str in api_name:
+                return True
+        return False
+
+    @abstractmethod
+    def check(self, name):
+        pass
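+
+# Editorial sketch (hedged): BaseScope subclasses answer "should this name be
+# dumped?". ListScope matches an explicit allow-list, while the range scopes
+# below track a [start, stop] window. The names here are hypothetical:
+#
+#     scope = build_scope(ListScope, ["Tensor.add.0.forward"], [])
+#     scope.check("Tensor.add.0.forward")    # True: explicitly listed
+#     scope.check("Tensor.mul.0.forward")    # False: not listed
+#
+#     api_scope = build_scope(None, ["Tensor.add.0.forward", "Tensor.mul.2.forward"], [])
+#     # APIRangeScope: check() returns True from the start name until the stop name.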
f"scope和api_list不可以同时配置,实际配置为scope={scope}, api_list={api_list}.") + return super(ListScope, ListScope).rectify_args(scope, api_list) + + def check(self, module_name): + if not self.scope or module_name in self.scope: + return self.check_api_list(module_name) + return False + + +class RangeScope(BaseScope, ABC): + @staticmethod + def rectify_args(scope, api_list): + scope, api_list = super(RangeScope, RangeScope).rectify_args(scope, api_list) + if isinstance(scope, list): + if len(scope) == 1: + scope.append(scope[0]) + elif len(scope) > 2: + raise ScopeException(ScopeException.InvalidScope, + f"scope参数指定区间断点,须传入长度为1或2的列表,实际长度为{len(scope)}.") + + return scope, api_list + + @abstractmethod + def check_scope_is_valid(self): + pass + + def __init__(self, *args): + super().__init__(*args) + self.in_scope = False + self.is_valid = self.check_scope_is_valid() + + def begin_module(self, module_name): + pass + + def end_module(self, module_name): + pass + + +class APIRangeScope(RangeScope): + def check_scope_is_valid(self): + if not self.scope: + return True + scope_start_type = self.scope[0].split(Const.SEP)[0] + if scope_start_type == BaseScope.Module_Type_Module: + return False + scope_stop_type = self.scope[1].split(Const.SEP)[0] + if scope_stop_type == BaseScope.Module_Type_Module: + return False + return True + + def check(self, api_name): + if self.scope and api_name == self.scope[0]: + self.in_scope = True + + if not self.scope or self.in_scope: + result = self.check_api_list(api_name) + else: + result = False + + if self.scope and api_name == self.scope[1]: + self.in_scope = False + return result + + +class ModuleRangeScope(RangeScope): + """ + 模块与api不同的是,模块内部还有子结构需要dump, + 需要用pre_hook和full_backward_hook来精确控制module的开始和结束, + 在这些hook触发时调用begin_module和end_module做区间控制 + """ + def check_scope_is_valid(self): + if not self.scope: + return True + scope_start_type = self.scope[0].split(Const.SEP)[0] + scope_stop_type = self.scope[1].split(Const.SEP)[0] + if scope_start_type == BaseScope.Module_Type_Module and \ + scope_stop_type == BaseScope.Module_Type_Module: + return True + return False + + def begin_module(self, module_name): + if not self.scope: + return + if module_name == self.scope[0]: + self.in_scope = True + + def end_module(self, module_name): + if not self.scope: + return + if module_name == self.scope[1]: + self.in_scope = False + + def check(self, module_name): + if not self.scope or self.in_scope: + return self.check_api_list(module_name) + return False + + + diff --git a/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py b/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py new file mode 100644 index 0000000000000000000000000000000000000000..7f0d3459326f04691a0041c120bf4efc676f8bc1 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py @@ -0,0 +1,43 @@ +from abc import ABC, abstractmethod +from ..common.exceptions import StepException + + +def run_parallel_ut(config): + pass + + +def compare_distrbuted(config): + pass + + +def build_step_post_process(config): + if not config.on_step_end: + return None + if config.on_step_end == StepPostProcess.SingleAPICheck: + return SingleAPICheck(config) + elif config.on_step_end == StepPostProcess.Compare: + return AutoCompare(config) + else: + raise StepException(StepException.InvalidPostProcess, f"step后处理须配置为" + f"'{StepPostProcess.SingleAPICheck}'或'{StepPostProcess.Compare}'," + f"实际配置为{config.on_step_end}") + + +class StepPostProcess(ABC): + SingleAPICheck = 'single_api_check' + 
+
+
+class SingleAPICheck:
+    def __init__(self, config):
+        self.config = config
+
+    def run(self):
+        run_parallel_ut(self.config)
+
+
+class AutoCompare:
+    def __init__(self, config):
+        self.config = config
+
+    def run(self):
+        compare_distributed(self.config.bench_dump_path, self.config.dump_path)
diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/__init__.py b/debug/accuracy_tools/atat/pytorch/hook_module/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e7a5ca15e8d08d0bb886866bf413712796c9edd
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/hook_module/__init__.py
@@ -0,0 +1 @@
+from .wrap_functional import remove_dropout
\ No newline at end of file
diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/api_registry.py b/debug/accuracy_tools/atat/pytorch/hook_module/api_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..003a8699cd750a424bf989ae9d1b3fac78f76650
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/hook_module/api_registry.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+# Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import torch
+import torch.distributed as dist
+from . import wrap_torch, wrap_functional, wrap_tensor, wrap_vf, wrap_distributed, wrap_aten
+from .wrap_torch import get_torch_ops
+from .wrap_functional import get_functional_ops
+from .wrap_tensor import get_tensor_ops
+from .wrap_vf import get_vf_ops
+from .wrap_distributed import get_distributed_ops
+from .wrap_aten import get_aten_ops
+from ..common.utils import torch_without_guard_version, npu_distributed_api, is_gpu
+torch_version_above_2 = torch.__version__.split('+')[0] > '2.0'
+
+if not is_gpu:
+    import torch_npu
+    from . import wrap_npu_custom
+    from .wrap_npu_custom import get_npu_ops
+
+
+class ApiRegistry:
+    def __init__(self):
+        self.tensor_ori_attr = {}
+        self.torch_ori_attr = {}
+        self.functional_ori_attr = {}
+        self.distributed_ori_attr = {}
+        self.npu_distributed_ori_attr = {}
+        self.vf_ori_attr = {}
+        self.aten_ori_attr = {}
+        self.torch_npu_ori_attr = {}
+
+        self.tensor_hook_attr = {}
+        self.torch_hook_attr = {}
+        self.functional_hook_attr = {}
+        self.distributed_hook_attr = {}
+        self.npu_distributed_hook_attr = {}
+        self.vf_hook_attr = {}
+        self.aten_hook_attr = {}
+        self.torch_npu_hook_attr = {}
+
+    @staticmethod
+    def store_ori_attr(ori_api_group, api_list, api_ori_attr):
+        for api in api_list:
+            if '.' in api:
+                sub_module_name, sub_op = api.rsplit('.', 1)
+                sub_module = getattr(ori_api_group, sub_module_name)
+                api_ori_attr[api] = getattr(sub_module, sub_op)
+            else:
+                api_ori_attr[api] = getattr(ori_api_group, api)
+
+    @staticmethod
+    def set_api_attr(api_group, attr_dict):
+        for api, api_attr in attr_dict.items():
+            if '.' 
in api: + sub_module_name, sub_op = api.rsplit('.', 1) + sub_module = getattr(api_group, sub_module_name, None) + if sub_module is not None: + setattr(sub_module, sub_op, api_attr) + else: + setattr(api_group, api, api_attr) + + def api_modularity(self): + self.set_api_attr(torch.Tensor, self.tensor_hook_attr) + self.set_api_attr(torch, self.torch_hook_attr) + self.set_api_attr(torch.nn.functional, self.functional_hook_attr) + self.set_api_attr(dist, self.distributed_hook_attr) + self.set_api_attr(dist.distributed_c10d, self.distributed_hook_attr) + if not is_gpu and not torch_without_guard_version: + self.set_api_attr(torch_npu.distributed, self.npu_distributed_hook_attr) + self.set_api_attr(torch_npu.distributed.distributed_c10d, self.npu_distributed_hook_attr) + if torch_version_above_2: + self.set_api_attr(torch.ops.aten, self.aten_hook_attr) + self.set_api_attr(torch._VF, self.vf_hook_attr) + if not is_gpu: + self.set_api_attr(torch_npu, self.torch_npu_hook_attr) + + def api_originality(self): + self.set_api_attr(torch.Tensor, self.tensor_ori_attr) + self.set_api_attr(torch, self.torch_ori_attr) + self.set_api_attr(torch.nn.functional, self.functional_ori_attr) + self.set_api_attr(dist, self.distributed_ori_attr) + self.set_api_attr(dist.distributed_c10d, self.distributed_ori_attr) + if not is_gpu and not torch_without_guard_version: + self.set_api_attr(torch_npu.distributed, self.npu_distributed_ori_attr) + self.set_api_attr(torch_npu.distributed.distributed_c10d, self.npu_distributed_ori_attr) + if torch_version_above_2: + self.set_api_attr(torch.ops.aten, self.aten_ori_attr) + self.set_api_attr(torch._VF, self.vf_ori_attr) + if not is_gpu: + self.set_api_attr(torch_npu, self.torch_npu_ori_attr) + + def initialize_hook(self, hook): + self.store_ori_attr(torch.Tensor, get_tensor_ops(), self.tensor_ori_attr) + wrap_tensor.wrap_tensor_ops_and_bind(hook) + for attr_name in dir(wrap_tensor.HOOKTensor): + if attr_name.startswith("wrap_"): + self.tensor_hook_attr[attr_name[5:]] = getattr(wrap_tensor.HOOKTensor, attr_name) + + self.store_ori_attr(torch, get_torch_ops(), self.torch_ori_attr) + wrap_torch.wrap_torch_ops_and_bind(hook) + for attr_name in dir(wrap_torch.HOOKTorchOP): + if attr_name.startswith("wrap_"): + self.torch_hook_attr[attr_name[5:]] = getattr(wrap_torch.HOOKTorchOP, attr_name) + + self.store_ori_attr(torch.nn.functional, get_functional_ops(), self.functional_ori_attr) + wrap_functional.wrap_functional_ops_and_bind(hook) + for attr_name in dir(wrap_functional.HOOKFunctionalOP): + if attr_name.startswith("wrap_"): + self.functional_hook_attr[attr_name[5:]] = getattr(wrap_functional.HOOKFunctionalOP, attr_name) + + self.store_ori_attr(dist, get_distributed_ops(), self.distributed_ori_attr) + wrap_distributed.wrap_distributed_ops_and_bind(hook) + if not is_gpu and not torch_without_guard_version: + self.store_ori_attr(torch_npu.distributed, npu_distributed_api, self.npu_distributed_ori_attr) + for attr_name in dir(wrap_distributed.HOOKDistributedOP): + if attr_name.startswith("wrap_"): + self.distributed_hook_attr[attr_name[5:]] = getattr(wrap_distributed.HOOKDistributedOP, attr_name) + if not is_gpu and not torch_without_guard_version and attr_name[5:] in npu_distributed_api: + self.npu_distributed_hook_attr[attr_name[5:]] = getattr(wrap_distributed.HOOKDistributedOP, + attr_name) + + if torch_version_above_2: + self.store_ori_attr(torch.ops.aten, get_aten_ops(), self.aten_ori_attr) + wrap_aten.wrap_aten_ops_and_bind(hook) + for attr_name in dir(wrap_aten.HOOKAtenOP): + if 
attr_name.startswith("wrap_"): + self.aten_hook_attr[attr_name[5:]] = getattr(wrap_aten.HOOKAtenOP, attr_name) + + self.store_ori_attr(torch._VF, get_vf_ops(), self.vf_ori_attr) + wrap_vf.wrap_vf_ops_and_bind(hook) + for attr_name in dir(wrap_vf.HOOKVfOP): + if attr_name.startswith("wrap_"): + self.vf_hook_attr[attr_name[5:]] = getattr(wrap_vf.HOOKVfOP, attr_name) + + if not is_gpu: + self.store_ori_attr(torch_npu, get_npu_ops(), self.torch_npu_ori_attr) + wrap_npu_custom.wrap_npu_ops_and_bind(hook) + for attr_name in dir(wrap_npu_custom.HOOKNpuOP): + if attr_name.startswith("wrap_"): + self.torch_npu_hook_attr[attr_name[5:]] = getattr(wrap_npu_custom.HOOKNpuOP, attr_name) + + +api_register = ApiRegistry() diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/hook_module.py b/debug/accuracy_tools/atat/pytorch/hook_module/hook_module.py new file mode 100644 index 0000000000000000000000000000000000000000..ae4a7abdab12e46fcf25e7594b9016ca347599bc --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/hook_module.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import functools +import threading +import torch +import torch.nn as nn +import torch.utils.hooks as full_hooks +from ..common.utils import Const + +class HOOKModule(nn.Module): + module_count = {} + inner_stop_hook = {} + + def __init__(self, build_hook) -> None: + super(HOOKModule, self).__init__() + self.has_overflow = False + self.prefix = "" + self.current_thread = threading.current_thread().ident + if self.current_thread not in HOOKModule.inner_stop_hook: + HOOKModule.inner_stop_hook[self.current_thread] = False + self.stop_hook = HOOKModule.inner_stop_hook.get(self.current_thread, False) + + if not self.stop_hook: + if hasattr(self, "prefix_op_name_"): + self.prefix = self.prefix_op_name_ + + if self.prefix not in HOOKModule.module_count: + HOOKModule.module_count[self.prefix] = 1 + self.prefix += '0' + Const.SEP + else: + HOOKModule.module_count[self.prefix] += 1 + self.prefix = self.prefix + str(HOOKModule.module_count[self.prefix] - 1) + Const.SEP + forward_pre_hook, forward_hook, backward_hook = build_hook(self.prefix) + self.register_forward_pre_hook(forward_pre_hook, with_kwargs=True) + self.register_forward_hook(forward_hook, with_kwargs=True) + self.register_backward_hook(backward_hook) + + def __call__(self, *input, **kwargs): + changed = False + if not self.stop_hook: + HOOKModule.inner_stop_hook[self.current_thread] = True + changed = True + result = self._call_func(*input, **kwargs) + if changed: + HOOKModule.inner_stop_hook[self.current_thread] = False + return result + + def _call_func(self, *input, **kwargs): + full_backward_hooks, non_full_backward_hooks = [], [] + if len(self._backward_hooks) > 0: + full_backward_hooks, non_full_backward_hooks = self._get_backward_hooks() + for hook in self._forward_pre_hooks.values(): + result_input, 
result_kwargs = hook(self, input, kwargs) + if result_input is not None: + if not isinstance(result_input, tuple): + result_input = (result_input,) + input = result_input + if result_kwargs is not None: + kwargs = result_kwargs + bw_hook = None + if len(full_backward_hooks) > 0: + bw_hook = full_hooks.BackwardHook(self, full_backward_hooks) + input = bw_hook.setup_input_hook(input) + if torch._C._get_tracing_state(): + result = self._slow_forward(*input, **kwargs) + else: + result = self.forward(*input, **kwargs) + for hook in self._forward_hooks.values(): + hook_result = hook(self, input, kwargs, result) + if hook_result is not None: + result = hook_result + if bw_hook: + result = bw_hook.setup_output_hook(result) + if len(non_full_backward_hooks) > 0: + var = result + while not isinstance(var, torch.Tensor): + if isinstance(var, dict): + var = next((v for v in var.values() if isinstance(v, torch.Tensor))) + elif isinstance(var, (list, tuple)): + if var: + var = var[0] + else: + return result + else: + return result + grad_fn = var.grad_fn + if grad_fn is not None: + for hook in non_full_backward_hooks: + wrapper = functools.partial(hook, self) + functools.update_wrapper(wrapper, hook) + grad_fn.register_hook(wrapper) + self._maybe_warn_non_full_backward_hook(input, result, grad_fn) + return result diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bab7cd7696b33081df2630b527d79ba30cccbe2 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/support_wrap_ops.yaml @@ -0,0 +1,1877 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
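+#
+# Editorial note (hedged, inferred from api_registry.py in this change): each
+# top-level key below (functional, tensor, torch, _VF, torch_npu, aten,
+# distributed) is read by the matching get_*_ops() helper, and only the ops
+# listed here receive hook wrappers. A hypothetical addition would look like:
+#
+#   functional:
+#     - my_custom_op    # hypothetical: hooks nn.functional.my_custom_op if it exists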
+ +# List of ops that register hooks + +functional: + - conv1d + - conv2d + - conv3d + - conv_transpose1d + - conv_transpose2d + - conv_transpose3d + - conv_tbc + - avg_pool1d + - avg_pool2d + - avg_pool3d + - fractional_max_pool2d_with_indices + - fractional_max_pool2d + - fractional_max_pool3d_with_indices + - fractional_max_pool3d + - max_pool1d_with_indices + - max_pool1d + - max_pool2d_with_indices + - max_pool2d + - max_pool3d_with_indices + - max_pool3d + - max_unpool1d + - max_unpool2d + - max_unpool3d + - lp_pool2d + - lp_pool1d + - adaptive_max_pool1d_with_indices + - adaptive_max_pool1d + - adaptive_max_pool2d_with_indices + - adaptive_max_pool2d + - adaptive_max_pool3d_with_indices + - adaptive_max_pool3d + - adaptive_avg_pool1d + - adaptive_avg_pool2d + - adaptive_avg_pool3d + - dropout + - alpha_dropout + - dropout2d + - dropout3d + - feature_alpha_dropout + - threshold + - threshold_ + - relu + - relu_ + - glu + - hardtanh + - hardtanh_ + - relu6 + - elu + - elu_ + - selu + - selu_ + - celu + - celu_ + - leaky_relu + - leaky_relu_ + - prelu + - rrelu + - rrelu_ + - logsigmoid + - gelu + - hardshrink + - tanhshrink + - softsign + - softplus + - softmin + - softmax + - gumbel_softmax + - log_softmax + - softshrink + - tanh + - sigmoid + - hardsigmoid + - linear + - bilinear + - silu + - hardswish + - embedding + - embedding_bag + - batch_norm + - instance_norm + - layer_norm + - group_norm + - local_response_norm + - ctc_loss + - nll_loss + - poisson_nll_loss + - gaussian_nll_loss + - kl_div + - cross_entropy + - binary_cross_entropy + - binary_cross_entropy_with_logits + - smooth_l1_loss + - l1_loss + - mse_loss + - margin_ranking_loss + - hinge_embedding_loss + - multilabel_margin_loss + - soft_margin_loss + - multilabel_soft_margin_loss + - cosine_embedding_loss + - multi_margin_loss + - pixel_shuffle + - pixel_unshuffle + - channel_shuffle + - upsample + - interpolate + - upsample_nearest + - upsample_bilinear + - grid_sample + - affine_grid + - pad + - pairwise_distance + - pdist + - cosine_similarity + - one_hot + - triplet_margin_loss + - triplet_margin_with_distance_loss + - normalize + - unfold + - fold + - multi_head_attention_forward + - scaled_dot_product_attention + +tensor: + - __add__ + - __and__ + - __bool__ + - __div__ + - __eq__ + - __ge__ + - __gt__ + - __getitem__ + - __iadd__ + - __iand__ + - __idiv__ + - __ifloordiv__ + - __ilshift__ + - __imod__ + - __imul__ + - __ior__ + - __irshift__ + - __isub__ + - __ixor__ + - __lshift__ + - __matmul__ + - __mod__ + - __mul__ + - __nonzero__ + - __or__ + - __radd__ + - __rmul__ + - __rshift__ + - __setitem__ + - __sub__ + - __truediv__ + - __xor__ + - abs + - abs_ + - absolute + - absolute_ + - acos + - acos_ + - acosh + - acosh_ + - add + - add_ + - addbmm + - addbmm_ + - addcdiv + - addcdiv_ + - addcmul + - addcmul_ + - addmm + - addmm_ + - addmv + - addmv_ + - addr + - addr_ + - align_as + - align_to + - all + - allclose + - amax + - amin + - angle + - any + - arccos + - arccos_ + - arccosh + - arccosh_ + - arcsin + - arcsin_ + - arcsinh + - arcsinh_ + - arctan + - arctan_ + - arctanh + - arctanh_ + - argmax + - argmin + - argsort + - asin + - asin_ + - asinh + - asinh_ + - atan + - atan2 + - atan2_ + - atan_ + - atanh + - atanh_ + - baddbmm + - baddbmm_ + - bernoulli + - bernoulli_ + - bincount + - bitwise_and + - bitwise_and_ + - bitwise_not + - bitwise_not_ + - bitwise_or + - bitwise_or_ + - bitwise_xor + - bitwise_xor_ + - bmm + - broadcast_to + - cauchy_ + - ceil + - ceil_ + - cholesky + - chunk + - clamp + 
- cholesky_solve + - cholesky_inverse + - clamp_ + - clamp_max + - clamp_max_ + - clip + - clamp_min + - clamp_min_ + - clip_ + - copysign + - copysign_ + - cos + - cos_ + - cosh + - cosh_ + - count_nonzero + - cummax + - cummin + - cumprod + - cumprod_ + - cumsum + - cumsum_ + - deg2rad + - deg2rad_ + - det + - diag + - diag_embed + - diagflat + - diagonal + - diff + - dist + - digamma + - digamma_ + - div + - div_ + - divide + - divide_ + - dot + - eig + - eq + - eq_ + - erf + - equal + - erf_ + - erfc + - erfc_ + - erfinv + - erfinv_ + - exp + - exp2 + - exp2_ + - expm1 + - exp_ + - expm1_ + - exponential_ + - fill_ + - fix + - fill_diagonal_ + - fix_ + - flip + - fliplr + - flatten + - flipud + - float_power + - float_power_ + - floor + - floor_ + - floor_divide + - floor_divide_ + - fmax + - fmin + - fmod + - fmod_ + - frac + - frac_ + - gather + - gcd + - gcd_ + - ge + - ge_ + - geometric_ + - geqrf + - ger + - greater + - greater_ + - gt + - gt_ + - greater_equal + - greater_equal_ + - hardshrink + - heaviside + - heaviside_ + - histc + - hypot + - hypot_ + - igamma + - igamma_ + - igammac + - igammac_ + - index_add + - index_add_ + - inverse + - index_copy + - index_copy_ + - index_fill + - index_fill_ + - index_put + - index_put_ + - inner + - index_select + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - isreal + - kron + - kthvalue + - lcm + - lcm_ + - ldexp + - ldexp_ + - le + - le_ + - lerp + - lerp_ + - where + - less + - less_ + - less_equal + - less_equal_ + - lgamma + - lgamma_ + - log + - log10 + - log10_ + - log1p + - log1p_ + - log2 + - log2_ + - log_ + - log_normal_ + - log_softmax + - logcumsumexp + - logdet + - logaddexp + - logaddexp2 + - logical_and + - logical_and_ + - logical_not + - logit + - logical_not_ + - logical_or + - logical_or_ + - logical_xor + - logical_xor_ + - logit_ + - logsumexp + - lstsq + - lt + - lt_ + - lu_solve + - map2_ + - map_ + - masked_fill + - matmul + - masked_fill_ + - masked_scatter + - masked_scatter_ + - masked_select + - matrix_exp + - max + - maximum + - mean + - matrix_power + - median + - min + - minimum + - mm + - mode + - msort + - mul + - mul_ + - multinomial + - multiply + - multiply_ + - mv + - mvlgamma + - mvlgamma_ + - nansum + - narrow + - narrow_copy + - ne + - ne_ + - neg + - neg_ + - negative + - negative_ + - nonzero + - norm + - normal_ + - not_equal + - not_equal_ + - permute + - pinverse + - polygamma + - pow + - pow_ + - polygamma_ + - prelu + - prod + - put_ + - rad2deg + - rad2deg_ + - ravel + - real + - reciprocal + - reciprocal_ + - relu + - relu_ + - remainder + - repeat_interleave + - reshape + - remainder_ + - renorm + - renorm_ + - repeat + - reshape_as + - resize_ + - resize_as_ + - roll + - rot90 + - round + - round_ + - rsqrt + - rsqrt_ + - scatter + - scatter_ + - scatter_add + - scatter_add_ + - select + - sgn + - sgn_ + - sigmoid + - sigmoid_ + - sign + - sign_ + - signbit + - sin + - sin_ + - sinc + - sinc_ + - sinh + - sinh_ + - slogdet + - smm + - softmax + - solve + - sort + - split_with_sizes + - sqrt + - sqrt_ + - square + - square_ + - squeeze + - squeeze_ + - sspaddmm + - std + - sub + - sub_ + - sum + - sum_to_size + - svd + - symeig + - t + - t_ + - take + - tan + - tan_ + - tanh + - tanh_ + - tensor_split + - tile + - topk + - transpose + - transpose_ + - triangular_solve + - tril + - tril_ + - triu + - true_divide + - triu_ + - true_divide_ + - trunc + - trunc_ + - type_as + - unbind + - unflatten + - unfold + - unsafe_chunk + - unsqueeze + - unsafe_split + - 
unsafe_split_with_sizes + - var + - vdot + - unsqueeze_ + - view_as + - xlogy + - xlogy_ + +torch: + - linalg.norm + - linalg.vector_norm + - linalg.matrix_norm + - linalg.diagonal + - linalg.det + - linalg.slogdet + - linalg.cond + - linalg.matrix_rank + - linalg.qr + - linalg.lu + - linalg.lu_factor + - linalg.svd + - linalg.svdvals + - linalg.solve + - linalg.lstsq + - linalg.inv + - linalg.pinv + - linalg.matrix_exp + - linalg.matrix_power + - linalg.cross + - linalg.matmul + - linalg.vecdot + - linalg.multi_dot + - linalg.householder_product + - linalg.tensorsolve + - linalg.vander + - linalg.cholesky_ex + - linalg.inv_ex + - linalg.solve_ex + - linalg.lu_factor_ex + - linalg.ldl_factor + - linalg.ldl_factor_ex + - _adaptive_avg_pool2d + - _add_relu + - _add_relu_ + - _aminmax + - _batch_norm_impl_index + - _convolution + - _foreach_norm + - _softmax_backward_data + - abs + - abs_ + - absolute + - acos + - acos_ + - acosh + - acosh_ + - adaptive_avg_pool1d + - adaptive_max_pool1d + - add + - addbmm + - addcdiv + - addcmul + - addmm + - addmv + - addmv_ + - addr + - amax + - affine_grid_generator + - align_tensors + - all + - alpha_dropout + - amin + - alpha_dropout_ + - angle + - any + - arange + - arccos + - arccos_ + - arccosh + - arccosh_ + - arcsin + - arcsin_ + - arcsinh + - arcsinh_ + - arctan + - arctan_ + - arctanh + - arctanh_ + - argmax + - argmin + - argsort + - asin + - asin_ + - asinh + - asinh_ + - atan + - atan2 + - atan_ + - atanh + - atanh_ + - atleast_1d + - atleast_2d + - atleast_3d + - avg_pool1d + - baddbmm + - bartlett_window + - batch_norm_backward_elemt + - batch_norm_backward_reduce + - batch_norm_elemt + - batch_norm_gather_stats + - batch_norm_gather_stats_with_counts + - bernoulli + - batch_norm_stats + - batch_norm_update_stats + - bilinear + - bincount + - binomial + - binary_cross_entropy_with_logits + - bitwise_and + - bitwise_not + - bitwise_or + - bitwise_xor + - blackman_window + - block_diag + - bmm + - broadcast_tensors + - broadcast_to + - bucketize + - cartesian_prod + - cat + - cdist + - ceil + - ceil_ + - celu + - celu_ + - chain_matmul + - channel_shuffle + - cholesky + - cholesky_inverse + - cholesky_solve + - choose_qparams_optimized + - chunk + - clamp + - clamp_ + - clamp_max + - clamp_max_ + - clamp_min + - clamp_min_ + - clip + - clip_ + - clone + - column_stack + - combinations + - concat + - concatenate + - constant_pad_nd + - conv1d + - conv2d + - conv3d + - conv_tbc + - conv_transpose1d + - conv_transpose2d + - conv_transpose3d + - cos + - convolution + - copysign + - cos_ + - cosh + - cosh_ + - cosine_embedding_loss + - cosine_similarity + - count_nonzero + - cov + - cross + - ctc_loss + - cummax + - cummin + - cumprod + - cumsum + - deg2rad + - deg2rad_ + - det + - diag + - diag_embed + - diff + - diagflat + - diagonal + - digamma + - dist + - div + - divide + - dot + - dropout + - dropout_ + - dsmm + - dstack + - eig + - einsum + - embedding + - embedding_bag + - embedding_renorm_ + - eq + - equal + - erf + - erf_ + - erfc + - erfc_ + - erfinv + - exp + - exp2 + - exp2_ + - exp_ + - expm1 + - expm1_ + - eye + - feature_dropout + - feature_alpha_dropout + - feature_alpha_dropout_ + - feature_dropout_ + - fix + - fill_ + - fix_ + - flatten + - flip + - fliplr + - flipud + - float_power + - floor + - floor_ + - floor_divide + - fmax + - fmin + - fmod + - frac + - frac_ + - full + - frobenius_norm + - full_like + - gather + - gcd + - gcd_ + - ge + - geqrf + - ger + - greater + - greater_equal + - grid_sampler + - grid_sampler_2d + - 
group_norm + - grid_sampler_3d + - gru + - gru_cell + - gt + - hamming_window + - hann_window + - hardshrink + - heaviside + - hinge_embedding_loss + - histc + - hsmm + - hspmm + - hstack + - hypot + - igamma + - igammac + - index_add + - index_copy + - inner + - index_fill + - index_put + - index_put_ + - index_select + - instance_norm + - inverse + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - istft + - kaiser_window + - kl_div + - kron + - kthvalue + - layer_norm + - lcm + - lcm_ + - ldexp + - ldexp_ + - le + - lerp + - less + - less_equal + - lgamma + - linspace + - log + - log10 + - log10_ + - log1p + - log1p_ + - log2 + - log2_ + - log_softmax + - log_ + - logaddexp + - logaddexp2 + - logcumsumexp + - logdet + - logical_and + - logical_not + - logical_or + - logical_xor + - logit + - logit_ + - logspace + - logsumexp + - lstm + - lstm_cell + - lstsq + - lt + - lu_solve + - lu_unpack + - masked_fill + - margin_ranking_loss + - masked_scatter + - masked_select + - matrix_exp + - matmul + - matrix_power + - matrix_rank + - max + - max_pool1d + - max_pool2d + - max_pool1d_with_indices + - max_pool3d + - maximum + - mean + - median + - min + - minimum + - mm + - mode + - moveaxis + - movedim + - msort + - mul + - multinomial + - multiply + - mv + - mvlgamma + - nan_to_num + - nan_to_num_ + - nanmedian + - nansum + - narrow + - native_batch_norm + - native_group_norm + - narrow_copy + - native_layer_norm + - native_norm + - ne + - neg + - negative + - neg_ + - negative_ + - nextafter + - nonzero + - norm + - norm_except_dim + - normal + - not_equal + - nuclear_norm + - ones_like + - outer + - pairwise_distance + - pdist + - permute + - pinverse + - pixel_shuffle + - pixel_unshuffle + - poisson + - poisson_nll_loss + - polar + - polygamma + - pow + - prelu + - prod + - qr + - quantile + - rad2deg + - rad2deg_ + - range + - ravel + - real + - reciprocal + - relu + - reciprocal_ + - relu_ + - remainder + - renorm + - repeat_interleave + - reshape + - resize_as_ + - roll + - rot90 + - round + - round_ + - rrelu + - rrelu_ + - rsqrt + - row_stack + - rsqrt_ + - rsub + - saddmm + - scalar_tensor + - scatter + - select + - scatter_add + - searchsorted + - selu + - selu_ + - sgn + - sigmoid + - sigmoid_ + - sign + - signbit + - sin + - sin_ + - sinc + - sinc_ + - sinh + - sinh_ + - slogdet + - smm + - softmax + - solve + - sort + - sparse_coo_tensor + - square + - split + - split_with_sizes + - spmm + - sqrt + - sqrt_ + - square_ + - squeeze + - sspaddmm + - stack + - std + - std_mean + - stft + - sub + - subtract + - sum + - svd + - swapaxes + - swapdims + - symeig + - t + - take + - take_along_dim + - tan + - tan_ + - tanh + - tanh_ + - tensordot + - tensor_split + - threshold + - threshold_ + - tile + - topk + - transpose + - trapz + - triangular_solve + - tril + - tril_indices + - triplet_margin_loss + - triu + - triu_indices + - true_divide + - trunc + - trunc_ + - unique_consecutive + - xlogy + - unbind + - unsafe_chunk + - unsafe_split + - vander + - var + - vdot + - unsafe_split_with_sizes + - unsqueeze + - var_mean + - vstack + - where + - xlogy_ + +_VF: + - lstm + +torch_npu: + - one_ + - npu_sort_v2 + - npu_transpose + - npu_broadcast + - npu_dtype_cast + - empty_with_format + - npu_one_hot + - npu_stride_add + - npu_ps_roi_pooling + - npu_roi_align + - npu_nms_v4 + - npu_iou + - npu_nms_with_mask + - npu_pad + - npu_bounding_box_encode + - npu_bounding_box_decode + - npu_batch_nms + - npu_slice + - _npu_dropout + - npu_indexing + - npu_ifmr + - npu_max + - 
npu_scatter + - npu_layer_norm_eval + - npu_alloc_float_status + - npu_confusion_transpose + - npu_bmmV2 + - fast_gelu + - npu_sub_sample + - npu_deformable_conv2d + - npu_mish + - npu_anchor_response_flags + - npu_yolo_boxes_encode + - npu_grid_assign_positive + - npu_normalize_batch + - npu_masked_fill_range + - npu_linear + - npu_bert_apply_adam + - npu_giou + - npu_ciou + - npu_diou + - npu_sign_bits_pack + - npu_sign_bits_unpack + - npu_flash_attention + - npu_scaled_masked_softmax + - npu_rotary_mul + - npu_roi_align + - npu_roi_alignbk + - npu_ptiou + - npu_fusion_attention + - npu_dropout_with_add_softmax + - npu_random_choice_with_mask + - npu_rotated_iou + - npu_conv2d + - npu_conv3d + - npu_softmax_cross_entropy_with_logits + - npu_all_gather_base_mm + - npu_swiglu + - npu_rms_norm + - npu_mm_reduce_scatter_base + - npu_mm_all_reduce_base + - npu_conv_transpose2d + - npu_convolution + - npu_convolution_transpose + - npu_min + - npu_nms_rotated + - npu_reshape + - npu_rotated_box_decode + - npu_rotated_box_encode + - npu_rotated_overlaps + - npu_silu + - npu_fused_attention_score + - npu_multi_head_attention + - npu_gru + - npu_incre_flash_attention + - npu_prompt_flash_attention + - npu_lstm + - npu_apply_adam + +aten: + - signbit + - logical_not_ + - _foreach_copy_ + - clamp + - hardswish_ + - arcsin_ + - logsumexp + - native_group_norm + - special_i1e + - bitwise_and + - new_full + - fft_ihfft + - _adaptive_avg_pool2d + - scatter_add + - abs + - selu + - exponential + - silu + - _native_batch_norm_legit_functional + - special_hermite_polynomial_h + - tanh_ + - log_sigmoid_forward + - _fft_c2c + - heaviside_ + - sigmoid_backward + - zeros_like + - as_strided_scatter + - trace + - _assert_async + - avg_pool2d_backward + - exp2 + - binary_cross_entropy_backward + - geometric + - fft_ihfftn + - smooth_l1_loss + - multiply + - __lshift__ + - binary_cross_entropy_with_logits + - _embedding_bag + - arange + - linalg_qr + - _embedding_bag_forward_only + - _unsafe_view + - remainder + - cholesky_inverse + - sub_ + - zero + - fix + - xlogy + - __doc__ + - rsqrt_ + - cummin + - __xor__ + - eye + - _fused_adam + - ceil + - nll_loss2d_backward + - replication_pad3d_backward + - fill_ + - logaddexp2 + - _thnn_fused_lstm_cell_backward_impl + - native_dropout + - fft_ifft + - expand + - _cdist_backward + - avg_pool3d_backward + - round_ + - topk + - max_unpool3d + - xlogy_ + - reflection_pad2d_backward + - addcdiv_ + - relu6 + - multilabel_margin_loss_forward + - prelu + - logaddexp + - _cholesky_solve_helper + - _foreach_addcdiv + - arctan_ + - fft_irfftn + - logical_or + - bitwise_or_ + - hardtanh_backward + - uniform + - less_equal + - _foreach_sub + - linalg_cholesky_ex + - hardswish + - fft_fft2 + - sign + - min + - norm + - asin + - addcmul_ + - stft + - col2im + - special_chebyshev_polynomial_u + - adaptive_max_pool3d + - __ilshift__ + - _resize_output + - gather + - lu_unpack + - native_batch_norm_backward + - sigmoid + - sqrt + - new_empty_strided + - _foreach_lerp_ + - mean + - scatter_add_ + - _fft_c2r + - rand_like + - true_divide_ + - gcd_ + - multinomial + - permute + - index_put_ + - arcsinh_ + - log1p_ + - index_add + - atan + - glu_backward + - searchsorted + - fill + - _unsafe_index + - index_reduce_ + - replication_pad2d + - expm1_ + - hardsigmoid + - addmm + - fft_fftn + - fft_ifftshift + - special_modified_bessel_k1 + - fft_rfft + - ge + - _adaptive_avg_pool2d_backward + - argmin + - linalg_lu_factor_ex + - atanh_ + - addmv + - _foreach_sqrt_ + - huber_loss_backward + - 
empty_like + - softshrink + - subtract_ + - bitwise_left_shift_ + - special_modified_bessel_i0 + - _nested_tensor_from_tensor_list + - slice_backward + - special_modified_bessel_i1 + - special_chebyshev_polynomial_t + - conj_physical + - _cdist_forward + - margin_ranking_loss + - max_pool3d_with_indices_backward + - _foreach_reciprocal_ + - lcm + - transpose_ + - cudnn_batch_norm_backward + - reciprocal + - copysign_ + - _foreach_pow + - rad2deg + - _foreach_sqrt + - negative + - replication_pad3d + - atanh + - _linalg_eigh + - igamma_ + - special_i0e + - linalg_ldl_factor_ex + - special_ndtri + - logit + - diagonal_copy + - triu + - silu_ + - polygamma + - square_ + - nextafter_ + - special_scaled_modified_bessel_k0 + - bitwise_not + - var + - mkldnn_rnn_layer_backward + - upsample_bilinear2d + - arctan2 + - clone + - arcsin + - new_ones + - soft_margin_loss + - nan_to_num + - huber_loss + - linalg_lu_solve + - elu_backward + - acosh + - __ior__ + - _unsafe_index_put + - __or__ + - _linalg_slogdet + - arcsinh + - select_scatter + - less_ + - reflection_pad1d + - istft + - reflection_pad2d + - diagonal_backward + - special_entr + - _softmax_backward_data + - randn + - celu + - embedding + - igammac_ + - new_zeros + - native_layer_norm_backward + - nonzero_static + - diagonal_scatter + - grid_sampler_2d + - smooth_l1_loss_backward + - _to_copy + - fft_irfft2 + - relu_ + - fmod + - log1p + - i0 + - mse_loss_backward + - copy + - special_laguerre_polynomial_l + - addmv_ + - quantized_gru + - diag_embed + - acos + - fmod_ + - linalg_cross + - mvlgamma_ + - _foreach_mul + - cummax + - less_equal_ + - ne + - to + - _pdist_forward + - special_xlog1py + - digamma + - lgamma + - mv + - softplus + - special_bessel_y1 + - pin_memory + - logical_xor_ + - cat + - grid_sampler_2d_backward + - frac_ + - dropout + - unsafe_chunk + - masked_fill_ + - log + - negative_ + - _scaled_dot_product_flash_attention + - _amp_foreach_non_finite_check_and_unscale_ + - randn_like + - add + - roll + - threshold + - gcd + - asinh + - round + - t_ + - unfold_backward + - scatter_reduce + - softplus_backward + - bitwise_right_shift_ + - pdist + - select_backward + - relu + - special_bessel_j1 + - asinh_ + - pow + - fft_fftshift + - clamp_max_ + - logical_xor + - index_reduce + - _foreach_add_ + - adaptive_max_pool2d + - adaptive_max_pool3d_backward + - tan + - addbmm_ + - cosh_ + - __rshift__ + - _foreach_maximum + - fft_ifftn + - special_spherical_bessel_j0 + - split_with_sizes + - divide_ + - neg_ + - nll_loss + - _euclidean_dist + - pairwise_distance + - _adaptive_avg_pool3d + - slice + - absolute_ + - gelu_backward + - arccos + - sin + - tril_ + - triu_ + - fft_irfft + - flip + - _foreach_sign + - linalg_householder_product + - _list_to_tensor + - cumprod + - randint_like + - item + - narrow_copy + - tanh + - linalg_vector_norm + - _cudnn_rnn + - _scaled_dot_product_efficient_attention + - _reshape_alias + - _linalg_det + - constant_pad_nd + - _linalg_svd + - sinh_ + - view + - nll_loss_backward + - greater + - sqrt_ + - avg_pool3d + - arctan + - le_ + - _pdist_backward + - _adaptive_avg_pool3d_backward + - log_ + - logical_or_ + - mse_loss + - rrelu_with_noise_backward + - _native_batch_norm_legit + - log10 + - scatter_ + - atan2_ + - greater_equal + - index_select + - __iand__ + - digamma_ + - eq + - divide + - cholesky_solve + - _prelu_kernel + - fft_ifft2 + - _foreach_neg_ + - alias + - erfc_ + - not_equal + - mul + - gru + - _dir + - glu + - clip + - lt + - rsqrt + - avg_pool2d + - conj_physical_ + - 
quantized_lstm + - erfinv_ + - log10_ + - float_power_ + - _functional_assert_async + - hardtanh + - logical_and_ + - _resize_output_ + - clamp_min + - _functional_sym_constrain_range_for_size + - _addmm_activation + - bucketize + - _thnn_fused_lstm_cell + - zeros + - reflection_pad1d_backward + - tan_ + - bitwise_not_ + - addmm_ + - absolute + - as_strided + - special_ndtr + - gt_ + - baddbmm + - special_log_ndtr + - hardshrink + - fft_hfft + - hypot + - native_layer_norm + - _scaled_dot_product_flash_attention_backward + - floor_divide + - is_same_size + - std + - floor_divide_ + - clamp_min_ + - _foreach_sign_ + - std_mean + - tanh_backward + - _foreach_addcmul + - binary_cross_entropy + - threshold_backward + - deg2rad_ + - masked_fill + - linspace + - reflection_pad3d + - mish + - index_copy + - scatter_reduce_ + - _sparse_coo_tensor_with_dims_and_tensors + - __loader__ + - _foreach_div_ + - cosh + - _foreach_maximum_ + - neg + - lift_fresh + - logspace + - selu_ + - leaky_relu_ + - matmul + - _foreach_sub_ + - bitwise_or + - unfold + - fmin + - convolution + - argmax + - maximum + - reflection_pad3d_backward + - fft_fft + - mode + - remainder_ + - _foreach_neg + - erf_ + - special_zeta + - index_add_ + - arccos_ + - lgamma_ + - unsqueeze_ + - gelu_ + - bmm + - _add_relu + - unfold_copy + - not_equal_ + - subtract + - true_divide + - max_pool2d_with_indices_backward + - _native_batch_norm_legit_no_training + - replication_pad1d + - name + - greater_ + - log_normal + - minimum + - alpha_dropout + - rnn_tanh + - _functional_sym_constrain_range + - sum + - _prelu_kernel_backward + - cumsum_ + - ne_ + - _linalg_solve_ex + - native_batch_norm + - igammac + - hypot_ + - exp + - leaky_relu + - new_empty + - cudnn_batch_norm + - resize_as_ + - mm + - triangular_solve + - sign_ + - clamp_max + - bitwise_right_shift + - logical_and + - special_i0 + - index_copy_ + - arctanh_ + - elu + - index + - isposinf + - linalg_solve_triangular + - logcumsumexp + - arccosh + - nan_to_num_ + - nll_loss_forward + - convolution_backward + - sub + - special_scaled_modified_bessel_k1 + - mish_ + - diagonal + - median + - tril + - sgn + - native_group_norm_backward + - stack + - take + - linalg_lu + - log2 + - hardsigmoid_ + - erfc + - max + - native_dropout_backward + - logit_ + - addr + - clip_ + - _foreach_minimum_ + - atan_ + - repeat + - cumprod_ + - bitwise_xor_ + - less + - index_put + - rrelu_with_noise + - addbmm + - special_bessel_y0 + - __and__ + - bernoulli_ + - uniform_ + - log2_ + - mul_ + - adaptive_max_pool2d_backward + - _foreach_addcmul_ + - slice_scatter + - isneginf + - pow_ + - renorm_ + - arccosh_ + - replication_pad1d_backward + - bitwise_and_ + - heaviside + - renorm + - special_modified_bessel_k0 + - le + - is_pinned + - __ixor__ + - leaky_relu_backward + - count_nonzero + - _fused_adam_ + - repeat_interleave + - upsample_bicubic2d + - rsub + - arctan2_ + - frac + - scalar_tensor + - rrelu_with_noise_ + - rot90 + - erf + - lerp_ + - expm1 + - full + - sym_constrain_range_for_size + - prod + - normal_ + - elu_ + - special_airy_ai + - nextafter + - split + - addcdiv + - fft_rfft2 + - max_pool3d_with_indices + - positive + - transpose + - mish_backward + - clamp_ + - exp_ + - _foreach_reciprocal + - linalg_matrix_exp + - unsqueeze + - upsample_nearest2d + - sinc_ + - select + - rad2deg_ + - trunc_ + - _make_dep_token + - nanmedian + - fft_hfftn + - hardtanh_ + - sym_constrain_range + - index_fill_ + - deg2rad + - rand + - sinc + - pixel_shuffle + - tril_indices + - copy_ + - _int_mm + - 
greater_equal_ + - celu_ + - div + - igamma + - exp2_ + - cos + - log_normal_ + - _log_softmax_backward_data + - im2col + - reciprocal_ + - amax + - broadcast_tensors + - erfinv + - __spec__ + - _fused_dropout + - special_hermite_polynomial_he + - aminmax + - rnn_relu + - meshgrid + - var_mean + - eq_ + - upsample_nearest3d + - dot + - zero_ + - floor_ + - fft_rfftn + - special_erfcx + - _foreach_div + - fft_hfft2 + - _upsample_bilinear2d_aa + - sort + - log_sigmoid_backward + - add_ + - copysign + - bernoulli + - special_bessel_j0 + - max_pool2d_with_indices + - _scaled_dot_product_efficient_attention_backward + - t + - _softmax + - arctanh + - hinge_embedding_loss + - hardswish_backward + - fmax + - multiply_ + - floor + - lstm + - i0_ + - cholesky + - where + - __irshift__ + - addcmul + - embedding_dense_backward + - sigmoid_ + - fix_ + - ormqr + - exponential_ + - __name__ + - fft_ihfft2 + - logical_not + - ones + - sgn_ + - sinh + - any + - _foreach_addcdiv_ + - asin_ + - gt + - lift + - squeeze + - grid_sampler_3d_backward + - atan2 + - _fft_r2c + - angle + - silu_backward + - acosh_ + - abs_ + - lerp + - special_i1 + - complex + - ceil_ + - _foreach_minimum + - hardsigmoid_backward + - upsample_nearest1d + - mvlgamma + - acos_ + - lt_ + - grid_sampler_3d + - max_unpool2d + - ones_like + - soft_margin_loss_backward + - _fused_moving_avg_obs_fq_helper + - isnan + - nansum + - baddbmm_ + - amin + - isinf + - bitwise_left_shift + - unsafe_split_with_sizes + - full_like + - sin_ + - bitwise_xor + - linalg_ldl_solve + - cos_ + - div_ + - polar + - randint + - trunc + - __package__ + - nll_loss2d_forward + - diag + - argsort + - _foreach_mul_ + - square + - detach + - affine_grid_generator + - _pin_memory + - geometric_ + - unbind + - randperm + - upsample_nearest2d_backward + - all + - threshold_ + - unsafe_split + - cauchy + - normal + - linalg_inv_ex + - multi_margin_loss + - cumsum + - gelu + - index_fill + - scatter + - mkldnn_rnn_layer + - ge_ + - dist + - _foreach_add + - logit_backward + - triu_indices + - lcm_ + - empty_strided + - replication_pad2d_backward + - cauchy_ + - _log_softmax + - vdot + +distributed: + - send + - recv + - broadcast + - all_reduce + - reduce + - all_gather + - gather + - isend + - irecv + - scatter + - reduce_scatter + - _reduce_scatter_base + - _all_gather_base + - all_to_all_single \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_aten.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_aten.py new file mode 100644 index 0000000000000000000000000000000000000000..8666287095bbe12f7e9d5f314cff1db75d74a108 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_aten.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
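The `aten` and `distributed` lists above close out `support_wrap_ops.yaml`; the `wrap_*` modules that follow all consume it the same way, intersecting each list with what the local torch build actually exposes so missing ops are skipped rather than hooked blindly. A minimal standalone sketch of that gating, with plain `open` standing in for the repository's `FileOpen` checker:

```python
import torch
import yaml

# An op is hooked only if it is listed in support_wrap_ops.yaml AND present
# in the local torch build; anything else is silently skipped.
with open("support_wrap_ops.yaml", "r") as f:
    ops_cfg = yaml.safe_load(f)

hookable_aten = set(ops_cfg.get("aten", [])) & set(dir(torch.ops.aten))
print(f"{len(hookable_aten)} aten ops are eligible for wrapping")
```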
+""" + +import os +import torch + +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, Const +from ..common.file_check import FileOpen + + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapAtenOps = yaml.safe_load(f).get('aten') + + +aten_func = {} +for f in dir(torch.ops.aten): + aten_func[f] = getattr(torch.ops.aten, f) + + +def get_aten_ops(): + global WrapAtenOps + _all_aten_ops = dir(torch.ops.aten) + return set(WrapAtenOps) & set(_all_aten_ops) + + +class HOOKAtenOP(object): + pass + + +class AtenOPTemplate(HOOKModule): + def __init__(self, op, hook): + if isinstance(op, torch._ops.OpOverloadPacket): + op_name_ = op._qualified_op_name.split("::")[-1] + else: + op_name_ = op.name().split("::")[-1] + overload_name = op._overloadname + if not '.' + overload_name in op_name_: + op_name_ = op_name_ + '.' + overload_name + self.op = op + self.prefix_op_name_ = "Aten" + Const.SEP + str(op_name_) + Const.SEP + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return self.op(*args, **kwargs) + + +class AtenOPPacketTemplate(): + def __init__(self, opPacket, hook): + self.opPacket = opPacket + self.hook = hook + + def __getattr__(self, key): + try: + attr = getattr(self.opPacket, key) + except AttributeError as e: + raise AttributeError(f"AtenOPPacketTemplate or OpOverloadPacket does not have attribute '{key}'.") from e + if isinstance(attr, torch._ops.OpOverload): + return AtenOPTemplate(attr, self.hook) + else: + return attr + + def overloads(self): + return self.opPacket.overloads() + + @torch_device_guard + def __call__(self, *args, **kwargs): + return AtenOPTemplate(self.opPacket, self.hook)(*args, **kwargs) + + +def wrap_aten_op(op, hook): + return AtenOPPacketTemplate(op, hook) + + +def wrap_aten_ops_and_bind(hook): + _aten_ops = get_aten_ops() + for op_name in _aten_ops: + if not isinstance(aten_func.get(op_name), torch._ops.OpOverloadPacket): + continue + setattr(HOOKAtenOP, "wrap_" + str(op_name), wrap_aten_op(aten_func.get(op_name), hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_distributed.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..68ce83c16b8414f43e61b1a667f8cb7c27899a10 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_distributed.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import os +from functools import wraps +import torch.distributed as dist +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, Const +from ..common.file_check import FileOpen + + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapDistributedOps = yaml.safe_load(f).get('distributed') + + +distributed_func = {} +for f in dir(dist): + distributed_func[f] = getattr(dist, f) + + +def get_distributed_ops(): + global WrapDistributedOps + _all_distributed_ops = dir(dist) + return set(WrapDistributedOps) & set(_all_distributed_ops) + + +class HOOKDistributedOP(object): + pass + + +class DistributedOPTemplate(HOOKModule): + def __init__(self, op_name, build_hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Distributed" + Const.SEP + str(op_name) + Const.SEP + super().__init__(build_hook) + if not self.stop_hook and self.op_name_ in Const.INPLACE_LIST: + self.op_is_inplace = True + + @torch_device_guard + def forward(self, *args, **kwargs): + return distributed_func.get(self.op_name_)(*args, **kwargs) + + +def wrap_distributed_op(op_name, hook): + @wraps(DistributedOPTemplate) + def distributed_op_template(*args, **kwargs): + return DistributedOPTemplate(op_name, hook)(*args, **kwargs) + + distributed_op_template.__name__ = op_name + return distributed_op_template + + +def wrap_distributed_ops_and_bind(hook): + _distributed_ops = get_distributed_ops() + for op_name in _distributed_ops: + setattr(HOOKDistributedOP, "wrap_" + str(op_name), wrap_distributed_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_functional.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..46f25efe664fca2bff917b93e3e0632398bdc74e --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_functional.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, Const +from ..common.log import print_info_log_rank_0 +from ..common.file_check import FileOpen + + +def remove_dropout(): + if torch.__version__ > "1.8": + print_info_log_rank_0("For precision comparison, the probability p in the dropout method is set to 0.") + import torch.nn.functional as F + from torch import _VF + from torch.overrides import has_torch_function_unary, handle_torch_function + + def function_dropout(input: torch.Tensor, p: float = 0.5, training: bool = True, + inplace: bool = False) -> torch.Tensor: + if has_torch_function_unary(input): + return handle_torch_function(function_dropout, (input,), input, p=0., training=training, inplace=inplace) + if p < 0.0 or p > 1.0: + raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) + return _VF.dropout_(input, 0., training) if inplace else _VF.dropout(input, 0., training) + + + def function_dropout2d(input: torch.Tensor, p: float = 0.5, training: bool = True, + inplace: bool = False) -> torch.Tensor: + if has_torch_function_unary(input): + return handle_torch_function(function_dropout2d, (input,), input, p=0., training=training, inplace=inplace) + if p < 0.0 or p > 1.0: + raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) + return _VF.feature_dropout_(input, 0., training) if inplace else _VF.feature_dropout(input, 0., training) + + + def function_dropout3d(input: torch.Tensor, p: float = 0.5, training: bool = True, + inplace: bool = False) -> torch.Tensor: + if has_torch_function_unary(input): + return handle_torch_function(function_dropout3d, (input,), input, p=0., training=training, inplace=inplace) + if p < 0.0 or p > 1.0: + raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) + return _VF.feature_dropout_(input, 0., training) if inplace else _VF.feature_dropout(input, 0., training) + + F.dropout = function_dropout + F.dropout2d = function_dropout2d + F.dropout3d = function_dropout3d + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapFunctionalOps = yaml.safe_load(f).get('functional') + + +def get_functional_ops(): + global WrapFunctionalOps + _all_functional_ops = dir(torch.nn.functional) + return set(WrapFunctionalOps) & set(_all_functional_ops) + + +TorchFunctions = {func: getattr(torch.nn.functional, func) for func in get_functional_ops()} + + +class HOOKFunctionalOP(object): + pass + + +class FunctionalOPTemplate(HOOKModule): + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Functional" + Const.SEP + str(op_name) + Const.SEP + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return TorchFunctions[str(self.op_name_)](*args, **kwargs) + + +def wrap_functional_op(op_name, hook): + def functional_op_template(*args, **kwargs): + return FunctionalOPTemplate(op_name, hook)(*args, **kwargs) + + return functional_op_template + + +def wrap_functional_ops_and_bind(hook): + _functional_ops = get_functional_ops() + for op_name in _functional_ops: + setattr(HOOKFunctionalOP, "wrap_" + op_name, wrap_functional_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_npu_custom.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_npu_custom.py new file mode 100644 index 
0000000000000000000000000000000000000000..e910e609c8379e0c66239755c3ec2a44953ef1ec --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_npu_custom.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os +import torch +import torch_npu +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, torch_without_guard_version, Const +from ..common.file_check import FileOpen + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapNpuOps = yaml.safe_load(f).get('torch_npu') + + +def get_npu_ops(): + global WrapNpuOps + if torch_without_guard_version: + _npu_ops = dir(torch.ops.npu) + else: + _npu_ops = dir(torch_npu._C._VariableFunctionsClass) + return set(WrapNpuOps) & set(_npu_ops) + + +class HOOKNpuOP(object): + pass + + +class NpuOPTemplate(HOOKModule): + + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "NPU" + Const.SEP + str(op_name) + Const.SEP + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + if torch_without_guard_version: + return getattr(torch.ops.npu, str(self.op_name_))(*args, **kwargs) + else: + return getattr(torch_npu._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) + + +def wrap_npu_op(op_name, hook): + + def npu_op_template(*args, **kwargs): + return NpuOPTemplate(op_name, hook)(*args, **kwargs) + + return npu_op_template + + +def wrap_npu_ops_and_bind(hook): + _npu_ops = get_npu_ops() + for op_name in _npu_ops: + setattr(HOOKNpuOP, "wrap_" + str(op_name), wrap_npu_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_tensor.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..6b49826ab4712d440b4933651eb6b7eab950d023 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_tensor.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
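`NpuOPTemplate.forward` above picks its dispatch namespace from the `torch_without_guard_version` flag: newer torch_npu builds register custom ops under `torch.ops.npu`, while older ones expose them on `torch_npu._C._VariableFunctionsClass`. A standalone resolver mirroring that branch, written so it imports `torch_npu` only when actually needed:

```python
import torch

def resolve_npu_op(op_name, without_guard_version):
    """Mirror of NpuOPTemplate.forward's dispatch, for inspection only."""
    if without_guard_version:
        # newer torch_npu builds register custom ops under torch.ops.npu
        return getattr(torch.ops.npu, op_name)
    import torch_npu  # deferred: only importable on Ascend environments
    return getattr(torch_npu._C._VariableFunctionsClass, op_name)
```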
+""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, parameter_adapter, Const +from ..common.file_check import FileOpen + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapTensorOps = yaml.safe_load(f).get('tensor') + + +def get_tensor_ops(): + global WrapTensorOps + _tensor_ops = dir(torch.Tensor) + return set(WrapTensorOps) & set(_tensor_ops) + + +TensorOps = {op: getattr(torch.Tensor, op) for op in get_tensor_ops()} + + +class HOOKTensor(object): + pass + + +class TensorOPTemplate(HOOKModule): + + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Tensor" + Const.SEP + str(op_name) + Const.SEP + super().__init__(hook) + + @torch_device_guard + @parameter_adapter + def forward(self, *args, **kwargs): + return TensorOps[str(self.op_name_)](*args, **kwargs) + + +def wrap_tensor_op(op_name, hook): + + def tensor_op_template(*args, **kwargs): + return TensorOPTemplate(op_name, hook)(*args, **kwargs) + + return tensor_op_template + + +def wrap_tensor_ops_and_bind(hook): + _tensor_ops = get_tensor_ops() + for op_name in _tensor_ops: + setattr(HOOKTensor, "wrap_" + str(op_name), wrap_tensor_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_torch.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..889512e9c0c64d9d05dc19cbc30e542c6e5b577c --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_torch.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, Const +from ..common.file_check import FileOpen + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapTorchOps = yaml.safe_load(f).get('torch') + + +def get_torch_ops(): + global WrapTorchOps + _torch_ops = [] + for operation in WrapTorchOps: + if '.' in operation: + operation_sub_module_name, operation_sub_op = operation.rsplit('.', 1) + operation_sub_module = getattr(torch, operation_sub_module_name) + if operation_sub_op in dir(operation_sub_module): + _torch_ops.append(operation) + else: + if hasattr(torch, operation): + _torch_ops.append(operation) + return set(_torch_ops) + + +TorchOps = {} +for op in get_torch_ops(): + if '.' 
in op: + sub_module_name, sub_op = op.rsplit('.', 1) + sub_module = getattr(torch, sub_module_name) + TorchOps[op] = getattr(sub_module, sub_op) + else: + TorchOps[op] = getattr(torch, op) + + + +class HOOKTorchOP(object): + pass + + +class TorchOPTemplate(HOOKModule): + + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Torch" + Const.SEP + str(op_name) + Const.SEP + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return TorchOps[str(self.op_name_)](*args, **kwargs) + + +def wrap_torch_op(op_name, hook): + + def torch_op_template(*args, **kwargs): + return TorchOPTemplate(op_name, hook)(*args, **kwargs) + + return torch_op_template + + +def wrap_torch_ops_and_bind(hook): + _torch_ops = get_torch_ops() + for op_name in _torch_ops: + setattr(HOOKTorchOP, "wrap_" + op_name, wrap_torch_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_vf.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_vf.py new file mode 100644 index 0000000000000000000000000000000000000000..08d47308e077981e65193eea71874d4f9432c6c0 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_vf.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
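`get_torch_ops` and the `TorchOps` table above support dotted YAML entries by walking one submodule level with `rsplit('.', 1)`. A compact resolver showing the same lookup; `linalg.norm` is an illustrative entry, not necessarily one from the YAML list:

```python
import torch

def resolve_torch_op(qualified_name):
    # 'linalg.norm' resolves via the torch.linalg submodule,
    # bare names like 'matmul' come straight off torch.
    if "." in qualified_name:
        sub_module_name, sub_op = qualified_name.rsplit(".", 1)
        return getattr(getattr(torch, sub_module_name), sub_op)
    return getattr(torch, qualified_name)

assert resolve_torch_op("linalg.norm") is torch.linalg.norm
assert resolve_torch_op("matmul") is torch.matmul
```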
+""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, Const +from ..common.file_check import FileOpen + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapVfOps = yaml.safe_load(f).get('_VF') + + +def get_vf_ops(): + global WrapVfOps + # _all_functional_ops = dir(torch.nn.functional) + # assert set(WrapFunctionalOps) <= set(_all_functional_ops) + return WrapVfOps + + +class HOOKVfOP(object): + pass + + +class VfOPTemplate(HOOKModule): + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "VF" + Const.SEP + str(op_name) + Const.SEP + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return getattr(torch._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) + + +def wrap_vf_op(op_name, hook): + def vf_op_template(*args, **kwargs): + return VfOPTemplate(op_name, hook)(*args, **kwargs) + + return vf_op_template + + +def wrap_vf_ops_and_bind(hook): + _vf_ops = get_vf_ops() + for op_name in _vf_ops: + setattr(HOOKVfOP, "wrap_" + op_name, wrap_vf_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/module_processer.py b/debug/accuracy_tools/atat/pytorch/module_processer.py new file mode 100644 index 0000000000000000000000000000000000000000..fda3d37bc92360fc104d761e78b13cfc793995bc --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/module_processer.py @@ -0,0 +1,98 @@ +from functools import wraps +import torch +from torch.utils.hooks import BackwardHook +from .functional.scope import ModuleRangeScope +from .common.utils import Const + + +class ModuleProcesser: + module_stack = [] + api_parent_node = "" + module_node = {} + current_module_name = "" + + def __init__(self, scope): + if isinstance(scope, ModuleRangeScope): + self.scope = scope + else: + self.scope = None + BackwardHook.setup_input_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_input_hook) + BackwardHook.setup_output_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_output_hook) + BackwardHook.setup_output_hook = ModuleProcesser.filter_tensor_and_tuple(BackwardHook.setup_output_hook) + self.module_count = {} + + @staticmethod + def filter_tensor_and_tuple(func): + @wraps(func) + def wrap_by_filter_tensor_and_tuple(*args, **kwargs): + # setup_output_hook传入非tensor数据,工具后续dump会报错,处理方式是非tensor数据不传入 + # setup_output_hook定义为setup_output_hook(self, args),因此处理第二个位置参数,即*args[1] + if not isinstance(args[1], (torch.Tensor, tuple)): + return args[1] + return func(*args, **kwargs) + + return wrap_by_filter_tensor_and_tuple + + @staticmethod + def clone_return_value(func): + @wraps(func) + def clone_return_value_func(*args, **kwargs): + result = func(*args, **kwargs) + return ModuleProcesser.clone_if_tensor(result) + + return clone_return_value_func + + @staticmethod + def clone_if_tensor(result): + if isinstance(result, torch.Tensor): + return result.clone() + elif isinstance(result, tuple): + return tuple(ModuleProcesser.clone_if_tensor(x) for x in result) + elif isinstance(result, list): + return list(ModuleProcesser.clone_if_tensor(x) for x in result) + elif isinstance(result, dict): + return {k: ModuleProcesser.clone_if_tensor(v) for k, v in result.items()} + else: + return result + + def node_hook(self, name_prefix, start_or_stop, **kwargs): + + def pre_hook(module, input, output=None): + try: + index = self.module_count_func(name_prefix) 
+ except IndexError: + # module_count_func only touches a dict, so this is a purely defensive fallback + index = None + module.mindstudio_reserved_name = full_name = name_prefix + Const.SEP + str(index) + if self.module_stack: + ModuleProcesser.module_node[full_name] = self.module_stack[-1] + else: + ModuleProcesser.module_node[full_name] = None + + ModuleProcesser.module_stack.append(full_name) + if self.module_stack: + ModuleProcesser.api_parent_node = self.module_stack[-1] + if self.scope: + self.scope.begin_module(full_name) + + def end_hook(module, input, output=None): + if self.module_stack: + ModuleProcesser.module_stack.pop() + if self.module_stack: + ModuleProcesser.api_parent_node = self.module_stack[-1] + else: + ModuleProcesser.api_parent_node = None + if self.scope: + self.scope.end_module(module.mindstudio_reserved_name) + + if Const.START in start_or_stop: + return pre_hook + else: + return end_hook + + def module_count_func(self, module_name): + if module_name not in self.module_count: + self.module_count[module_name] = 0 + else: + self.module_count[module_name] += 1 + return self.module_count[module_name] diff --git a/debug/accuracy_tools/atat/pytorch/overflow_check/__init__.py b/debug/accuracy_tools/atat/pytorch/overflow_check/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/overflow_check/info_dump.py b/debug/accuracy_tools/atat/pytorch/overflow_check/info_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..161e9f23f0fb7c3a9c09bb5e7697eb9a7dfaef15 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/overflow_check/info_dump.py @@ -0,0 +1,252 @@ +import inspect +import fcntl +import os +import threading + +import json +import numpy as np +import torch + +from atat.core.file_check_util import FileOpen, FileCheckConst, change_mode +from atat.core.utils import get_time +from ..common.utils import print_error_log + + +special_torch_object = ["memory_format"] +lock = threading.Lock() + + +def write_npy(file_path, tensor): + saved_tensor = tensor.contiguous().cpu().detach() + if tensor.dtype == torch.bfloat16: + saved_numpy = saved_tensor.to(torch.float32).numpy() + else: + saved_numpy = saved_tensor.numpy() + if os.path.exists(file_path): + raise ValueError(f"File {file_path} already exists") + np.save(file_path, saved_numpy) + full_path = os.path.abspath(file_path) + return full_path + + +class APIInfo: + def __init__(self, api_name, is_forward, save_real_data=False): + self.rank = os.getpid() + self.api_name = api_name + self.save_real_data = save_real_data + self.torch_object_key = {'device': self.analyze_device_in_kwargs, 'dtype': self.analyze_dtype_in_kwargs} + self.is_forward = is_forward + self.args_num = 0 + + def analyze_element(self, element): + if isinstance(element, (list, tuple)): + out = [] + for item in element: + out.append(self.analyze_element(item)) + return out + elif isinstance(element, dict): + out_dict = {} + for key, value in element.items(): + if key in self.torch_object_key.keys(): + fun = self.torch_object_key[key] + out_dict[key] = fun(value) + elif key in special_torch_object: + continue + else: + out_dict[key] = self.analyze_element(value) + return out_dict + elif isinstance(element, torch.Tensor): + out_tensor = self.analyze_tensor(element, self.save_real_data) + return out_tensor + elif self.is_builtin_class(element): + out_builtin = self.analyze_builtin(element) + return out_builtin + else: + msg = f"Type {type(element)} is unsupported at analyze_element" +
print_error_log(msg) + + raise NotImplementedError(msg) + + def analyze_tensor(self, arg, save_real_data): + single_arg = {} + if not save_real_data: + single_arg.update({'type': 'torch.Tensor'}) + single_arg.update({'dtype': str(arg.dtype)}) + single_arg.update({'shape': arg.shape}) + single_arg.update({'Max': self.transfer_types(self.get_tensor_extremum(arg, 'max'), str(arg.dtype))}) + single_arg.update({'Min': self.transfer_types(self.get_tensor_extremum(arg, 'min'), str(arg.dtype))}) + single_arg.update({'requires_grad': arg.requires_grad}) + + else: + dump_path = "./" + api_args = self.api_name + '.' + str(self.args_num) + rank = arg.device.index + if self.is_forward: + forward_real_data_path = os.path.join(dump_path, "forward_real_data_" + get_time(), f"rank{rank}") + if not os.path.exists(forward_real_data_path): + os.makedirs(forward_real_data_path, 0o755) + + file_path = os.path.join(forward_real_data_path, f'{api_args}.npy') + else: + backward_real_data_path = os.path.join(dump_path, "backward_real_data_" + get_time(), f"rank{rank}") + if not os.path.exists(backward_real_data_path): + os.makedirs(backward_real_data_path, 0o755) + file_path = os.path.join(backward_real_data_path, f'{api_args}.npy') + self.args_num += 1 + npy_path = write_npy(file_path, arg) + single_arg.update({'type': 'torch.Tensor'}) + single_arg.update({'datapath': npy_path}) + single_arg.update({'requires_grad': arg.requires_grad}) + return single_arg + + def analyze_builtin(self, arg): + single_arg = {} + if isinstance(arg, slice): + single_arg.update({'type': "slice"}) + single_arg.update({'value': [arg.start, arg.stop, arg.step]}) + else: + single_arg.update({'type': self.get_type_name(str(type(arg)))}) + single_arg.update({'value': arg}) + return single_arg + + def transfer_types(self, data, dtype): + if 'int' in dtype or 'bool' in dtype: + return int(data) + else: + return float(data) + + def is_builtin_class(self, element): + if element is None or isinstance(element, (bool, int, float, str, slice)): + return True + return False + + def analyze_device_in_kwargs(self, element): + single_arg = {} + single_arg.update({'type': 'torch.device'}) + if not isinstance(element, str): + + if hasattr(element, "index"): + device_value = element.type + ":" + str(element.index) + else: + device_value = element.type + single_arg.update({'value': device_value}) + else: + single_arg.update({'value': element}) + return single_arg + + def analyze_dtype_in_kwargs(self, element): + single_arg = {} + single_arg.update({'type': 'torch.dtype'}) + single_arg.update({'value': str(element)}) + return single_arg + + def get_tensor_extremum(self, data, operator): + if data.dtype is torch.bool: + if operator == 'max': + return True in data + elif operator == 'min': + return False not in data + if operator == 'max': + return torch._C._VariableFunctionsClass.max(data).item() + else: + return torch._C._VariableFunctionsClass.min(data).item() + + def get_type_name(self, name): + + left = name.index("'") + right = name.rindex("'") + return name[left + 1: right] + + +class ForwardAPIInfo(APIInfo): + def __init__(self, name, save_real_data, args, kwargs): + super().__init__(name, is_forward=True, save_real_data=save_real_data) + self.analyze_api_input(args, kwargs) + self.analyze_api_call_stack() + + def analyze_api_input(self, args, kwargs): + args_info_list = self.analyze_element(args) + kwargs_info_dict = self.analyze_element(kwargs) + self.api_info_struct = {self.api_name: {"args": args_info_list, "kwargs": kwargs_info_dict}} + + def
analyze_api_call_stack(self): + stack_str = [] + for (_, path, line, func, code, _) in inspect.stack()[3:]: + if not code: + continue + stack_line = " ".join([ + "File", ", ".join([path, " ".join(["line", str(line)]), " ".join(["in", func]), + " ".join(["\n", code[0].strip()])])]) + stack_str.append(stack_line) + self.stack_info_struct = {self.api_name: stack_str} + + +class BackwardAPIInfo(APIInfo): + def __init__(self, name, grads): + super().__init__(name, is_forward=False) + self.analyze_api_input(grads) + + def analyze_api_input(self, grads): + grads_info_list = self.analyze_element(grads) + self.grad_info_struct = {self.api_name: grads_info_list} + + +def write_api_info_json(api_info): + dump_path = "./" + rank = api_info.rank + if isinstance(api_info, ForwardAPIInfo): + file_path = os.path.join(dump_path, f'forward_info_{rank}.json') + stack_file_path = os.path.join(dump_path, f'stack_info_{rank}.json') + write_json(file_path, api_info.api_info_struct) + write_json(stack_file_path, api_info.stack_info_struct, indent=4) + + elif isinstance(api_info, BackwardAPIInfo): + file_path = os.path.join(dump_path, f'backward_info_{rank}.json') + write_json(file_path, api_info.grad_info_struct) + else: + raise ValueError(f"Invalid api_info type {type(api_info)}") + + +def write_json(file_path, data, indent=None): + if not os.path.exists(file_path): + with FileOpen(file_path, 'w') as f: + f.write("{\n}") + change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) + lock.acquire() + with FileOpen(file_path, 'a+') as f: + fcntl.flock(f, fcntl.LOCK_EX) + try: + f.seek(0, os.SEEK_END) + f.seek(f.tell() - 1, os.SEEK_SET) + f.truncate() + if f.tell() > 3: + f.seek(f.tell() - 1, os.SEEK_SET) + f.truncate() + f.write(',\n') + f.write(json.dumps(data, indent=indent)[1:-1] + '\n}') + except Exception as e: + raise ValueError(f"Json save failed:{e}") from e + finally: + fcntl.flock(f, fcntl.LOCK_UN) + lock.release() + + +def initialize_output_json(): + dump_path = os.path.realpath("./") + files = ['forward_info.json', 'backward_info.json', 'stack_info.json'] + + forward_real_data_path = os.path.join(dump_path, 'forward_real_data') + if os.path.exists(forward_real_data_path): + raise ValueError(f"file {forward_real_data_path} already exists, please remove it first") + else: + os.mkdir(forward_real_data_path, mode=0o750) + + backward_real_data_path = os.path.join(dump_path, 'backward_real_data') + if os.path.exists(backward_real_data_path): + raise ValueError(f"file {backward_real_data_path} already exists, please remove it first") + else: + os.mkdir(backward_real_data_path, mode=0o750) + for file in files: + file_path = os.path.join(dump_path, file) + if os.path.exists(file_path): + raise ValueError(f"file {file_path} already exists, please remove it first or use a new dump path") \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/overflow_check/overflow_check.py b/debug/accuracy_tools/atat/pytorch/overflow_check/overflow_check.py new file mode 100644 index 0000000000000000000000000000000000000000..f8f9926b6cd2bab4a347260e0126f551297aec8b --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/overflow_check/overflow_check.py @@ -0,0 +1,190 @@ +import os +from pathlib import Path + +import torch + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +from atat.core.file_check_util import FileCheckConst +from atat.core.utils import print_warn_log, get_time, print_info_log +from ..dump.dump import forward_init_status, forward_acl_dump +from .utils 
import OverFlowUtil, dump_overflow, check_overflow_npu, clear_overflow_npu +from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist, check_single_rank_folder +from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo +from ..dump import dump + +backward_init_status = False +api_overflow = [] +forward_api_info = {} +backward_api_info = {} +FORWARD_REAL_DATA_PATH = os.path.join('./', 'forward_real_data') +BACKWARD_REAL_DATA_PATH = os.path.join('./', 'backward_real_data') +rank = os.getpid() +pkl_name = '' + + +def check_overflow_environment(pid): + if not OverFlowUtil.get_overflow_check_switch(): + return False + if pid != os.getpid(): + return False + if is_gpu: + print_warn_log("Overflow detection is not supported in the GPU environment.") + return False + global backward_init_status + if backward_init_status or forward_init_status: + return False + return True + + +def check_data_overflow(x): + if isinstance(x, (tuple, list)) and x: + for item in x: + if check_data_overflow(item): + return True + return False + else: + if isinstance(x, torch.Tensor) and x.numel() != 0 and x.dtype != torch.bool: + if x.is_meta: + return False + if len(x.shape) == 0: + tensor_max = x.cpu().detach().float().numpy().tolist() + tensor_min = tensor_max + else: + tensor_max = torch._C._VariableFunctionsClass.max(x).cpu().detach().float().numpy().tolist() + tensor_min = torch._C._VariableFunctionsClass.min(x).cpu().detach().float().numpy().tolist() + # inf: either a literal +/-inf or saturation at the dtype's finite extremes + if tensor_max == float('inf') or tensor_min == float('-inf'): + return True + if x.dtype in [torch.float16, torch.float32, torch.bfloat16] and \ + (tensor_max == torch.finfo(x.dtype).max or tensor_min == torch.finfo(x.dtype).min): + return True + # nan: a value is NaN iff it compares unequal to itself + elif tensor_max != tensor_max or tensor_min != tensor_min: + return True + else: + return False + elif isinstance(x, (bool, int, float)): + if x == float('inf') or x == float('-inf') or x != x: + return True + else: + return False + else: + return False + + +def check_path(apis, path): + return any(api in path for api in apis) + + +def overflow_check(name, **kwargs): + overflow_nums = OverFlowUtil.overflow_nums + pid = kwargs.get('pid') + dump_mode = DumpUtil.dump_switch_mode + if not pid: + raise RuntimeError("Failed to get the specified process pid.") + + def overflowcheck_hook(module, in_feat, out_feat=None): + if not check_overflow_environment(pid): + return + dump_file = DumpUtil.get_dump_path() + global rank + dump_dir, dump_filename = os.path.split(dump_file) + dump_dir = os.path.join(dump_dir, "step{}".format(DumpUtil.iter_num)) + if not os.path.exists(dump_dir): + Path(dump_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + if DumpUtil.is_single_rank is None: + DumpUtil.is_single_rank = check_single_rank_folder(dump_dir) + dump_file = os.path.join(dump_dir, dump_filename) + rank_this = get_tensor_rank(in_feat, out_feat) + DumpUtil.dump_root = os.path.dirname(DumpUtil.dump_path) + if rank_this is not None and rank != rank_this: + rank = rank_this + dump.rename_() + if DumpUtil.target_rank is not None: + if rank != DumpUtil.target_rank: + return + dump_path = create_dirs_if_not_exist(rank, dump_file) + global pkl_name + pkl_name = dump_path + dump_dir = os.path.split(dump_path)[0] + global api_overflow + global forward_api_info + global backward_api_info + + module_name = name + if hasattr(torch_npu._C, '_npu_is_support_inf_nan') and torch_npu._C._npu_is_support_inf_nan(): + # backward API
endwith backward + if module_name.endswith(Const.BACKWARD): + check_feat = in_feat + else: + check_feat = out_feat + module.has_overflow = check_data_overflow(check_feat) + else: + module.has_overflow = check_overflow_npu() + if not module.has_overflow: + if hasattr(module, 'input_args'): + del module.input_args + if hasattr(module, 'input_kwargs'): + del module.input_kwargs + if module.has_overflow and OverFlowUtil.check_overflow_dump_times(overflow_nums): + if overflow_type_judge(in_feat, out_feat, module_name) and DumpUtil.need_replicate: + if module_name.endswith(Const.FORWARD): + forward_api_info.update({name: ForwardAPIInfo(name, True, module.input_args, module.input_kwargs)}) + api_overflow.append(module_name) + else: + api_overflow.append(module_name.replace("backward", "forward")) + backward_api_info.update({name: BackwardAPIInfo(name, out_feat)}) + OverFlowUtil.inc_overflow_dump_times() + dump_file_name = os.path.join(dump_dir, + "{}_{}.pkl".format(module_name, OverFlowUtil.real_overflow_dump_times)) + dump_overflow(module_name, in_feat, out_feat, dump_file_name) + dump.pkl_name = dump_file_name + + print_warn_log("[overflow {} times]: module name :'{}' is overflow and dump file is saved in '{}'." + .format(OverFlowUtil.real_overflow_dump_times, module_name, + os.path.realpath(dump_file_name))) + if dump_mode == "acl": + acl_dump(module, module_name) + dump.write_to_disk() + # clear overflow flag for the next check + clear_overflow_npu() + if not OverFlowUtil.check_overflow_dump_times(overflow_nums): + for key in forward_api_info: + write_api_info_json(forward_api_info[key]) + for key in backward_api_info: + write_api_info_json(backward_api_info[key]) + raise ValueError("[overflow {} times]: dump file is saved in '{}'." + .format(OverFlowUtil.real_overflow_dump_times, os.path.realpath(dump_file_name))) + + def overflow_type_judge(in_feat, out_feat, module_name): + if module_name.endswith(Const.BACKWARD): + check_feat = out_feat + else: + check_feat = in_feat + if check_data_overflow(check_feat): + print_warn_log("module name :'{}' is overflow and its inputs already has an overflow, so you need " + "to go back to find where the overflow started.".format(module_name)) + return False + elif not check_data_overflow(in_feat) and not check_data_overflow(out_feat): + print_warn_log("module name :'{}' is overflow and its inputs and outputs do not overflow, " + "so this is a process overflow".format(module_name)) + return False + else: + print_warn_log("module name :'{}' is overflow. Its input is normal and its output " + "is overflow.".format(module_name)) + return True + + def acl_dump(module, module_name): + if "forward" in module_name: + forward_acl_dump(module, module_name) + if "backward" in module_name: + print_info_log("The overflow is caused by backward operator {}. 
" + "You can use reverse acl dump(mode='acl') to get operator dump data.".format(module_name)) + + return overflowcheck_hook diff --git a/debug/accuracy_tools/atat/pytorch/overflow_check/utils.py b/debug/accuracy_tools/atat/pytorch/overflow_check/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d254d5845505fb2ae0c41c56ac9a0e1d9225ba87 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/overflow_check/utils.py @@ -0,0 +1,114 @@ +import os +import torch + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +from atat.core.utils import check_switch_valid, check_inplace_op, OverflowConst +from ..common.utils import Const +from ..dump.dump import dump_stack_info, get_scalar_data_info, dump_data_by_rank_count, \ + get_not_float_tensor_info, get_float_tensor_info +from ..dump.utils import DumpUtil, make_dump_data_dir + + +class OverFlowUtil(object): + overflow_check_switch = None + overflow_filter_switch = Const.OFF + real_overflow_dump_times = 0 + overflow_nums = 1 + + @staticmethod + def set_overflow_check_switch(switch, filter_switch): + OverFlowUtil.overflow_check_switch = switch + OverFlowUtil.overflow_filter_switch = filter_switch + + @staticmethod + def get_overflow_check_switch(): + if OverFlowUtil.overflow_check_switch is None: + return True + return OverFlowUtil.overflow_check_switch == "ON" + + @staticmethod + def inc_overflow_dump_times(): + OverFlowUtil.real_overflow_dump_times += 1 + + @staticmethod + def check_overflow_dump_times(need_dump_times): + if need_dump_times == -1: + return True + return OverFlowUtil.real_overflow_dump_times < need_dump_times + + +def set_overflow_check_switch(switch, filter_switch=Const.OFF): + check_switch_valid(switch) + check_switch_valid(filter_switch) + + OverFlowUtil.set_overflow_check_switch(switch, filter_switch) + + +def dump_overflow(module_name, in_feat, out_feat, dump_file): + name_template = f"{module_name}" + "_{}" + DumpUtil.dump_data_dir = make_dump_data_dir(dump_file) + dump_stack_info(name_template) + if check_inplace_op(name_template): + if Const.PRE_FORWARD in name_template: + name_template = name_template.replace(Const.PRE_FORWARD, Const.FORWARD) + else: + _dump_tensor_completely(in_feat, name_template.format("output")) + return + + if "forward" in name_template: + _dump_tensor_completely(in_feat, name_template.format("input")) + _dump_tensor_completely(out_feat, name_template.format("output")) + else: + _dump_tensor_completely(in_feat, name_template.format("output")) + _dump_tensor_completely(out_feat, name_template.format("input")) + + +def _dump_tensor_completely(x, prefix): + dump_flag = Const.DUMP_RATIO_MAX + 1 + if isinstance(x, (tuple, list)) and x: + for i, item in enumerate(x): + _dump_tensor_completely(item, "{}.{}".format(prefix, i)) + elif isinstance(x, torch.Tensor): + if x.numel() == 0 or len(x.shape) == 0 or not x.is_floating_point(): + if OverFlowUtil.overflow_filter_switch == Const.OFF: + data_info = get_not_float_tensor_info(x) + dump_data_by_rank_count(dump_flag, prefix, data_info) + else: + data_info = get_float_tensor_info(x) + dump_data_by_rank_count(dump_flag, prefix, data_info) + + elif OverFlowUtil.overflow_filter_switch == Const.OFF: + if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): + data_info = get_scalar_data_info(x) + dump_data_by_rank_count(dump_flag, prefix, data_info) + + +def overflow_debug_mode_enalbe(): + overflow_mode = os.getenv(OverflowConst.OVERFLOW_DEBUG_MODE_ENABLE, Const.ENV_DISABLE) + return 
overflow_mode == Const.ENV_ENABLE + + +def check_overflow_npu(): + if overflow_debug_mode_enalbe(): + float_status = torch.zeros(8).npu() + result = torch_npu.npu_get_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE) + if (result.cpu()[0] != 0): + return True + else: + return False + else: + return torch_npu._C._check_overflow_npu() + + +def clear_overflow_npu(): + if overflow_debug_mode_enalbe(): + float_status = torch.zeros(8).npu() + torch_npu.npu_clear_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE) + else: + torch_npu._C._clear_overflow_npu() \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/pt_config.py b/debug/accuracy_tools/atat/pytorch/pt_config.py new file mode 100644 index 0000000000000000000000000000000000000000..a0691915cffc93b4a4505b2453560043b44cdc40 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/pt_config.py @@ -0,0 +1,90 @@ +import os +import json +from ..core.common_config import CommonConfig, BaseConfig +from ..core.utils import Const +from ..core.file_check_util import FileOpen + + +#特定任务配置类 +class TensorConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.check_config() + self._check_file_format() + + def _check_file_format(self): + if self.file_format is not None and self.file_format not in ["npy", "bin"]: + raise Exception("file_format is invalid") + + +class StatisticsConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.check_config() + self._check_summary_mode() + + def _check_summary_mode(self): + if self.summary_mode and self.summary_mode not in ["statistics", "md5"]: + raise Exception("summary_mode is invalid") + + +class OverflowCheckConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.overflow_num = json_config.get("overflow_nums") + self.check_mode = json_config.get("check_mode") + self.check_overflow_config() + + def check_overflow_config(self): + if self.overflow_num is not None and not isinstance(self.overflow_num, int): + raise Exception("overflow_num is invalid") + if self.check_mode is not None and self.check_mode not in ["all", "aicore", "atomic"]: + raise Exception("check_mode is invalid") + +class FreeBenchmarkCheckConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.fuzz_device = json_config.get("fuzz_device") + self.pert_mode = json_config.get("pert_mode") + self.handler_type = json_config.get("handler_type") + self.fuzz_level = json_config.get("fuzz_level") + self.fuzz_stage = json_config.get("fuzz_stage") + self.if_preheat = json_config.get("if_preheat") + self.preheat_step = json_config.get("preheat_step") + self.max_sample = json_config.get("max_sample") + self.check_freebenchmark_config() + + def check_freebenchmark_config(self): + if self.if_preheat and self.handler_type == "fix": + raise Exception("Preheating is not supported in fix handler type") + +def parse_task_config(task, json_config): + default_dic = {} + if task == Const.TENSOR: + config_dic = json_config.get(Const.TENSOR) if json_config.get(Const.TENSOR) else default_dic + return TensorConfig(config_dic) + elif task == Const.STATISTICS: + config_dic = json_config.get(Const.STATISTICS) if json_config.get(Const.STATISTICS) else default_dic + return StatisticsConfig(config_dic) + elif task == Const.OVERFLOW_CHECK: + config_dic = json_config.get(Const.OVERFLOW_CHECK) if json_config.get(Const.OVERFLOW_CHECK) else default_dic + return OverflowCheckConfig(config_dic) + 
elif task == Const.FREE_BENCHMARK: + config_dic = json_config.get(Const.FREE_BENCHMARK) if json_config.get(Const.FREE_BENCHMARK) else default_dic + return FreeBenchmarkCheckConfig(config_dic) + else: + return StatisticsConfig(default_dic) + + +def parse_json_config(json_file_path, task): + if not json_file_path: + config_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + json_file_path = os.path.join(os.path.join(config_dir, "config"), "config.json") + with FileOpen(json_file_path, 'r') as file: + json_config = json.load(file) + common_config = CommonConfig(json_config) + if task and task in Const.TASK_LIST: + task_config = parse_task_config(task, json_config) + else: + task_config = parse_task_config(common_config.task, json_config) + return common_config, task_config \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/service.py b/debug/accuracy_tools/atat/pytorch/service.py new file mode 100644 index 0000000000000000000000000000000000000000..9c079aedebeec26120f27318bab1f1cdc8cf99cb --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/service.py @@ -0,0 +1,180 @@ +import os +from pathlib import Path +import functools +import torch +from .functional import build_repair, build_data_collector, build_step_post_process +from .functional.scope import BaseScope +from .common.utils import get_rank_if_initialized, is_gpu, Const +from .common.file_check import FileChecker, FileCheckConst, check_path_before_create +from .common import print_info_log_rank_0 +from .hook_module.api_registry import api_register +from .hook_module import remove_dropout +from .functional.data_processor import ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs +from .module_processer import ModuleProcesser + + +class Service: + make_dir_flag = True + REGISTER_HOOK_KWARGS = ["overflow_nums", "dump_mode", "dump_config"] + + def __init__(self, config): + self.model = None + self.config = config + self.data_collector = build_data_collector(config) + self.module_processor = ModuleProcesser(self.data_collector.scope) + self.repair = build_repair(config) + self.step_post_process = build_step_post_process(config) + self.switch = False + self.current_iter = 0 + self.first_start = True + self.current_rank = None + self.first_touch_dir = True + + def build_hook(self, module_type, name): + def pre_hook(repair, api_or_module_name, module, args, kwargs): + nonlocal module_type, pid + if module_type == BaseScope.Module_Type_Module: + api_or_module_name = module.mindstudio_reserved_name + self.data_collector.visit_and_clear_overflow_status(api_or_module_name) + + if not self.switch: + return args, kwargs + if repair: + args, kwargs = repair.convert(api_or_module_name, module_type, args, kwargs) + if self.data_collector: + module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=None) + self.data_collector.pre_forward_data_collect(api_or_module_name, module, pid, module_input_output) + return args, kwargs + + def forward_hook(repair, api_or_module_name, module, args, kwargs, output): + nonlocal module_type, pid + if module_type == BaseScope.Module_Type_Module: + api_or_module_name = module.mindstudio_reserved_name + self.data_collector.visit_and_clear_overflow_status(api_or_module_name) + + if not self.switch: + return + if self.data_collector: + module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=output) + self.data_collector.forward_data_collect(api_or_module_name, module, pid, module_input_output) + if 
self.data_collector.if_return_forward_new_output(): + return self.data_collector.get_forward_new_output() + if repair: + output = repair.invert(api_or_module_name, module_type, output) + + return output + + def backward_hook(repair, api_or_module_name, module, grad_input, grad_output): + nonlocal module_type, pid + if module_type == BaseScope.Module_Type_Module: + api_or_module_name = module.mindstudio_reserved_name + self.data_collector.visit_and_clear_overflow_status(api_or_module_name) + + if not self.switch: + return + if self.data_collector: + module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_input, grad_output=grad_output) + self.data_collector.backward_data_collect(api_or_module_name, module, pid, module_input_output) + + pid = os.getpid() + forward_name_template = name + Const.FORWARD + backward_name_template = name + Const.BACKWARD + pre_forward_hook = functools.partial(pre_hook, self.repair, forward_name_template) + forward_hook = functools.partial(forward_hook, self.repair, forward_name_template) + backward_hook = functools.partial(backward_hook, None, backward_name_template) + return pre_forward_hook, forward_hook, backward_hook + + def step(self): + self.current_iter += 1 + if self.step_post_process: + self.step_post_process() + self.data_collector.update_iter(self.current_iter) + + def start(self, model): + self.model = model + if self.config.step and self.current_iter > max(self.config.step): + self.stop() + raise Exception("atat: exit after iteration {}".format(max(self.config.step))) + if self.config.step and self.current_iter not in self.config.step: + return + if self.first_start: + self.current_rank = get_rank_if_initialized() + if self.config.rank and self.current_rank not in self.config.rank: + return + self.register_hook_new() + self.first_start = False + self.switch = True + print_info_log_rank_0(f"Dump switch is turned on at step {self.current_iter}. 
") + if self.config.level != "L2": + self.create_dirs() + print_info_log_rank_0(f"Dump data will be saved in {self.dump_iter_dir}.") + + def stop(self): + if self.config.level == "L2": + return + if self.config.step and self.current_iter not in self.config.step: + return + if self.config.rank and self.current_rank not in self.config.rank: + return + self.switch = False + self.data_collector.write_json() + + def create_dirs(self): + check_path_before_create(self.config.dump_path) + if not os.path.exists(self.config.dump_path): + Path(self.config.dump_path).mkdir(mode=0o750, exist_ok=True) + file_check = FileChecker(self.config.dump_path, FileCheckConst.DIR) + file_check.common_check() + self.dump_iter_dir = os.path.join(self.config.dump_path, f"step{self.current_iter}") + cur_rank = self.current_rank if self.current_rank is not None else '' + dump_dir = os.path.join(self.dump_iter_dir, f"rank{cur_rank}") + if not os.path.exists(dump_dir): + Path(dump_dir).mkdir(mode=0o750, parents=True, exist_ok=True) + if self.config.task in self.data_collector.tasks_need_tensor_data: + dump_data_dir = os.path.join(dump_dir, "dump_tensor_data") + Path(dump_data_dir).mkdir(mode=0o750, exist_ok=True) + else: + dump_data_dir = None + + dump_file_path = os.path.join(dump_dir, "dump.json") + stack_file_path = os.path.join(dump_dir, "stack.json") + construct_file_path = os.path.join(dump_dir, "construct.json") + free_benchmark_file_path = os.path.join(self.config.dump_path, "free_benchmark.csv") + self.data_collector.update_dump_paths( + dump_file_path, stack_file_path, construct_file_path, dump_data_dir, free_benchmark_file_path) + + def register_hook_new(self): + hook_name = self.config.task + + if "overflow_check" in hook_name and not is_gpu: + pass + + print_info_log_rank_0("The {} hook function is successfully mounted to the model.".format(hook_name)) + if self.config.level in ["L0", "mix"]: + assert self.model is not None + print_info_log_rank_0("The init dump mode is enabled, and the module dump function will not be available") + for name, module in self.model.named_modules(): + if module == self.model: + continue + prefix = BaseScope.Module_Type_Module + Const.SEP + name + Const.SEP +\ + module.__class__.__name__ + Const.SEP + + pre_forward_hook, forward_hook, backward_hook = self.build_hook(BaseScope.Module_Type_Module, prefix) + module.register_forward_hook(forward_hook, with_kwargs=True) + module.register_full_backward_hook(backward_hook) + + module.register_forward_pre_hook( + self.module_processor.node_hook(prefix + Const.FORWARD, Const.START)) + module.register_forward_hook( + self.module_processor.node_hook(prefix + Const.FORWARD, Const.STOP)) + module.register_full_backward_pre_hook( + self.module_processor.node_hook(prefix + Const.BACKWARD, Const.START)) + module.register_full_backward_hook( + self.module_processor.node_hook(prefix + Const.BACKWARD, Const.STOP)) + + if self.config.level in ["mix", "L1", "L2"]: + api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API)) + api_register.api_modularity() + + if Const.STATISTICS in hook_name or Const.TENSOR in hook_name: + remove_dropout() diff --git a/debug/accuracy_tools/grad_tool/README.md b/debug/accuracy_tools/grad_tool/README.md index b7afed5ea8310f1bdd377abac22602e00a453651..6958a118d6c4b14949db53540152d8a03b814041 100644 --- a/debug/accuracy_tools/grad_tool/README.md +++ b/debug/accuracy_tools/grad_tool/README.md @@ -213,28 +213,57 @@ GradComparator.compare_distributed("配置文件里写的输出目录", ## 公开接口 +**接口说明** + 
```python
GradientMonitor.monitor(module)
```
-**Parameter description**
-
| Parameter | Description | Required |
| ----- | -------------------- | -------- |
| module | Under PyTorch, the model to monitor (must be a torch.nn.Module); under MindSpore, the optimizer. | Yes |

+**Interface description**
```python
GradientMonitor.__init__(config_path)
```
-**Parameter description**
-
| Parameter | Description | Required |
| ----------- | ------------------------------------------------------------ | -------- |
| config_path | Path to the configuration file; must end with .yaml. | Yes |

+**Interface description**
+
+```python
+GradComparator.compare(rank_dir_path1,
+                       rank_dir_path2,
+                       output_path,
+                       framework="PyTorch")
+```
+| Parameter | Description | Required |
+| ----- | -------------------- | -------- |
+| rank_dir_path1 | One of the rank_id-level directories to compare. | Yes |
+| rank_dir_path2 | The other rank_id-level directory to compare; interchangeable with rank_dir_path1. | Yes |
+| output_path | Output directory; created if it does not exist. | Yes |
+| framework | Framework name, either PyTorch or MindSpore. | Yes |
+
+**Interface description**
+
+```python
+GradComparator.compare_distributed(grad_output_path1,
+                                   grad_output_path2,
+                                   output_path,
+                                   framework="PyTorch")
+```
+
+| Parameter | Description | Required |
+| ----- | -------------------- | -------- |
+| grad_output_path1 | One of the dump directories to compare, i.e. the output_path set in the configuration file. | Yes |
+| grad_output_path2 | The other dump directory to compare (also an output_path from a configuration file); interchangeable with grad_output_path1. | Yes |
+| output_path | Output directory; created if it does not exist. | Yes |
+| framework | Framework name, either PyTorch or MindSpore. | Yes |

# FAQ
diff --git a/debug/accuracy_tools/grad_tool/common/base_comparator.py b/debug/accuracy_tools/grad_tool/common/base_comparator.py
index 36ecc320c63e8b034ba60a3f0e6d1f49c65999c2..b5dc45b20cc41422838ef66c3bc4ac9690d18277 100644
--- a/debug/accuracy_tools/grad_tool/common/base_comparator.py
+++ b/debug/accuracy_tools/grad_tool/common/base_comparator.py
@@ -3,6 +3,7 @@ from typing import List
 from abc import ABC, abstractmethod
 from tqdm import tqdm
+import pandas as pd
 import matplotlib.pyplot as plt
 from grad_tool.common.constant import GradConst
@@ -71,15 +72,35 @@ class BaseComparator(ABC):
         head_tuple = tuple(['step'] + [str(step) for step in steps])
         write_csv(os.path.join(output_dir, "similarities.csv"), [[key] + value], head_tuple)
+    @staticmethod
+    def _get_grad_weight_order(path1, path2):
+        for summary_file in os.listdir(path1):
+            if not summary_file.endswith(".csv"):
+                continue
+            if not os.path.exists(os.path.join(path2, summary_file)):
+                continue
+            summary_csv = pd.read_csv(os.path.join(path1, summary_file))
+            return summary_csv["param_name"]
+        raise RuntimeError("no matched grad_summary.csv for comparison, please dump data in the same configuration")
+
+    @staticmethod
+    def _get_name_matched_grad_file(param_name, grad_files):
+        for grad_file in grad_files:
+            if param_name == grad_file[:grad_file.rfind('.')]:
+                return grad_file
+        raise RuntimeError("no matched grad_file for comparison, please dump data in the same configuration")
+
     @classmethod
     def _calculate_separated_similarities(cls, path1, path2, steps):
         similarities = {}
         print_info_log(f"{len(steps)} steps will be compared")
+        grad_weight_order = cls._get_grad_weight_order(path1, path2)
         for step in tqdm(steps, desc="calculate similarities (by step)"):
             grad_files = cls._get_matched_grad_files(path1, path2, step)
             same_count_summary = 0
             total_count_summary = 0
-            for grad_file in grad_files:
+            for grad_name in grad_weight_order:
+                grad_file = cls._get_name_matched_grad_file(grad_name, grad_files)
                 grad1 = os.path.join(path1, f"step_{step}", grad_file)
                 grad2 = os.path.join(path2, f"step_{step}", grad_file)
                 same_count, total_count = cls._calculate_similarity(grad1, grad2)
diff --git a/debug/accuracy_tools/grad_tool/grad_ms/grad_monitor.py b/debug/accuracy_tools/grad_tool/grad_ms/grad_monitor.py
index 
f822fa923b5b909c2dd96caf505b07fa0c26e22e..34eb4ea23ae49f46059e582c99f6029637e4dc4b 100644 --- a/debug/accuracy_tools/grad_tool/grad_ms/grad_monitor.py +++ b/debug/accuracy_tools/grad_tool/grad_ms/grad_monitor.py @@ -10,9 +10,9 @@ class MsGradientMonitor(BaseMonitor): def __init__(self, config_file: str): super(MsGradientMonitor, self).__init__(config_file) grad_context.init_context(self.config) - csv_generator.init(grad_context) def monitor(self, module): + csv_generator.init(grad_context) hook_optimizer(module) def stop(self): diff --git a/debug/accuracy_tools/grad_tool/grad_pt/level_adapter.py b/debug/accuracy_tools/grad_tool/grad_pt/level_adapter.py index 8eb5b1eb857ca3cb44f9cff219f2be837139c3de..520b0ce0cd8623cd7ca91956f80e97608a407b9b 100644 --- a/debug/accuracy_tools/grad_tool/grad_pt/level_adapter.py +++ b/debug/accuracy_tools/grad_tool/grad_pt/level_adapter.py @@ -2,7 +2,7 @@ import os import hashlib from abc import ABC, abstractmethod import torch -from grad_tool.common.utils import print_info_log +from grad_tool.common.utils import print_info_log, create_directory class LevelOps: @@ -34,7 +34,7 @@ class LevelOps: @staticmethod def save_grad_direction(param_name, grad, save_path): if not os.path.exists(save_path): - os.makedirs(save_path) + create_directory(save_path) param_grad = grad.clone().detach() is_positive = param_grad > 0 torch.save(is_positive, f'{save_path}/{param_name}.pt') diff --git a/debug/accuracy_tools/grad_tool/test/resources/test_save_grad.yaml b/debug/accuracy_tools/grad_tool/test/resources/test_save_grad.yaml deleted file mode 100644 index 136803d26e96f4871f2c96137cf731b2b1366009..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/grad_tool/test/resources/test_save_grad.yaml +++ /dev/null @@ -1,6 +0,0 @@ -level: L2 -param_list: -rank: -step: -bounds: -output_path: ./output/test_save_grad \ No newline at end of file diff --git a/debug/accuracy_tools/grad_tool/test/ut/test_grad_csv.py b/debug/accuracy_tools/grad_tool/test/ut/test_grad_csv.py index 1f3544e06ee643e1374786bb25ba09cb5c32f5b0..48a23c887e02c14d021d4191cb2fdae2bf9dbc33 100644 --- a/debug/accuracy_tools/grad_tool/test/ut/test_grad_csv.py +++ b/debug/accuracy_tools/grad_tool/test/ut/test_grad_csv.py @@ -15,17 +15,13 @@ class TestGradCSV(unittest.TestCase): GradStatCsv.generate_csv_header(level=LevelAdapter.level_adapter("L0"), bounds=[-1, 0, 1])) def test_level_L1_header(self): - self.assertEqual(['param_name', 'MD5', '(-inf, -1]', '(-1, 0]', '(0, 1]', '(1, inf)', '=0', 'max', 'min', 'norm', 'shape'], + self.assertEqual(['param_name', 'max', 'min', 'norm', 'shape'], GradStatCsv.generate_csv_header(level=LevelAdapter.level_adapter("L1"), bounds=[-1, 0, 1])) def test_level_L2_header(self): - self.assertEqual(['param_name', 'MD5', 'max', 'min', 'norm', 'shape'], + self.assertEqual(['param_name', '(-inf, -1]', '(-1, 0]', '(0, 1]', '(1, inf)', '=0', 'max', 'min', 'norm', 'shape'], GradStatCsv.generate_csv_header(level=LevelAdapter.level_adapter("L2"), bounds=[-1, 0, 1])) - def test_level_L3_header(self): - self.assertEqual(['param_name', 'MD5', '(-inf, -1]', '(-1, 0]', '(0, 1]', '(1, inf)', '=0', 'max', 'min', 'norm', 'shape'], - GradStatCsv.generate_csv_header(level=LevelAdapter.level_adapter("L3"), bounds=[-1, 0, 1])) - def test_level_L0_content(self): generated_csv_line = GradStatCsv.generate_csv_line( level=LevelAdapter.level_adapter("L0"), @@ -41,7 +37,7 @@ class TestGradCSV(unittest.TestCase): param_name="model.conv2d", grad=grad_tensor, bounds=[-1, 0, 1]) - 
self.assertEqual(['model.conv2d', '678a6c7d9d9716682b56fda097d0936c', 0.25, 0.0, 0.5, 0.25, 0.0, 2.0, -2.0, 2.851315498352051, [2, 2]],
+        self.assertEqual(['model.conv2d', 2.0, -2.0, 2.851315498352051, [2, 2]],
                          generated_csv_line)
@@ -50,14 +46,5 @@
     def test_level_L2_content(self):
         generated_csv_line = GradStatCsv.generate_csv_line(
             level=LevelAdapter.level_adapter("L2"),
             param_name="model.conv2d",
             grad=grad_tensor,
             bounds=[-1, 0, 1])
-        self.assertEqual(['model.conv2d', '678a6c7d9d9716682b56fda097d0936c', 2.0, -2.0, 2.851315498352051, [2, 2]],
-                         generated_csv_line)
-
-    def test_level_L3_content(self):
-        generated_csv_line = GradStatCsv.generate_csv_line(
-            level=LevelAdapter.level_adapter("L3"),
-            param_name="model.conv2d",
-            grad=grad_tensor,
-            bounds=[-1, 0, 1])
-        self.assertEqual(['model.conv2d', '678a6c7d9d9716682b56fda097d0936c', 0.25, 0.0, 0.5, 0.25, 0.0, 2.0, -2.0, 2.851315498352051, [2, 2]],
+        self.assertEqual(['model.conv2d', 0.25, 0.0, 0.5, 0.25, 0.0, 2.0, -2.0, 2.851315498352051, [2, 2]],
                          generated_csv_line)
diff --git a/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py b/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py
index 4c33717f7cf5aaa162a566d9b9cd1f7e3165a9c5..174dae2693caba4df23d8a52c448ad42932bbd3e 100644
--- a/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py
+++ b/debug/accuracy_tools/grad_tool/test/ut/test_grad_monitor.py
@@ -52,32 +52,13 @@ def test_grad_monitor():
         optimizer.step()
     return gm
-
-def test_grad_monitor_1():
-    gm = GradientMonitor(os.path.join(base_dir, "resources/test_save_grad.yaml"))
-    loss_fun = nn.CrossEntropyLoss()
-    test_module = TestModule()
-    nn.init.constant_(test_module.linear.weight, 1.0)
-    nn.init.constant_(test_module.linear.bias, 1.0)
-    gm.monitor(test_module)
-    optimizer = torch.optim.SGD(test_module.parameters(), lr=1e-2)
-    for input_data, label in zip(inputs, labels):
-        output = test_module(input_data)
-        loss = loss_fun(output, label)
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-    return gm
-
-
class TestGradMonitor(unittest.TestCase):
     def test_compare(self):
-        gm1 = test_grad_monitor()
-        gm2 = test_grad_monitor_1()
-        compare_output_path = os.path.join(os.path.dirname(gm1._output_path), "grad_compare")
-        GradComparator.compare_distributed(gm1._output_path, gm2._output_path, compare_output_path)
+        gm = test_grad_monitor()
+        compare_output_path = os.path.join(os.path.dirname(gm.grad_monitor._output_path), "grad_compare")
+        GradComparator.compare_distributed(gm.grad_monitor._output_path, gm.grad_monitor._output_path, compare_output_path)
         items = os.listdir(compare_output_path)
         self.assertEqual(len(items), 1)
         with open(os.path.join(compare_output_path, items[0], "similarities.csv"), 'r') as f:
             data = f.read()
-        self.assertEqual(hashlib.md5(data.encode("utf-8")).hexdigest(), "20441d98b8c8d14ee6f896ea29d01b14")
+        self.assertEqual(hashlib.md5(data.encode("utf-8")).hexdigest(), "3762fafc89c805e7863f50aaffaf8161")
diff --git a/debug/accuracy_tools/kj600/README.md b/debug/accuracy_tools/kj600/README.md
index 05fcb4a215b0e124b2227b1f64519819c9fa5b22..39c7bc6aa3754027338510f0bb9d1be6c328e17d 100644
--- a/debug/accuracy_tools/kj600/README.md
+++ b/debug/accuracy_tools/kj600/README.md
@@ -1,18 +1,19 @@
-# kj600 model training state monitoring tool
+# TensorProbe (codename: kj600) model training state monitoring tool

## Introduction

-This project develops a model training state monitoring tool named kj600, which collects and aggregates the intermediate states of layers and the optimizer during training to help diagnose anomalies that occur during model training.
+This project develops a model training state monitoring tool that collects and aggregates the intermediate values of network layers, the optimizer and communication operators during training, helping diagnose anomalies in the compute, communication and optimizer parts of model training.

## Installation

### 1. Install dependencies
| Dependency  |
-| ----------- |
-| PyTorch     |
+|-------------|
+| torch       |
 | torch_npu   |
 | torchvision |
+| tensorboard |

### 2. Install kj600
@@ -41,30 +42,44 @@ pip install -e .
     "targets": {
       "language_model.encoder.layers.0": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"}
     },
-    "module_ranks": "1,2,3,4",
-    "ur_distribution": true
+    "print_struct": false,
+    "module_ranks": [1,2,3,4],
+    "ur_distribution": true,
+    "xy_distribution": true,
+    "mv_distribution": true,
+    "wg_distribution": true,
+    "mg_direction": true,
+    "cc_distribution": {"enable":true, "cc_codeline":[]},
+    "alert": {
+        "rules": [{"rule_name": "AnomalyTurbulence", "args": {"threshold": 0.5}}]
+    },
+    "ops": ["min", "max", "norm", "zeros", "id"],
+    "eps": 1e-8
 }
```
-Each module to be monitored has a specific input/output format (which depends on the model implementation), so the forward input/output formats and the formats of the input-tensor and output-tensor gradients in the backward pass must be specified. If unsure, you can guess first; a detailed error is reported when the format spec does not match the actual inputs. We will keep adding format specs for more commonly used modules.
+Each module to be monitored has its own specific input/output format (which depends on the model implementation), so the forward input/output formats and the formats of the input-tensor and output-tensor gradients in the backward pass must be specified. If unsure, set "targets" to empty ("targets":{}) and set the "print_struct" field to true; the tool will then print the detailed model structure. We will keep adding format specs for more commonly used modules.

The fields are explained in detail below:

-"targets": required; specifies the model layers to monitor, e.g. layer 0 of a transformer, language_model.encoder.layers.0. If you are unsure about layer names, use an empty json config file; the tool will then print the names of the torch modules in the model, from which you can pick the modules you care about.
-
-"input": optional; "tuple[2]:0" means the forward input of the target module is a tuple of length 2, and we care about element 0.
-
-"output": required; "tensor" means the forward output of the target module is of type tensor.
-
-"input_grad": optional; "tuple[2]:0" means the backward input_grad of the target module is a tuple of length 2, and we care about element 0.
-
-"output_grad": required; "tuple[1]:0" means the backward output_grad of the target module is a tuple of length 1, and we care about element 0.
-
-"module_ranks": optional; controls on which ranks module monitoring is enabled in distributed training. If omitted, monitoring is enabled on all ranks.
-
-"ur_distribution": optional; if true, the value distributions of the Adam optimizer's update and ratio are collected and shown in heatmaps. Defaults to false.
-
-"mg_direction": optional; if true, records the fraction of parameters whose Adam momentum points in the same direction as the current gradient.
+| Field | Required | Description |
+| ------------------------------------------------------------ | -------- | -------- |
+|"targets"| Required |Specifies the model layers to monitor, e.g. layer 0 of a transformer, language_model.encoder.layers.0. If you are unsure about the model structure, set "targets" to empty ("targets":{}) and set the "print_struct" field to true; the tool will print the names and detailed structure of the torch modules in the model and exit after the first step, from which you can pick the modules you care about.|
+|"input"| Optional |"tuple[2]:0" means the forward input of the target module is a tuple of length 2, and we care about element 0.|
+|"output"| Required |"tensor" means the forward output of the target module is of type tensor.|
+|"input_grad"| Optional |"tuple[2]:0" means the backward input_grad of the target module is a tuple of length 2, and we care about element 0.|
+|"output_grad"| Required |"tuple[1]:0" means the backward output_grad of the target module is a tuple of length 1, and we care about element 0.|
+|"print_struct"| Optional |If true, the tool prints the names and detailed structure of the torch modules in the model and exits after the first step. Defaults to false.|
+|"module_ranks"| Optional |Controls on which ranks module monitoring is enabled in distributed training. If omitted, monitoring is enabled on all ranks.|
+|"ur_distribution"| Optional |If true, collects the value distributions of the update and ratio vectors of the Adam optimizer for the parameters of the specified modules (from "targets") and shows them in heatmaps. Defaults to false. Relies on the histc operator and requires CANN 8.0.RC2 or later; otherwise there are severe performance problems. |
+|"xy_distribution"| Optional | If true, monitors the input/output tensors of the specified modules (from "targets"). Defaults to false.|
+|"mv_distribution"| Optional | If true, monitors the optimizer state of the parameters in the specified modules. Defaults to false. opt_ty must be set correctly in the TrainerMon constructor. Currently only Megatron's mixed-precision optimizer and Megatron's distributed optimizer are supported; DeepSpeed's distributed optimizer implementation is not supported yet. |
+|"wg_distribution"| Optional | If true, monitors the parameter gradients of the specified modules. Defaults to false. |
+|"alert"| Required | Specifies the anomaly-detection rules used for automatic alerting and their thresholds. The currently implemented rule is AnomalyTurbulence, which fires when a monitored scalar moves outside the given band around its historical mean (set by threshold; 0.5 means 50% above or below). Alerts are currently printed to the console; sending e-mails and writing to a database are planned.|
+|"mg_direction"| Optional | If true, records the fraction of parameters whose Adam first moment ($m_{t-1}$) has the same sign as the current gradient ($g_t$).|
+|"cc_distribution"| Optional | The "enable" field is the on/off switch; "cc_codeline" specifies the code lines to monitor, e.g. "train.py\\[23\\]", and defaults to an empty list (no restriction). The "cc_log_only" field controls whether data is collected; if true, only the invoked operators and their call stacks are recorded.|
+|"ops"| Optional |Used together with ur_distribution, xy_distribution, mv_distribution, wg_distribution, mg_direction and cc_distribution; monitors the min, max, norm and zeros values of the selected tensors. zeros is the fraction of elements whose absolute value is below eps; id records the selected non-tensor value itself. Defaults to [].|
+|"eps"| Optional |Must be set if "zeros" is included in ops. Defaults to 1e-8.|

The forward input/output formats and the backward input/output gradient formats of modules commonly found in transformer-architecture models are given below for reference:
@@ -98,11 +113,14 @@ pip install -e .
```
    from kj600.module_hook import TrainerMon
-    hooker = TrainerMon("./llama2_config.json")
-    hooker.hook_modules(model=model, global_batch_size=args.global_batch_size, dp=args.data_parallel_size, micro_batch_size=args.micro_batch_size, fwd_or_bkd=0)
+    hooker = TrainerMon("./llama2_config.json", params_have_main_grad=True, opt_ty="Megatron_DistributedOptimizer") # or opt_ty="Megatron_Float16OptimizerWithFloat16Params"
+    hooker.hook_modules(model=model, grad_acc_steps=args.global_batch_size//args.data_parallel_size//args.micro_batch_size)
```
+    params_have_main_grad: if True, parameter weight gradients are taken from main_grad, otherwise from grad. Defaults to True.
+
+    If the training framework is not Megatron-LM, set grad_acc_steps to the corresponding number of gradient accumulation steps.

-    To monitor the momentum and variance of a mixed-precision optimizer, add the following code after the mixed-precision optimizer is constructed:
+    To monitor the momentum and variance of a mixed-precision optimizer, add the following code after the mixed-precision optimizer is constructed. Currently only Megatron_DistributedOptimizer (bf16 or fp16 mixed precision with the distributed optimizer enabled) or Megatron_Float16OptimizerWithFloat16Params (bf16 or fp16 mixed precision without the distributed optimizer) is supported.
```
    model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
@@ -141,3 +159,37 @@ ssh -N -L localhost:6006:localhost:6006 your_username@remote_server_address
Add "params_effrank": "<name of the weight-matrix parameter>" to the tool configuration file
 "params_effrank": ["language_model.encoder.layers.0.self_attention.query_key_value.weight"]
+## Public interfaces
+
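+A minimal end-to-end sketch of how the interfaces documented below fit together in a Megatron-style training script (`model`, `optimizer` and the config path are placeholders; grad_acc_steps uses Megatron's formula):
+
+```python
+from kj600.module_hook import TrainerMon
+
+hooker = TrainerMon("./llama2_config.json",
+                    params_have_main_grad=True,
+                    opt_ty="Megatron_DistributedOptimizer")
+# grad_acc_steps = global_batch_size // data_parallel_size // micro_batch_size for Megatron-LM
+hooker.hook_modules(model=model, grad_acc_steps=8)
+# For mixed-precision training, register the wrapped optimizer once it exists so
+# that mv_distribution/ur_distribution can read momentum and variance:
+TrainerMon.set_wrapped_optimizer(optimizer)
+# ... training loop; metrics are written under KJ600_OUTPUT_DIR (default ./kj600_output)
+# and can be viewed in TensorBoard ...
+```
+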
+**Interface description**
+
+```python
+TrainerMon.__init__(config_file_path, params_have_main_grad=True, opt_ty=None) -> None
+```
+
+| Parameter | Description | Required |
+| ----- | -------------------- | -------- |
+| config_file_path | Path of the json configuration file you wrote. | Yes |
+| params_have_main_grad | Whether weights use main_grad (True) or grad (False). Defaults to True. | No |
+| opt_ty | Optimizer type. Two options: Megatron_DistributedOptimizer (bf16 or fp16 mixed precision with the distributed optimizer enabled) and Megatron_Float16OptimizerWithFloat16Params (bf16 or fp16 mixed precision without the distributed optimizer; also suitable for a regular adam optimizer). Use None if the optimizer is not adam-based. Defaults to None. | No |
+
+**Interface description**
+
+```python
+TrainerMon.hook_modules(model, grad_acc_steps) -> None
+```
+
+| Parameter | Description | Required |
+| ----- | -------------------- | -------- |
+| model | The model to monitor; must be a torch.nn.Module. | Yes |
+| grad_acc_steps | Number of gradient accumulation steps. | Yes |
+
+**Interface description**
+
+```python
+TrainerMon.set_wrapped_optimizer(_wrapped_optimizer) -> None
+```
+
+| Parameter | Description | Required |
+| ----- | -------------------- | -------- |
+| _wrapped_optimizer | The mixed-precision optimizer created by megatron. | Yes |
\ No newline at end of file
diff --git a/debug/accuracy_tools/kj600/img/cpu_info.png b/debug/accuracy_tools/kj600/img/cpu_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..c69eb61b11be5901428fd20b3d5f69909efffafb
Binary files /dev/null and b/debug/accuracy_tools/kj600/img/cpu_info.png differ
diff --git a/debug/accuracy_tools/kj600/img/train.png b/debug/accuracy_tools/kj600/img/train.png
new file mode 100644
index 0000000000000000000000000000000000000000..2dde057196dd934d29fb766cd9fea3ff527a696a
Binary files /dev/null and 
b/debug/accuracy_tools/kj600/img/train.png differ
diff --git a/debug/accuracy_tools/kj600/img/train_with_kj600.png b/debug/accuracy_tools/kj600/img/train_with_kj600.png
new file mode 100644
index 0000000000000000000000000000000000000000..b64a6d1f48004ac246b53f381f863348f58d196c
Binary files /dev/null and b/debug/accuracy_tools/kj600/img/train_with_kj600.png differ
diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_detect.py b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py
new file mode 100644
index 0000000000000000000000000000000000000000..24f37a7a84aa9ea2bf260259080510ba0e27733a
--- /dev/null
+++ b/debug/accuracy_tools/kj600/kj600/anomaly_detect.py
@@ -0,0 +1,84 @@
+import statistics as st
+from abc import ABC
+from typing import List
+import sys
+from torch.utils.tensorboard import SummaryWriter
+from collections import defaultdict
+
+class ScanRule(ABC):
+    def apply(self, history, cur):
+        raise NotImplementedError("abstract method apply is not implemented")
+
+class AnomalyTurbulence(ScanRule):
+    name = "AnomalyTurbulence"
+    def __init__(self, threshold) -> None:
+        self.threshold = threshold
+    def apply(self, history, cur):
+        baseline = st.mean(history) if isinstance(history, list) else history
+
+        up_bound = baseline + baseline * self.threshold
+        if baseline > 0:
+            return cur > up_bound
+        else:
+            return cur < up_bound
+
+class AnomalyScanner:
+
+    @staticmethod
+    def load_rules(specs: List[dict]):
+        if specs is None:
+            return []
+        alert_rules = []
+        for spec in specs:
+            rule_cls_name = spec["rule_name"]
+            rule_args = spec["args"]
+            cur_module = sys.modules[__name__]
+            rule_cls = getattr(cur_module, rule_cls_name)
+            rule_instance = rule_cls(**rule_args)
+            alert_rules.append(rule_instance)
+        return alert_rules
+
+    @staticmethod
+    def scan(scan_rules: List[ScanRule], history, cur):
+        anomaly = False
+        for rule in scan_rules:
+            anomaly = rule.apply(history, cur)
+            if anomaly:
+                return anomaly, rule.name
+        return anomaly, None
+
+class bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+class SummaryWriterWithAD(SummaryWriter):
+    def __init__(self, path, ad_rules, anomaly_inform=False):
+        super().__init__(path)
+        self.tag2scalars = defaultdict(list)
+        self.ad_rules = ad_rules
+        self.anomaly_inform = anomaly_inform
+
+    def _ad(self, scalar_value, history):
+        return AnomalyScanner.scan(self.ad_rules, history, cur=scalar_value)
+
+    def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False):
+        new_avg = avg = scalar_value
+        if tag in self.tag2scalars:
+            N = len(self.tag2scalars[tag])
+            _, avg = self.tag2scalars[tag][-1]
+            new_avg = (avg*N + scalar_value)/(N + 1)
+        self.tag2scalars[tag].append((scalar_value, new_avg))
+        detected, rule_name = self._ad(scalar_value, history=avg)
+        if detected:
+            exception_message = f"{bcolors.WARNING}> Rule {rule_name} reports anomaly signal in {tag} at step {global_step}.{bcolors.ENDC}"
+            print(exception_message)
+            if self.anomaly_inform:
+                self.anomaly_inform.run(exception_message)
+        return super().add_scalar(tag, scalar_value, global_step, walltime, new_style, double_precision)
diff --git a/debug/accuracy_tools/kj600/kj600/anomaly_inform.py b/debug/accuracy_tools/kj600/kj600/anomaly_inform.py
new file mode 100644
index 
0000000000000000000000000000000000000000..0bdafdaf827e5ac6658bccc0de83294d9f313602 100644
--- /dev/null
+++ b/debug/accuracy_tools/kj600/kj600/anomaly_inform.py
@@ -0,0 +1,75 @@
+import smtplib
+from email.mime.text import MIMEText
+import sqlite3
+from datetime import datetime, timedelta
+
+# Factory that inspects 'recipient' and returns the matching informer subclass
+class AnomalyInformFactory:
+    @staticmethod
+    def create_informer(**kwargs):
+        if kwargs['recipient'] == "database":
+            return DatabaseInform(**kwargs)
+        elif kwargs['recipient'] == "email":
+            return EmailInform(**kwargs)
+        else:
+            raise ValueError("Invalid recipient specified")
+
+# Base class for informers that report anomalies via a database or e-mail
+class AnomalyInform:
+    def __init__(self, **kwargs):
+        self.inform_args = kwargs
+        self.exception_message_list = []
+        self.time = 0
+        self.current_time = 0
+
+    def inform_fun(self, exception_message_list):
+        pass
+
+    def run(self, exception_message):
+        if self.time != 0 and self.current_time == 0:
+            self.current_time = datetime.now()
+        if self.time == 0 or ((self.current_time - self.time) > timedelta(minutes=self.interval_time)):
+            self.exception_message_list.append(exception_message)
+            self.inform_fun(self.exception_message_list)
+            self.exception_message_list = []
+            self.time = datetime.now()
+        elif (self.current_time - self.time) <= timedelta(minutes=self.interval_time):
+            self.exception_message_list.append(exception_message)
+            self.current_time = datetime.now()
+
+class DatabaseInform(AnomalyInform):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.interval_time = 2
+
+    def inform_fun(self, exception_message_list):
+        with sqlite3.connect(self.inform_args['connection_str']) as conn:
+            cursor = conn.cursor()
+            cursor.execute('''CREATE TABLE IF NOT EXISTS exceptions(
+                           id INTEGER PRIMARY KEY,
+                           message TEXT
+                           )''')
+            now_time = datetime.now()
+            for exception_message in exception_message_list:
+                exception_message = f"Current time is: {now_time}" + exception_message
+                cursor.execute("INSERT INTO exceptions (message) VALUES (?)", (exception_message,))
+
+class EmailInform(AnomalyInform):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.interval_time = 10
+
+    def inform_fun(self, exception_message_list):
+        subject = "Exception Detected in Your Program"
+        text = f"{len(exception_message_list)} exceptions were detected in your program:\n\n"
+        for exception_message in exception_message_list:
+            text += exception_message + '\n'
+        message = MIMEText(text, "plain")
+        message["Subject"] = subject
+        message["From"] = self.inform_args['email']
+        message["To"] = self.inform_args['email']
+
+        with smtplib.SMTP(self.inform_args['smtp_server_name'], self.inform_args.get('smtp_number', 587)) as server:
+            server.starttls()
+            server.login(self.inform_args['id'], self.inform_args['password'])
+            server.sendmail(self.inform_args['email'], self.inform_args['email'], message.as_string())
diff --git 
a/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py b/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..4d2ba7a274d9bbc0f645fb78393bb2e7ec05c5bc --- /dev/null +++ b/debug/accuracy_tools/kj600/kj600/distributed/wrap_distributed.py @@ -0,0 +1,191 @@ +import os +from functools import wraps +from collections import defaultdict +import yaml +import re +import inspect +import functools +import torch +import torch.nn as nn +import torch.distributed as dist +import torch.utils.hooks as full_hooks + +from ..module_metric import get_metrics + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "distributed_ops.yaml") +with open(yaml_path) as f: + WrapDistributedOps = yaml.safe_load(f).get('distributed') + +npu_distributed_api = ['isend', 'irecv'] + +distributed_func = {} +for f in dir(dist): + distributed_func[f] = getattr(dist, f) + +ORIGIN_WAIT = getattr(dist.Work, 'wait') +PENDING_ASYNC_CC_BY_HANDLE = {} + + +def get_distributed_ops(): + global WrapDistributedOps + _all_distributed_ops = dir(dist) + return set(WrapDistributedOps) & set(_all_distributed_ops) + + +class DistributedOPTemplate(nn.Module): + def __init__(self, op_name, hook): + super(DistributedOPTemplate, self).__init__() + self.op_name_ = op_name + self.prefix_op_name_ = str(op_name) + self.register_forward_hook(hook(), with_kwargs=True) + + def forward(self, *args, **kwargs): + return distributed_func.get(self.op_name_)(*args, **kwargs) + + +class ApiRegistry: + def __init__(self): + self.distributed_attr_origin = {} + self.distributed_attr_hooked = {} + + @staticmethod + def store_ori_attr(ori_api_group, api_list, api_ori_attr): + for api in api_list: + if '.' in api: + sub_module_name, sub_op = api.rsplit('.', 1) + sub_module = getattr(ori_api_group, sub_module_name) + api_ori_attr[api] = getattr(sub_module, sub_op) + else: + api_ori_attr[api] = getattr(ori_api_group, api) + + @staticmethod + def set_api_attr(api_group, attr_dict): + for cc_api_name, cc_api_entry_func in attr_dict.items(): + if '.' 
in cc_api_name: + sub_module_name, sub_op = cc_api_name.rsplit('.', 1) + sub_module = getattr(api_group, sub_module_name, None) + if sub_module is not None: + setattr(sub_module, sub_op, cc_api_entry_func) + else: + setattr(api_group, cc_api_name, cc_api_entry_func) + + def redirect_api(self): + self.set_api_attr(dist, self.distributed_attr_hooked) + self.set_api_attr(dist.distributed_c10d, self.distributed_attr_hooked) + self.redirect_wait() + + def restore_api(self): + self.set_api_attr(dist, self.distributed_attr_origin) + self.set_api_attr(dist.distributed_c10d, self.distributed_attr_origin) + setattr(dist.Work, 'wait', ORIGIN_WAIT) + + def initialize_hook(self, hook): + self.store_ori_attr(dist, get_distributed_ops(), self.distributed_attr_origin) + for op_name in get_distributed_ops(): + self.distributed_attr_hooked[op_name] = DistributedOPTemplate(op_name, hook) + + def redirect_wait(self): + global ORIGIN_WAIT + global PENDING_ASYNC_CC_BY_HANDLE + + def wrapped_wait(work): + def wrapped_wait(*args, **kwargs): + ORIGIN_WAIT(*args, **kwargs) + if args[0] in PENDING_ASYNC_CC_BY_HANDLE: + store_func = PENDING_ASYNC_CC_BY_HANDLE.pop(args[0]) + store_func() + return wrapped_wait + + dist.Work.wait = wrapped_wait(dist.Work) + + +def get_callstack(): + callstack = [] + for (_, path, line, func, code, _) in inspect.stack(): + stack_line = f'{path}[{line}]' + callstack.append(stack_line) + return callstack + +def op_aggregate(op, t1, t2): + if op == 'min': + return min(t1, t2) + if op == 'max': + return max(t1, t2) + if op == 'norm': + return (t1**2+t2**2)**0.5 + if op == 'zeros': # TODO wrong + return (t1+t2)/2 + +def update_data(old, new): + updated = {op:{} for op in new.keys()} + if old: + for op, tag2tensor in old.items(): + for tag, t_old in tag2tensor.items(): + t_new = new[op][tag] + updated[op][tag] = op_aggregate(op, t_old, t_new) + else: + updated = new + return updated + +def is_target_line(codeline): + stack = get_callstack() + whole_stack = ';'.join(stack) + if codeline == []: + return True + for pattern in codeline: + if re.search(pattern, whole_stack): + return True + return False + +def catch_data(cc_context, ops, module, args, out=None): + tensor_args = {} + for arg in args: + if isinstance(arg, torch.Tensor): + tensor_args[f'input_{len(tensor_args)}'] = arg + elif isinstance(arg, list): + arg = torch.stack(arg) + tensor_args[f'input_{len(tensor_args)}'] = arg + new_data = {op: get_metrics(op, tensor_args, 1e-8) for op in ops} + cc_context.indata=update_data(cc_context.indata, new_data) + if out and isinstance(out, dist.Work): + tensor_res = {} + for res in out.result(): + if isinstance(res, torch.Tensor): + tensor_res[f'output_{len(tensor_res)}'] = res + new_data = {op: get_metrics(op, tensor_res, 1e-8) for op in ops} + cc_context.outdata=update_data(cc_context.outdata, new_data) + +def create_store_func(context, ops, module, args, out): + def store_data(): + catch_data(context, ops, module, args, out) + return store_data + +def create_hook(context, monitor): + def cc_hook(module, args, kwargs, out=None): + if monitor.cc_log_only: + stack = ';'.join(get_callstack()[4:7]) + monitor.cc_logged_stack[module.prefix_op_name_].add(stack) + return out + args = args + tuple(kwargs.values()) + if (dist.is_initialized() and dist.get_rank() not in monitor.module_rank_list and monitor.module_rank_list != []): + return out + if not is_target_line(monitor.cc_codeline): + return out + if out: # async + PENDING_ASYNC_CC_BY_HANDLE[out] = create_store_func(context[module.prefix_op_name_], 
monitor.ops, module, args, out) + return out + catch_data(context[module.prefix_op_name_], monitor.ops, module, args, out) + return out + return cc_hook + +api_register = ApiRegistry() + diff --git a/debug/accuracy_tools/kj600/kj600/features.py b/debug/accuracy_tools/kj600/kj600/features.py index b4fc8f3085ee606972ea6fcdf32a56196a713877..be54215241fb9d1a384ee0944b62a59ea28a0afa 100644 --- a/debug/accuracy_tools/kj600/kj600/features.py +++ b/debug/accuracy_tools/kj600/kj600/features.py @@ -6,6 +6,26 @@ from torch.autograd.functional import jacobian def square_sum(x: torch.tensor): return (x * x).sum() +@torch.no_grad() +def get_min(x: torch.tensor): + return torch.min(x) + + +@torch.no_grad() +def get_max(x: torch.tensor): + return torch.max(x) + + +@torch.no_grad() +def get_zeros(x: torch.tensor, eps: float): + return torch.sum(torch.abs(x) < eps) / x.numel() + +@torch.no_grad() +def get_sign_matches(x: torch.tensor, y:torch.tensor): + xs = x.sign() + ys = y.sign() + same_direction_ratio = ((xs * ys).sum()/ys.numel() + 1)/2 + return same_direction_ratio @torch.no_grad() def eff_rank(param: torch.tensor, threshold=1e-10): diff --git a/debug/accuracy_tools/kj600/kj600/module_hook.py b/debug/accuracy_tools/kj600/kj600/module_hook.py index 233b000f88bbca32b381a2dc7e97922a3776537e..fe009c467041d0333b0017d3bf1e903c9b4c6511 100644 --- a/debug/accuracy_tools/kj600/kj600/module_hook.py +++ b/debug/accuracy_tools/kj600/kj600/module_hook.py @@ -1,25 +1,21 @@ import os import uuid +import json from collections import defaultdict -from typing import List from datetime import datetime +from functools import partial import torch -from torch.nn.modules.module import register_module_forward_hook import torch.distributed as dist from torch.optim.optimizer import register_optimizer_step_pre_hook, register_optimizer_step_post_hook -from torch.utils.tensorboard import SummaryWriter -from kj600.features import square_sum from kj600.module_spec_verifier import get_config, validate_config_spec -from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0 -from kj600.features import eff_rank +from kj600.optimizer_collect import MixPrecsionOptimizerMon, print_rank_0, OptimizerMonFactory, MegatronDistributedOptimizerMon +from kj600.features import eff_rank, get_sign_matches from kj600.visualizer import HeatmapVisualizer - - -def get_summary_writer_tag_name(module_or_param_name:str, tag:str, rank): - if rank is None: - return f"{module_or_param_name}/{tag}" - else: - return f"{module_or_param_name}/{rank}/{tag}" +from kj600.anomaly_detect import AnomalyScanner, SummaryWriterWithAD +from kj600.anomaly_inform import AnomalyInformFactory +from kj600.module_metric import get_metrics, write_metrics_tensorboard, get_summary_writer_tag_name +from kj600.distributed.wrap_distributed import api_register, create_hook +from kj600.utils import print_warn_log, print_info_log, get_param_struct class ModuleHookContext: @@ -47,41 +43,82 @@ class ModuleHookContext: class OptimizerContext: def __init__(self) -> None: self.step = 0 - self.param_gnorm = defaultdict(float) # norm of grad - self.param_exp_avg_norm = defaultdict(float) # norm of expection of gradient average (m_{t-1}) - self.param_exp_avg_sign = defaultdict(int) # sign of expection of gradient average (m_{t-1}) - self.param_mg_direction = defaultdict(float) # ratio of parameters in same direction between g_{t} and m_{t-1} - self.param_exp_avg_sq_norm = defaultdict(float) # norm of expection of gradient square (v_{t-1}) - self.param_effective_rank = 
defaultdict(float) # ratio of parameters above a threshold - self.param_adam_update = defaultdict() # distribution of update (m_t/(v_t**0.5+eps)) - self.param_adam_ratio = defaultdict() # distribution of ratio (m_t/v_t**0.5) + self.param_effective_rank = defaultdict(float) + self.param_mg_direction = defaultdict(float) + self.param_adam_update = defaultdict() + self.param_adam_ratio = defaultdict() + self.param_weight_grad = defaultdict() + self.param_exp_avg = defaultdict() + self.param_exp_avg_sq = defaultdict() + self.metric_list = [] + +class CommunicationContext: + def __init__(self) -> None: + self.indata = {} + self.outdata = {} + + def reset(self): + self.indata = {} + self.outdata = {} class TrainerMon: - + @staticmethod def set_wrapped_optimizer(_wrapped_optimizer): MixPrecsionOptimizerMon.set_wrapped_optimizer(_wrapped_optimizer) - def __init__(self, config_file_path) -> None: + # opt_ty: "Megatron_Float16OptimizerWithFloat16Params" or "Megatron_DistributedOptimizer" + def __init__(self, config_file_path, params_have_main_grad=True, opt_ty=None) -> None: self.module_fwd_hook_context_by_module = defaultdict(ModuleHookContext) self.module_bwd_hook_context_by_module = defaultdict(ModuleHookContext) self.optimizer_context = defaultdict(OptimizerContext) - self.params_have_main_grad = True + self.cc_context = defaultdict(CommunicationContext) + self.params_have_main_grad = params_have_main_grad self.config = get_config(config_file_path) - self.module_rank_list = [int(rank) for rank in self.config.get("module_ranks", "").split(',') if rank.strip()] + self.module_rank_list = self.config.get("module_ranks", []) + self.eps = self.config.get('eps', 1e-8) + self.ops = self.config.get('ops', []) + self.xy_distribution = self.config.get('xy_distribution', False) + if not self.xy_distribution: + print_rank_0("> module input/output input_grad/output_grad is not monitored. ") self.ur_distribution = self.config.get('ur_distribution', False) + if not self.ur_distribution: + print_rank_0("> update vector and ratio vector of adam is not monitored. ") + self.mv_distribution = self.config.get("mv_distribution", False) + if not self.mv_distribution: + print_rank_0("> momentum and variance of adam is not monitored. ") + self.wg_distribution = self.config.get("wg_distribution", False) + if not self.wg_distribution: + print_rank_0("> weight grad of specified module is not monitored. 
") self.mg_direction = self.config.get('mg_direction', False) + if not self.mg_direction: + print_rank_0('> grad and momentum direction will not be compared.') + self.cc_distribution = self.config.get("cc_distribution", {}) + if not self.cc_distribution.get('enable', False): + print_rank_0("> cc operator is not monitored.") + else: + self.cc_codeline = self.cc_distribution.get('cc_codeline', []) + self.cc_log_only = self.cc_distribution.get('cc_log_only', False) + self.cc_logged_stack = defaultdict(set) + api_register.initialize_hook(partial(create_hook, context=self.cc_context, monitor=self)) + api_register.redirect_api() + alert_setting = self.config.get('alert', {"rules":[]}) + self.alert_rules = AnomalyScanner.load_rules(alert_setting["rules"]) + + anomaly_inform = AnomalyInformFactory.create_informer(**alert_setting["inform"]) if "inform" in alert_setting else None + self.optimizer_hooked = False output_base_dir = os.getenv('KJ600_OUTPUT_DIR', './kj600_output') cur_time = datetime.now().strftime('%b%d_%H-%M-%S') unique_id = str(uuid.uuid4())[:8] if dist.is_initialized(): if (dist.get_rank() in self.module_rank_list) or len(self.module_rank_list) == 0: - self.summary_writer = SummaryWriter(os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}")) + self.summary_writer = SummaryWriterWithAD( + os.path.join(output_base_dir, f"{cur_time}-rank{dist.get_rank()}-{unique_id}"), self.alert_rules, anomaly_inform) else: - self.summary_writer = SummaryWriter(os.path.join(output_base_dir, f"{cur_time}-{unique_id}")) + self.summary_writer = SummaryWriterWithAD(os.path.join(output_base_dir, f"{cur_time}-{unique_id}"), self.alert_rules, anomaly_inform) # A HeatmapVisualizer instance is associated with an image self.update_heatmap_visualizer = defaultdict(HeatmapVisualizer) self.ratio_heatmap_visualizer = defaultdict(HeatmapVisualizer) @@ -90,21 +127,39 @@ class TrainerMon: self.param_name_list = [] self.param2name = defaultdict(str) - self.mix_precision_optimizer_mon = MixPrecsionOptimizerMon() + self.mix_precision_optimizer_mon = OptimizerMonFactory.create_optimizer_mon(opt_ty) + if opt_ty is None: + assert not self.ur_distribution, "ur_distribution cannot be enabled with unknown optimizer." + assert not self.mv_distribution, "mv_distribution cannot be enabled with unknown optimizer." 
+ self.print_struct = self.config.get("print_struct", False) + self.struct_printed = False + self.module_struct = {} return - + def __del__(self): if hasattr(self, "summary_writer"): self.summary_writer.close() - def _hook_module(self, target_name:str, module: torch.nn.Module, fwd_or_bkd): - paths = target_name.split('.') + def _smallest_rank_print(self, msg): + if dist.is_initialized(): + if dist.get_rank() == min(self.module_rank_list): + print_info_log(msg) + else: + print_info_log(msg) + + def _hook_module(self, target_names, module: torch.nn.Module, fwd_or_bkd): if '_modules' not in module.__dict__: # nothing to hook return 0 - + def fwd_hook_fun(module, module_input, module_output): - context = self.module_fwd_hook_context_by_module[module] + context: ModuleHookContext = self.module_fwd_hook_context_by_module[module] + if self.print_struct: + self.module_struct[context.module_name].update( + {"input": f"{get_param_struct(module_input)}", "output": f"{get_param_struct(module_output)}"}) + return + if not self.xy_distribution: + return if not context.format_by_arg: context.set_format_by_arg('input', self.config['targets']) context.set_format_by_arg('output', self.config['targets']) @@ -114,22 +169,35 @@ class TrainerMon: context.focused_out_col = validate_config_spec(context.format_by_arg['output'], module_output, context.module_name, 'output') context.verified = True # expect output be tensor type + tbtag_tensor_map = {} if not context.ignore_in: cared_input = module_input if context.focused_in_col is None else module_input[context.focused_in_col] - cared_input_cal_result = square_sum(cared_input) - else: - cared_input_cal_result = None + tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'input', cared_input)) cared_output = module_output if context.focused_out_col is None else module_output[context.focused_out_col] - context.actv.append((cared_input_cal_result, square_sum(cared_output))) + tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'output', cared_output)) + metric_dict = {} + for metric_name in self.ops: + metric_dict[metric_name] = get_metrics(metric_name, tbtag_tensor_map, self.eps) + if context.micro_step == 0 and context.actv: + print_warn_log( + f"actv context of {context.module_name} is not empty when first micro_step, maybe something wrong happened. 
Now clear it.") + context.actv.clear() + context.actv.append(metric_dict) context.micro_step += 1 if context.micro_step == self.micro_batch_number: context.micro_step = 0 context.step += 1 return - + def bwd_hook_fun(module, input_grad, output_grad): - context = self.module_bwd_hook_context_by_module[module] + context: ModuleHookContext = self.module_bwd_hook_context_by_module[module] + if self.print_struct: + self.module_struct[context.module_name].update( + {"input_grad": f"{get_param_struct(input_grad)}", "output_grad": f"{get_param_struct(output_grad)}"}) + return + if not self.xy_distribution: + return if not context.format_by_arg: context.set_format_by_arg('input_grad', self.config['targets']) context.set_format_by_arg('output_grad', self.config['targets']) @@ -138,44 +206,53 @@ class TrainerMon: context.focused_in_col = validate_config_spec(context.format_by_arg['input_grad'], input_grad, context.module_name, 'input_grad') context.focused_out_col = validate_config_spec(context.format_by_arg['output_grad'], output_grad, context.module_name, 'output_grad') context.verified = True + + tbtag_tensor_map = {} if not context.ignore_in: cared_input_grad = input_grad if context.focused_in_col is None else input_grad[context.focused_in_col] - cared_input_grad_cal_result = square_sum(cared_input_grad) if cared_input_grad is not None else torch.tensor(0.) - else: - cared_input_grad_cal_result = None + tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'input_grad', cared_input_grad)) cared_output_grad = output_grad if context.focused_out_col is None else output_grad[context.focused_out_col] - context.actvgrad.append((cared_input_grad_cal_result, square_sum(cared_output_grad))) + tbtag_tensor_map.update(self.build_tbtag_tensor_map(context.module_name, 'output_grad', cared_output_grad)) + metric_dict = {} + for metric_name in self.ops: + metric_dict[metric_name] = get_metrics(metric_name, tbtag_tensor_map, self.eps) + if context.micro_step == 0 and context.actvgrad: + print_warn_log(f"actvgrad context of {context.module_name} is not empty when first micro_step, maybe something wrong happened. Now clear it.") + context.actvgrad.clear() + context.actvgrad.append(metric_dict) + context.micro_step += 1 if context.micro_step == self.micro_batch_number: context.micro_step = 0 context.step += 1 return - + + hooked_count = 0 for name, submodule in module.named_modules(): - if name == target_name: + self.module_struct[name] = {} + if name in target_names: submodule.register_forward_hook(fwd_hook_fun) self.module_fwd_hook_context_by_module[submodule] = ModuleHookContext(name) submodule.register_full_backward_hook(bwd_hook_fun) self.module_bwd_hook_context_by_module[submodule] = ModuleHookContext(name) print_rank_0(f"> {name} is monitored successfully") - return 1 - return 0 + hooked_count += 1 + return hooked_count - def hook_modules(self, model:torch.nn.Module, global_batch_size, dp, micro_batch_size, fwd_or_bkd, params_have_main_grad=True): + def hook_modules(self, model:torch.nn.Module, grad_acc_steps): # fwd=0, bkd=1 # targets is module name list like ["xx.xxx1", "xxx.xxx2"] which can be obtained when first run. 
print_rank_0("> module names:") for name, _ in model.named_modules(): print_rank_0(f"\t{name}") - self.micro_batch_number = global_batch_size // dp // micro_batch_size - + self.micro_batch_number = grad_acc_steps + if not self.module_rank_list or (dist.is_initialized() and dist.get_rank() in self.module_rank_list): - hooked = 0 - for target, _ in self.config['targets'].items(): - hooked += self._hook_module(target, model, fwd_or_bkd=0) - print_rank_0(f"> {hooked} out of {len(self.config['targets'])} are monitored.") + targets = [x for x, _ in model.named_modules()] if self.print_struct else self.config['targets'].keys() + hooked_count = self._hook_module(targets, model, fwd_or_bkd=0) + print_rank_0(f"> {hooked_count} out of {len(self.config['targets'])} are monitored.") else: - return + return if not self.optimizer_hooked: self.optimizer_hooked = True @@ -187,72 +264,131 @@ class TrainerMon: self.param_name_list.append(name) self.param2name[param] = name self.hook_optimizer() - self.params_have_main_grad = params_have_main_grad return + + def build_tbtag_tensor_map(self, module_name, tag, tensor): + metrics = {} + rank = dist.get_rank() if dist.is_initialized() else None + key = get_summary_writer_tag_name(module_name, tag, rank) + if tensor is not None: + metrics[key] = tensor + return metrics + + def generate_param_metrics(self, tag, param_tensor): + metrics = {} + rank = dist.get_rank() if dist.is_initialized() else None + for param, name in self.param2name.items(): + key = get_summary_writer_tag_name(name, tag, rank) + if name not in param_tensor or param_tensor[name] is None: + continue + metrics[key] = param_tensor[name] + return metrics + def generate_cc_metrics(self, cc_name, cc_tensor): + metrics = defaultdict(dict) + rank = dist.get_rank() if dist.is_initialized() else None + for op, tag2tensor in cc_tensor.indata.items(): + for tag, tensor in tag2tensor.items(): + key = get_summary_writer_tag_name(cc_name, tag, rank) + metrics[op].update({key: tensor}) + for op, tag2tensor in cc_tensor.outdata.items(): + for tag, tensor in tag2tensor.items(): + key = get_summary_writer_tag_name(cc_name, tag, rank) + metrics[op].update({key: tensor}) + cc_tensor.reset() + return metrics + + def write_xy_tb(self, step): + if not self.xy_distribution: + return + for _, fwd_context in self.module_fwd_hook_context_by_module.items(): + if not len(fwd_context.actv) == self.micro_batch_number: + print_warn_log(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") + for metric_name in self.ops: + write_metrics_tensorboard(metric_name, self.summary_writer, fwd_context.actv, step) + fwd_context.actv.clear() + + for _, bwd_context in self.module_bwd_hook_context_by_module.items(): + if not len(bwd_context.actvgrad) == self.micro_batch_number: + print_warn_log(f"bwd_context.actvgrad not equal to micro_batch_number: {len(bwd_context.actvgrad)}, {self.micro_batch_number}") + for metric_name in self.ops: + write_metrics_tensorboard(metric_name, self.summary_writer, bwd_context.actvgrad, step) + bwd_context.actvgrad.clear() + def hook_optimizer(self): # in DDP by default use params_have_main_grad def optimizer_pre_step_hook(optimizer, args, kwargs): context = self.optimizer_context[optimizer] + if self.print_struct and not all(value == {} for value in self.module_struct.values()) and not self.struct_printed: + self._smallest_rank_print("> module struct:") + self._smallest_rank_print(json.dumps(self.module_struct, indent=4)) + if not self.cc_log_only: + raise 
Exception("exit after first step when print model struct") + if self.cc_log_only and context.step > 0: + self._smallest_rank_print("> Used communication ops and corresponding stack") + self._smallest_rank_print(json.dumps({k:list(v) for k,v in self.cc_logged_stack.items()}, indent=4)) + raise Exception("exit after first step when print cc stack") + - context.param_exp_avg_norm, context.param_exp_avg_sign, context.param_exp_avg_sq_norm, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv( - optimizer, self.param2name, self.update_heatmap_visualizer, self.ratio_heatmap_visualizer, self.ur_distribution, self.mg_direction) + context.param_exp_avg, context.param_exp_avg_sq, context.param_adam_update, context.param_adam_ratio = self.mix_precision_optimizer_mon.fetch_mv(self, + optimizer, self.param2name) for param, name in self.param2name.items(): - grad = param.main_grad if self.params_have_main_grad else param.grad - context.param_gnorm[name] = grad.detach().norm() if "params_effrank" in self.config and name in self.config["params_effrank"]: context.param_effective_rank[name] = eff_rank(param.detach()) - + grad = param.main_grad if self.params_have_main_grad else param.grad + if grad is None: + print_warn_log(f"grad is None: {name}, maybe something wrong happened.") + continue + if self.wg_distribution: + context.param_weight_grad[name] = grad if self.mg_direction: - if name in context.param_exp_avg_sign: - g_sign = grad.detach().sign() - m_sign = context.param_exp_avg_sign.pop(name) - same_direction_ratio = ((m_sign * g_sign).sum().item()/m_sign.numel() + 1)/2 + if context.step == 0: + same_direction_ratio = torch.tensor(1.) else: - same_direction_ratio = 1 + same_direction_ratio = get_sign_matches(grad, context.param_exp_avg[name]) context.param_mg_direction[name] = same_direction_ratio + tbtag_tensor_map = {} + if self.wg_distribution: + tbtag_tensor_map.update(self.generate_param_metrics('weight_grad', context.param_weight_grad)) + if self.mv_distribution: + tbtag_tensor_map.update(self.generate_param_metrics('exp_avg', context.param_exp_avg)) + tbtag_tensor_map.update(self.generate_param_metrics('exp_avg_sq', context.param_exp_avg_sq)) + if self.mg_direction: + tbtag_tensor_map.update(self.generate_param_metrics('mg_direction', context.param_mg_direction)) + # if not tbtag_tensor_map: + # return + metric_dict = {} + for metric_name in self.ops: + metric_dict[metric_name] = get_metrics(metric_name, tbtag_tensor_map, self.eps) + if self.cc_distribution: + for k, c in self.cc_context.items(): + cc_metrics = self.generate_cc_metrics(k, c) + for op, m in cc_metrics.items(): + metric_dict[op].update(m) + if not metric_dict: + return + context.metric_list.append(metric_dict) return - + def optimizer_post_step_hook(optimizer, args, kwargs): context = self.optimizer_context[optimizer] rank = dist.get_rank() if dist.is_initialized() else None - for _, fwd_context in self.module_fwd_hook_context_by_module.items(): - if not len(fwd_context.actv) == self.micro_batch_number: - raise Exception(f"fwd_context.actv not equal to micro_batch_number: {len(fwd_context.actv)}, {self.micro_batch_number}") - if not fwd_context.ignore_in: - x_norm = sum([x.item() for x, _ in fwd_context.actv]) - self.summary_writer.add_scalar(get_summary_writer_tag_name(fwd_context.module_name, 'input', rank), x_norm, context.step) - y_norm = sum([y.item() for _, y in fwd_context.actv]) - self.summary_writer.add_scalar(get_summary_writer_tag_name(fwd_context.module_name, 'output', rank), 
y_norm, context.step)
-            fwd_context.actv.clear()
-
-        for _, bwd_context in self.module_bwd_hook_context_by_module.items():
-            if not len(bwd_context.actvgrad) == self.micro_batch_number:
-                raise Exception(f"fwd_context.actvgrad not equal to micro_batch_number: {len(fwd_context.actvgrad)}, {self.micro_batch_number}")
-            if not bwd_context.ignore_in:
-                x_grad_norm = sum([x.item() for x, _ in bwd_context.actvgrad])
-                self.summary_writer.add_scalar(get_summary_writer_tag_name(bwd_context.module_name, 'input_grad', rank), x_grad_norm, context.step)
-            y_grad_norm = sum([y.item() for _, y in bwd_context.actvgrad])
-            self.summary_writer.add_scalar(get_summary_writer_tag_name(bwd_context.module_name, 'output_grad', rank), y_grad_norm, context.step)
-            bwd_context.actvgrad.clear()
-
-        for param_name, grad_norm in context.param_gnorm.items():
-            self.summary_writer.add_scalar(get_summary_writer_tag_name(param_name, 'weight_grad', rank), grad_norm.item(), context.step)
-
-        for param_name, exp_avg_norm in context.param_exp_avg_norm.items():
-            self.summary_writer.add_scalar(get_summary_writer_tag_name(param_name, 'exp_avg_norm', rank), exp_avg_norm.item(), context.step)
-        for param_name, exp_avg_sq_norm in context.param_exp_avg_sq_norm.items():
-            self.summary_writer.add_scalar(get_summary_writer_tag_name(param_name, 'exp_avg_sq_norm', rank), exp_avg_sq_norm.item(), context.step)
+
+            self.write_xy_tb(context.step)
+
             if self.ur_distribution:
                 for param_name, _ in context.param_adam_update.items():
                     self.update_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_update', rank), context.step, self.summary_writer)
                 for param_name, _ in context.param_adam_ratio.items():
                     self.ratio_heatmap_visualizer[param_name].visualize(get_summary_writer_tag_name(param_name, 'adam_ratio', rank), context.step, self.summary_writer)
-            if self.mg_direction:
-                for param_name, mg_direction in context.param_mg_direction.items():
-                    self.summary_writer.add_scalar(get_summary_writer_tag_name(param_name, 'adam_mg_direction', rank), mg_direction, context.step)
+
+            for metric_name in self.ops:
+                if not context.metric_list:
+                    break
+                write_metrics_tensorboard(metric_name, self.summary_writer, context.metric_list, context.step)
+            context.metric_list.clear()
             context.step += 1
             return
diff --git a/debug/accuracy_tools/kj600/kj600/module_metric.py b/debug/accuracy_tools/kj600/kj600/module_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..d42749d2be3a230c56aa28a4e983dd6d989c003e
--- /dev/null
+++ b/debug/accuracy_tools/kj600/kj600/module_metric.py
@@ -0,0 +1,125 @@
+import math
+import statistics
+
+from kj600.features import square_sum, get_max, get_min, get_zeros
+
+
+def get_summary_writer_tag_name(module_or_param_name:str, tag:str, rank):
+    if rank is None:
+        return f"{module_or_param_name}/{tag}"
+    else:
+        return f"{module_or_param_name}/{rank}/{tag}"
+
+
+# Registry of all Metric implementation classes
+config_metric_registry = {}
+
+
+def register_config_metric(key, cls=None):
+    """Decorator that registers a Metric implementation class."""
+    if cls is None:
+        # Called without a class: return the decorator function itself
+        return lambda cls: register_config_metric(key, cls)
+    config_metric_registry[key] = cls
+    return cls
+
+
+class Metric(object):
+    @staticmethod
+    def get_metric_value(tensor, eps):
+        pass
+
+    def get_metrics(self, tag2tensor: dict, eps):
+        metrics_dict = {}
+        for tag, tensor in tag2tensor.items():
+            metrics_dict[tag] = self.get_metric_value(tensor, eps)
+        return metrics_dict
+
+    @staticmethod
+    def metric_tensorboard(metric_name, summary_writer, metric_value, step):
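+        # Overridden by registered metrics: aggregates the per-micro-step values
+        # collected in metric_value and writes one scalar per tag to TensorBoard.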
        pass
+
+
+@register_config_metric("min")
+class MinMetric(Metric):
+    @staticmethod
+    def get_metric_value(tensor, eps):
+        return get_min(tensor)
+
+    @staticmethod
+    def metric_tensorboard(metric_name, summary_writer, metric_value, step):
+        for key in metric_value[0][metric_name].keys():
+            min_value = min([item[metric_name][key].item() for item in metric_value])
+            summary_writer.add_scalar(f'{key}_min', min_value, step)
+
+
+@register_config_metric("max")
+class MaxMetric(Metric):
+    @staticmethod
+    def get_metric_value(tensor, eps):
+        return get_max(tensor)
+
+    @staticmethod
+    def metric_tensorboard(metric_name, summary_writer, metric_value, step):
+        for key in metric_value[0][metric_name].keys():
+            max_value = max([item[metric_name][key].item() for item in metric_value])
+            summary_writer.add_scalar(f'{key}_max', max_value, step)
+
+
+@register_config_metric("norm")
+class NormMetric(Metric):
+    @staticmethod
+    def get_metric_value(tensor, eps):
+        return square_sum(tensor)
+
+    @staticmethod
+    def metric_tensorboard(metric_name, summary_writer, metric_value, step):
+        for key in metric_value[0][metric_name].keys():
+            norm_value = math.sqrt(sum([item[metric_name][key].item() for item in metric_value]))
+            summary_writer.add_scalar(f'{key}_norm', norm_value, step)
+
+
+@register_config_metric("zeros")
+class ZerosMetric(Metric):
+    @staticmethod
+    def get_metric_value(tensor, eps):
+        return get_zeros(tensor, eps)
+
+    @staticmethod
+    def metric_tensorboard(metric_name, summary_writer, metric_value, step):
+        for key in metric_value[0][metric_name].keys():
+            zeros_value = statistics.mean([item[metric_name][key].item() for item in metric_value])
+            summary_writer.add_scalar(f'{key}_zeros', zeros_value, step)
+
+
+@register_config_metric("id")
+class IdentMetric(Metric):
+    @staticmethod
+    def get_metric_value(tensor, eps):
+        if tensor.dim() != 0:
+            return None
+        return tensor
+
+    @staticmethod
+    def metric_tensorboard(metric_name, summary_writer, metric_value, step):
+        # metric_value is a list with one entry per micro step; each entry maps metric_name -> {parameter name: scalar tensor}
+        if len(metric_value) == 1:
+            for key, value in metric_value[0][metric_name].items():
+                if not value:
+                    continue
+                summary_writer.add_scalar(f'{key}_identical', value.item(), step)
+
+
+def get_metrics(metric_name, tag2tensor, eps):
+    try:
+        fun_metric = config_metric_registry[metric_name]
+        return fun_metric().get_metrics(tag2tensor, eps)
+    except KeyError as e:
+        raise ValueError(f"Unsupported metric '{metric_name}'; supported metrics: {list(config_metric_registry.keys())}") from e
+
+
+def write_metrics_tensorboard(metric_name, summary_writer, metric_value, step):
+    try:
+        fun_metric = config_metric_registry[metric_name]
+        return fun_metric.metric_tensorboard(metric_name, summary_writer, metric_value, step)
+    except KeyError as e:
+        raise ValueError(f"Unsupported metric '{metric_name}'; supported metrics: {list(config_metric_registry.keys())}") from e
diff --git a/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py b/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py
index cfd1ebc6ffc56d6d821507f062704d62501723db..395aa82f17a87cdf742a8294e29ccb1c32081200 100644
--- a/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py
+++ b/debug/accuracy_tools/kj600/kj600/module_spec_verifier.py
@@ -2,9 +2,11 @@ import json
 import re
 import abc
 import torch
+from kj600.utils import check_file_valid_readable


 def get_config(file_path='config.json'):
+    check_file_valid_readable(file_path)
     with open(file_path, 'r') as file:
         config = 
json.load(file) return config diff --git a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py index 44f478416cc30054d566d2167426095eed941210..dfb473ca074f135d809e32c85a8fc9b4047da4d3 100644 --- a/debug/accuracy_tools/kj600/kj600/optimizer_collect.py +++ b/debug/accuracy_tools/kj600/kj600/optimizer_collect.py @@ -22,20 +22,10 @@ class MixPrecsionOptimizerMon: def __init__(self) -> None: self.fp16_to_fp32_param = {} - - # parameter tensors we want to monitor and their names are in params2name_dict - # base_optimizer is pytorch optimizer, wrapped_optimizer is a normal object with base_optimizer - def fetch_mv(self, torch_opt, params2name, update_heatmap_visualizer, ratio_heatmap_visualizer, ur_distribution, mg_direction): - mix_prec_opt = MixPrecsionOptimizerMon.wrapped_optimizer - - if not self.fp16_to_fp32_param and mix_prec_opt is not None: - for fp16_group, fp32_group in zip(mix_prec_opt.float16_groups, mix_prec_opt.fp32_from_float16_groups): - for fp16_param, fp32_param in zip(fp16_group, fp32_group): - self.fp16_to_fp32_param[fp16_param] = fp32_param - exp_avg_norm_dict = defaultdict(float) - exp_avg_sign_dict = defaultdict(int) - exp_avg_sq_norm_dict = defaultdict(float) + def _fetch_mv_in_adam(self, params2name, torch_opt, monitor): + exp_avg_dict = defaultdict(float) + exp_avg_sq_dict = defaultdict(float) update_dict = defaultdict() ratio_dict = defaultdict() @@ -46,16 +36,52 @@ class MixPrecsionOptimizerMon: if param in torch_opt.state: exp_avg = torch_opt.state[param]["exp_avg"] exp_avg_sq = torch_opt.state[param]["exp_avg_sq"] - exp_avg_norm = exp_avg.detach().norm() - exp_avg_sq_norm = exp_avg_sq.detach().norm() - exp_avg_norm_dict[name] = exp_avg_norm - exp_avg_sq_norm_dict[name] = exp_avg_sq_norm - if mg_direction: - exp_avg_sign_dict[name] = exp_avg.detach().sign() - if ur_distribution: + if monitor.mv_distribution: + exp_avg_dict[name] = exp_avg + exp_avg_sq_dict[name] = exp_avg_sq + if monitor.mg_direction: + exp_avg_dict[name] = exp_avg + if monitor.ur_distribution: update_dict[name] = exp_avg / (torch.sqrt(exp_avg_sq) + torch_opt.defaults['eps']) ratio_dict[name] = exp_avg / torch.sqrt(exp_avg_sq) - update_heatmap_visualizer[name].pre_cal(update_dict[name]) - ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) - - return exp_avg_norm_dict, exp_avg_sign_dict, exp_avg_sq_norm_dict, update_dict, ratio_dict + monitor.update_heatmap_visualizer[name].pre_cal(update_dict[name]) + monitor.ratio_heatmap_visualizer[name].pre_cal(ratio_dict[name]) + return exp_avg_dict, exp_avg_sq_dict, update_dict, ratio_dict + + # parameter tensors we want to monitor and their names are in params2name_dict + # base_optimizer is pytorch optimizer, wrapped_optimizer is a normal object with base_optimizer + def fetch_mv(self, monitor, torch_opt, params2name): + mix_prec_opt = MixPrecsionOptimizerMon.wrapped_optimizer + + if not self.fp16_to_fp32_param and mix_prec_opt is not None: + for fp16_group, fp32_group in zip(mix_prec_opt.float16_groups, mix_prec_opt.fp32_from_float16_groups): + for fp16_param, fp32_param in zip(fp16_group, fp32_group): + self.fp16_to_fp32_param[fp16_param] = fp32_param + return self._fetch_mv_in_adam(params2name, torch_opt, monitor) + +class MegatronDistributedOptimizerMon(MixPrecsionOptimizerMon): + def fetch_mv(self, monitor, torch_opt, params2name): + mix_prec_opt = MixPrecsionOptimizerMon.wrapped_optimizer + assert hasattr(mix_prec_opt, "model_float16_groups") and hasattr(mix_prec_opt, 
"shard_fp32_from_float16_groups"), \ + "megatron distributed optimizer should have model_float16_groups and shard_fp32_from_float16_groups, if not, please check megatron-lm version" + if not self.fp16_to_fp32_param and mix_prec_opt is not None: + for fp16_group, shard_fp32_group in zip(mix_prec_opt.model_float16_groups, mix_prec_opt.shard_fp32_from_float16_groups): + for fp16_param, shard_fp32_param in zip(fp16_group, shard_fp32_group): + self.fp16_to_fp32_param[fp16_param] = shard_fp32_param + + return self._fetch_mv_in_adam(params2name, torch_opt, monitor) + +class DummyOptimizerMon(MixPrecsionOptimizerMon): + def fetch_mv(self, monitor, torch_opt, params2name): + return None, None, None, None + +class OptimizerMonFactory: + @staticmethod + def create_optimizer_mon(opt_ty:str): + if opt_ty == "Megatron_Float16OptimizerWithFloat16Params": + return MixPrecsionOptimizerMon() + if opt_ty == "Megatron_DistributedOptimizer": + return MegatronDistributedOptimizerMon() + if opt_ty == None or opt_ty == "unknown": + return DummyOptimizerMon() + assert opt_ty != None, "opt_ty should be Megatron_Float16OptimizerWithFloat16Params or Megatron_DistributedOptimizer or None or unknown" \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_1.json b/debug/accuracy_tools/kj600/kj600/unittest/config_1.json deleted file mode 100644 index a3b10f731d10b64a8b2df703079f9c56080876eb..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/kj600/kj600/unittest/config_1.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "targets": { - "fc": {"input": "tuple[1]:0", "output": "tensor", "input_grad":"tuple[1]:0", "output_grad":"tuple[1]:0"}, - "relu": {"input": "tuple[1]:0", "output": "tensor", "input_grad":"tuple[1]:0", "output_grad":"tuple[1]:0"} - }, - "ur_distribution": true, - "mg_direction": true -} \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_features.py b/debug/accuracy_tools/kj600/kj600/unittest/test_features.py deleted file mode 100644 index bc8c6dd71ab4e0bf708cf3d97d02dab3a2ded9cc..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_features.py +++ /dev/null @@ -1,33 +0,0 @@ -import unittest -import torch -import torch.nn as nn -import torch_npu -from kj600.features import eff_rank - - -class TestFeatureCalculation(unittest.TestCase): - def test_effective_rank(self): - param = torch.randn(10, 10).npu() - rank = eff_rank(param) - self.assertTrue(rank.item() >= 1) - - def test_lambda_max(self): - pass - # input_dim = 10 - # hidden_dim = 100 - # output_dim = 1 - # num_samples = 100 - # X = torch.randn(num_samples, input_dim) - # network = nn.Sequential( - # nn.Linear(input_dim, hidden_dim), - # nn.ReLU(), - # nn.Linear(hidden_dim, output_dim) - # ) - # Y = network(X) - # Y.backward() - # for name, param in network.named_parameters(): - # lm = lambda_max(param) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py b/debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py deleted file mode 100644 index f077fc7004dafddc0836300d5e0ffc19d1ed3d06..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py +++ /dev/null @@ -1,78 +0,0 @@ -import argparse -import torch_npu -import torch -import torch.nn.functional as F -from kj600.module_hook import TrainerMon # Modify PYTHONPATH to import TrainerMon -#from hook_api import reg_grad_hook, 
\ No newline at end of file
diff --git a/debug/accuracy_tools/kj600/kj600/unittest/config_1.json b/debug/accuracy_tools/kj600/kj600/unittest/config_1.json
deleted file mode 100644
index a3b10f731d10b64a8b2df703079f9c56080876eb..0000000000000000000000000000000000000000
--- a/debug/accuracy_tools/kj600/kj600/unittest/config_1.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-    "targets": {
-        "fc": {"input": "tuple[1]:0", "output": "tensor", "input_grad":"tuple[1]:0", "output_grad":"tuple[1]:0"},
-        "relu": {"input": "tuple[1]:0", "output": "tensor", "input_grad":"tuple[1]:0", "output_grad":"tuple[1]:0"}
-    },
-    "ur_distribution": true,
-    "mg_direction": true
-}
\ No newline at end of file
diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_features.py b/debug/accuracy_tools/kj600/kj600/unittest/test_features.py
deleted file mode 100644
index bc8c6dd71ab4e0bf708cf3d97d02dab3a2ded9cc..0000000000000000000000000000000000000000
--- a/debug/accuracy_tools/kj600/kj600/unittest/test_features.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import unittest
-import torch
-import torch.nn as nn
-import torch_npu
-from kj600.features import eff_rank
-
-
-class TestFeatureCalculation(unittest.TestCase):
-    def test_effective_rank(self):
-        param = torch.randn(10, 10).npu()
-        rank = eff_rank(param)
-        self.assertTrue(rank.item() >= 1)
-
-    def test_lambda_max(self):
-        pass
-        # input_dim = 10
-        # hidden_dim = 100
-        # output_dim = 1
-        # num_samples = 100
-        # X = torch.randn(num_samples, input_dim)
-        # network = nn.Sequential(
-        #     nn.Linear(input_dim, hidden_dim),
-        #     nn.ReLU(),
-        #     nn.Linear(hidden_dim, output_dim)
-        # )
-        # Y = network(X)
-        # Y.backward()
-        # for name, param in network.named_parameters():
-        #     lm = lambda_max(param)
-
-
-if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
diff --git a/debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py b/debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py
deleted file mode 100644
index f077fc7004dafddc0836300d5e0ffc19d1ed3d06..0000000000000000000000000000000000000000
--- a/debug/accuracy_tools/kj600/kj600/unittest/test_module_hook.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import argparse
-import torch_npu
-import torch
-import torch.nn.functional as F
-from kj600.module_hook import TrainerMon  # Modify PYTHONPATH to import TrainerMon
-#from hook_api import reg_grad_hook, reg_grad_one_hook, reg_module_backward_hook, reg_module_forward_hook
-#from torch.cuda.amp import GradScaler
-
-from torch.npu.amp import GradScaler
-
-
-# from ptdbg_ascend import PrecisionDebugger as PD
-# from monitor import GradientMonitor
-
-print(torch_npu.__version__)
-
-#debugger = PD(dump_path="./dump/", hook_name="dump", step=[1, 2, 3], enable_dataloader=False)
-#debugger.configure_hook(mode="list", scope=["optim_Adam_step"], )
-
-parser = argparse.ArgumentParser(prog="kj600 debug", description="kj600 sample code", epilog="")
-parser.add_argument("-o", "--out_dir", type=str, default=".")
-args = parser.parse_args()
-DTYPE = torch.float32
-
-
-class Model(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.fc = torch.nn.Linear(784, 10, dtype=DTYPE)
-        self.relu = torch.nn.ReLU()
-
-    def forward(self, x):
-        return self.relu(self.fc(x).type(DTYPE))
-
-npu = torch.device('npu:0')
-net = Model().to(device=npu)
-
-config = {
-    "targets": {
-        "fc": {"input": "tuple[2]:0", "output": "tensor::"},
-        "relu": {"input": "..", "output": ".."}
-    }
-}
-# reg_grad_hook(net, hook_factory=hook_factory, config=config)
-# reg_grad_one_hook(net, hook=monitor_hook, config=config)
-# net.fc.register_forward_hook(get_actv_hook("fc"))
-# reg_module_forward_hook(net, module_fwd_hook, config)
-# reg_module_backward_hook(net, module_bwd_hook, config)
-optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
-
-hooker = TrainerMon('./kj600/unittest/config_1.json')
-hooker.hook_modules(model=net, global_batch_size=2, dp=1, micro_batch_size=2, fwd_or_bkd=0, params_have_main_grad=False)
-# hooker.hook_optimizer(optimizer)
-
-
-class ToyDataset(torch.utils.data.Dataset):
-    def __init__(self):
-        self.data = torch.randn(16, 784, dtype=DTYPE, requires_grad=True)
-        self.labels = torch.randint(low=0, high=9, size=(16,))
-
-    def __len__(self):
-        return len(self.labels)
-
-    def __getitem__(self, idx):
-        return self.data[idx].to(npu), self.labels[idx].to(npu)
-
-train_ds = ToyDataset()
-train_loader = torch.utils.data.DataLoader(train_ds, shuffle=True, batch_size=2)
-
-
-# scaler = GradScaler()
-for (inputs, labels) in train_loader:
-    optimizer.zero_grad()
-    outputs = net(inputs)
-    loss = F.cross_entropy(outputs, labels)
-
-    loss.backward()
-    optimizer.step()
diff --git a/debug/accuracy_tools/kj600/kj600/utils.py b/debug/accuracy_tools/kj600/kj600/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..53d47d9988647202bdb711afde38b94b51899b5a
--- /dev/null
+++ b/debug/accuracy_tools/kj600/kj600/utils.py
@@ -0,0 +1,110 @@
+import os
+import time
+import sys
+import re
+
+FILE_MAX_SIZE = 10 * 1024 * 1024 * 1024
+FILE_NAME_MAX_LENGTH = 255
+DIRECTORY_MAX_LENGTH = 4096
+FILE_NAME_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$"
+
+def _print_log(level, msg, end='\n'):
+    current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
+    pid = os.getpid()
+    print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg, end=end)
+    sys.stdout.flush()
+
+
+def print_info_log(info_msg, end='\n'):
+    """
+    Function Description:
+        print info log.
+    Parameter:
+        info_msg: the info message.
+    """
+    _print_log("INFO", info_msg, end=end)
+
+
+def print_error_log(error_msg):
+    """
+    Function Description:
+        print error log.
+    Parameter:
+        error_msg: the error message.
+    """
+    _print_log("ERROR", error_msg)
+
+
+def print_warn_log(warn_msg):
+    """
+    Function Description:
+        print warn log.
+    Parameter:
+        warn_msg: the warning message.
+    """
+    _print_log("WARNING", warn_msg)
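+
+
+# Illustrative usage sketch (comments only, not executed by the tool): validate
+# a config file before reading it. check_file_valid_readable (defined below)
+# raises RuntimeError for soft links, over-long or special-character paths,
+# oversized, missing or unreadable files.
+#
+#     from kj600.utils import check_file_valid_readable, print_warn_log
+#
+#     try:
+#         check_file_valid_readable("config.json")
+#     except RuntimeError as err:
+#         print_warn_log(f"config.json rejected: {err}")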
+ """ + _print_log("WARNING", warn_msg) + +def get_param_struct(param): + if isinstance(param, tuple): + return f"tuple[{len(param)}]" + if isinstance(param, list): + return f"list[{len(param)}]" + return "tensor" + +def check_link(path): + abs_path = os.path.abspath(path) + if os.path.islink(abs_path): + raise RuntimeError("The path is a soft link.") + + +def check_path_length(path, name_length_limit=None): + file_max_name_length = name_length_limit if name_length_limit else FILE_NAME_MAX_LENGTH + if len(path) > DIRECTORY_MAX_LENGTH or \ + len(os.path.basename(path)) > file_max_name_length: + raise RuntimeError("The file path length exceeds limit.") + + +def check_path_pattern_vaild(path): + if not re.match(FILE_NAME_VALID_PATTERN, path): + raise RuntimeError("The file path contains special characters.") + + +def check_path_readability(path): + if not os.access(path, os.R_OK): + raise RuntimeError("The file path is not readable.") + + +def check_path_writability(path): + if not os.access(path, os.W_OK): + raise RuntimeError("The file path is not writable.") + + +def check_file_size(file_path, max_size=FILE_MAX_SIZE): + file_size = os.path.getsize(file_path) + if file_size >= max_size: + raise RuntimeError("The file size excess limit.") + + +def check_path_exists(path): + if not os.path.exists(path): + raise RuntimeError("The file path does not exist.") + + +def check_file_valid(path): + check_path_exists(path) + check_link(path) + real_path = os.path.realpath(path) + check_path_length(real_path) + check_path_pattern_vaild(real_path) + check_file_size(real_path) + + +def check_file_valid_readable(path): + check_file_valid(path) + check_path_readability(path) + + +def check_file_valid_writable(path): + check_file_valid(path) + check_path_writability(path) + \ No newline at end of file diff --git "a/debug/accuracy_tools/kj600/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md" "b/debug/accuracy_tools/kj600/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md" new file mode 100644 index 0000000000000000000000000000000000000000..90461fa5c86a822f0b3db9b984b7598eb681259c --- /dev/null +++ "b/debug/accuracy_tools/kj600/\350\256\255\347\273\203\347\212\266\346\200\201\347\233\221\346\216\247\345\267\245\345\205\267\346\200\247\350\203\275\345\237\272\347\272\277.md" @@ -0,0 +1,52 @@ +# ptdbg_ascend精度工具标准性能基线报告 + +## 环境信息 + +NPU:Atlas A2 训练系列产品 + +CPU: + +![输入图片说明](img/cpu_info.png) + +Torch:2.1.0 + +CANN:8.0.RC2 + +除上述环境信息影响性能外,被检控的模块的数量和结构会对性能产生影响,因此本次选取典型网络进行测试,并且选取耗时稳定后的步数进行测试。工具输出键小,对内存无要求。 + +## 模型信息和性能基线 + +以下场景的性能基线测试数据均为多次测试后取平均值,因此实际运行时性能数据可能会根据环境状态稍有浮动。 + +### LLAMA2-13B + +主要数据类型:BFLOAT16 + +模型层数:40 + +配置文件(采了10层): +``` +{ + "targets": { + "language_model.encoder.layers.0": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"}, + "language_model.encoder.layers.1": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"}, + "language_model.encoder.layers.2": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"}, + "language_model.encoder.layers.3": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"}, + "language_model.encoder.layers.4": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", 
"output_grad":"tuple[1]:0"}, + "language_model.encoder.layers.5": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"}, + "language_model.encoder.layers.6": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"}, + "language_model.encoder.layers.7": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"}, + "language_model.encoder.layers.8": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"}, + "language_model.encoder.layers.9": {"input": "tuple[2]:0", "output": "tensor", "input_grad":"tuple[2]:0", "output_grad":"tuple[1]:0"} + }, + "module_ranks": "0" +} +``` + +启动命令参数:python3 -u pretrain_gpt.py --local-rank=1 --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --sequence-parallel --num-layers 40 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --tokenizer-type Llama2Tokenizer --tokenizer-model /new_data/LLM/checkpoint_origin/llama2-13b-hf/tokenizer.model --seq-length 4096 --max-position-embeddings 4096 --micro-batch-size 2 --global-batch-size 16 --make-vocab-size-divisible-by 1 --lr 1e-6 --train-iters 5000 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --swiglu --use-flash-attn --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1e-8 --weight-decay 1e-1 --lr-warmup-fraction 0.01 --clip-grad 1.0 --adam-beta1 0.9 --initial-loss-scale 4096 --adam-beta2 0.95 --no-gradient-accumulation-fusion --load /data/LLM/checkpoint_magatron/llama2_13b_tp1_pp8 --no-load-optim --no-load-rng --use-fused-swiglu --use-fused-rotary-pos-emb --use-mc2 --bf16 --data-path /data/LLM/data_modellink/llama2_13b/alpaca_text_document --split 949,50,1 --log-interval 1 --save-interval 10000 --eval-interval 1000 --eval-iters 10 --distributed-backend nccl --save ./ckpt + +不加工具原始耗时:**4s** + +加工具后单卡耗时:**4.25s** + +加工具后多卡耗时:**4.35s** diff --git a/debug/accuracy_tools/ptdbg_ascend/README.md b/debug/accuracy_tools/ptdbg_ascend/README.md index 4702d9229055c88a78e97ab5c59de8dbac15eb85..2fcbf89f29c65978cd941651fa0ec290b95f90b7 100644 --- a/debug/accuracy_tools/ptdbg_ascend/README.md +++ b/debug/accuracy_tools/ptdbg_ascend/README.md @@ -1,5 +1,9 @@ # **PyTorch精度工具** +## 版本过渡提示 + +当前版本ptdbg维护到2024/09/30,准备于2024/09/30下线,相关目录att/debug/accuracy_tools/ptdbg_ascend将于2024/09/30删除。新版本ptdbg已经合到att/debug/accuracy_tools/atat目录下。 + ## 快速安装 进行PyTorch精度比对需要将ptdbg_ascend精度工具分别安装在CPU或GPU环境以及NPU环境下。 @@ -10,6 +14,7 @@ | ptdbg_ascend版本 | 发布日期 | 支持PyTorch版本 | 下载链接 | 参考指南 | 校验码 | | ---------------- | ---------- | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | + | 6.0.T4 | 2024-06-11 | 1.11.0/2.0/2.1/2.2 | [ptdbg_ascend-v6.0.T4-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/6.0/ptdbg_ascend-6.0.T4-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v6.0.T4](doc/ptdbg_ascend精度工具功能说明_v6.0.T3.md) | 138d78497476c10b1b27239814bdfb5ce78ea8c01a8544a95fffbf10fb166221 | | 6.0.T3 | 2024-05-25 | 1.11.0/2.0/2.1/2.2 | [ptdbg_ascend-v6.0.T3-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/6.0/ptdbg_ascend-6.0.T3-py3-none-any.whl) | 
[ptdbg_ascend精度工具功能说明_v6.0.T3](doc/ptdbg_ascend精度工具功能说明_v6.0.T3.md) | f417f18e3ff52d2e15f9cadeea9931017bf9521b4f34fb657e013cead6c6bd31 | | 6.0.T2 | 2024-05-09 | 1.11.0/2.0/2.1/2.2 | [ptdbg_ascend-v6.0.T2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/6.0/ptdbg_ascend-6.0.T2-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v6.0.T2](doc/ptdbg_ascend精度工具功能说明_v6.0.T2.md) | ca173e73d3908aa69cb10c8a1bb4e2b38f6488d3ceb5cca2877cae1500c7729d | | 6.0.T1 | 2024-04-25 | 1.11.0/2.0/2.1/2.2 | [ptdbg_ascend-v6.0.T1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/6.0/ptdbg_ascend-6.0.T1-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v6.0.T1](doc/ptdbg_ascend精度工具功能说明_v6.0.T1.md) | 40aeaad94c8d446b5e3229989527fad0715ea9d103cf46305832ee21d362ae50 | @@ -121,6 +126,7 @@ ptdbg_ascend精度工具的安装方式包括:**下载whl包安装**和**源 | ptdbg_ascend版本 | 发布日期 | 支持PyTorch版本 | 下载链接 | 参考指南 | 校验码 | | ---------------- | ---------- | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | + | 6.0.T4 | 2024-06-11 | 1.11.0/2.0/2.1/2.2 | [ptdbg_ascend-v6.0.T4-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/6.0/ptdbg_ascend-6.0.T4-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v6.0.T4](doc/ptdbg_ascend精度工具功能说明_v6.0.T3.md) | 138d78497476c10b1b27239814bdfb5ce78ea8c01a8544a95fffbf10fb166221 | | 6.0.T3 | 2024-05-25 | 1.11.0/2.0/2.1/2.2 | [ptdbg_ascend-v6.0.T3-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/6.0/ptdbg_ascend-6.0.T3-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v6.0.T3](doc/ptdbg_ascend精度工具功能说明_v6.0.T3.md) | f417f18e3ff52d2e15f9cadeea9931017bf9521b4f34fb657e013cead6c6bd31 | | 6.0.T2 | 2024-05-09 | 1.11.0/2.0/2.1/2.2 | [ptdbg_ascend-v6.0.T2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/6.0/ptdbg_ascend-6.0.T2-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v6.0.T2](doc/ptdbg_ascend精度工具功能说明_v6.0.T2.md) | ca173e73d3908aa69cb10c8a1bb4e2b38f6488d3ceb5cca2877cae1500c7729d | | 6.0.T1 | 2024-04-25 | 1.11.0/2.0/2.1/2.2 | [ptdbg_ascend-v6.0.T1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/6.0/ptdbg_ascend-6.0.T1-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v6.0.T1](doc/ptdbg_ascend精度工具功能说明_v6.0.T1.md) | 40aeaad94c8d446b5e3229989527fad0715ea9d103cf46305832ee21d362ae50 | diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/img/compare_result.png b/debug/accuracy_tools/ptdbg_ascend/doc/img/compare_result.png new file mode 100644 index 0000000000000000000000000000000000000000..e3a721036335ddaf17dc44dc4ae6fb5b27915979 Binary files /dev/null and b/debug/accuracy_tools/ptdbg_ascend/doc/img/compare_result.png differ diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/img/compare_result_pkl.png b/debug/accuracy_tools/ptdbg_ascend/doc/img/compare_result_pkl.png index c64e9380c6d9c01bb2ad18c81e430ead0800bb7d..863708bf6daf46985328f0dc42d48f0a5b849af5 100644 Binary files a/debug/accuracy_tools/ptdbg_ascend/doc/img/compare_result_pkl.png and b/debug/accuracy_tools/ptdbg_ascend/doc/img/compare_result_pkl.png differ diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v6.0.T4.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v6.0.T4.md" new file mode 100644 index 
0000000000000000000000000000000000000000..af73a56849588c1a080962f00249700aee9a3630 --- /dev/null +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v6.0.T4.md" @@ -0,0 +1,2301 @@ +# **PyTorch精度工具使用指南** + +本文主要介绍PyTorch精度工具ptdbg_ascend的使用以及精度比对场景示例。 + +ptdbg_ascend工具的原理及安装请参见《[PyTorch精度工具](https://gitee.com/ascend/att/blob/master/debug/accuracy_tools/ptdbg_ascend/README.md)》。 + +ptdbg_ascend工具主要支持PyTorch API精度数据dump、溢出检测、精度比对以及parse数据解析功能。其中dump和溢出检测功能支持使用debugger和register_hook方式进行精度数据的dump和溢出检测,推荐使用debugger方式。 + +## PyTorch精度比对总体流程 + +1. 准备CPU或GPU训练工程。 + +2. 在环境下安装ptdbg_ascend工具。 + +3. 在训练脚本内插入ptdbg_ascend工具dump接口。 + +4. 执行训练dump数据。 + +5. 将CPU或GPU训练工程迁移为NPU训练工程。 + + 请参见《[PyTorch模型迁移和训练指南](https://www.hiascend.com/document/detail/zh/canncommercial/63RC1/modeldevpt/ptmigr/ptmigr_0001.html)》。 + +6. 在NPU环境下安装ptdbg_ascend工具。 + +7. 在NPU训练脚本内插入ptdbg_ascend工具dump接口。 + +8. NPU环境下执行训练dump数据。 + +9. 创建并配置精度比对脚本,例如compare.py。 + +10. 执行CPU或GPU dump与NPU dump数据的精度比对。 + +11. 比对结果分析。 + +## 快速入门(debugger方式) + +本章节主要介绍通过ptdbg_ascend工具进行精度比对和分析,主要使用“**debugger方式dump和溢出检测**”和“**CPU或GPU与NPU精度数据比对**”章节中介绍的ptdbg_ascend工具接口。 + +### 单卡场景精度比对 + +**精度分析建议** + +PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: + +1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 + + 对于模型数据庞大(比如达到T级别)的场景,不推荐直接dump整网比对,整网dump可能导致磁盘不足,需要预留足够的存储空间或者分多次dump。 + +2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 + +3. 范围比对:对不符合精度标准的API重新dump详细信息。 + +4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 + +5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 + +6. 重复1~5步,直到不存在精度问题为止。 + +**精度分析示例** + +1. dump整网数据。 + + 分别dump CPU或GPU以及NPU数据,在PyTorch训练脚本插入dump接口,示例代码如下(下面以NPU为例,CPU或GPU dump基本相同): + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./npu_dump", hook_name="dump", step=[0]) + debugger.configure_hook(mode="api_stack") + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + +2. 比对整网数据。 + + 第1步中的NPU dump数据目录为npu_dump,假设GPU dump数据目录为gpu_dump;dump将生成pkl数据文件api_stack_dump.pkl和npy数据目录api_stack_dump。 + + 创建并配置精度比对脚本,以创建compare.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + dump_result_param={ + "npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", + "bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", + "npu_dump_data_dir": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", + "bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", + "is_print_compare_log": True + } + compare(dump_result_param, "./output", stack_mode=True) + ``` + + 执行比对: + + ```bash + python3 compare.py + ``` + + 在output目录下生成结果文件,包括:`compare_result_{timestamp}.csv`和`advisor_{timestamp}.txt` + +3. 找出存在问题的API。 + + 1. 根据`advisor_{timestamp}.txt`或打屏信息的提示,可找到存在精度问题的算子(Suspect Nodes)和专家建议(Expert Advice)。 + + ![auto_analyze_log](img/auto_analyze_log.png) + + 2. 根据第2步结果文件`compare_result_{timestamp}.csv`中的Accuracy Reached or No字段显示为NO的API,针对该API执行后续比对操作,分析该API存在的精度问题。 + +4. 
(可选)提取指定API的堆栈信息和dump数据统计信息。 + + 通过parse接口可以清晰的显示特定API的堆栈信息和dump数据统计信息,结合堆栈信息分析代码中可能存在的精度问题。 + + 创建并配置提取脚本,以创建parse.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + + # 提取dump信息中第1次调用的API:Torch.batch.normal的堆栈信息及数据统计信息 + parse("./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", "Torch.batch.normal.1.forward") + ``` + + 执行提取: + + ```bash + python3 parse.py + ``` + + + +5. (可选)指定API对其底层ACL数据进行dump。 + + - dump指定前向API的ACL级别数据 + + ```python + debugger = PrecisionDebugger(dump_path="./npu_dump", hook_name="dump", step=[0]) + debugger.configure_hook(mode="acl", scope=["Tensor.permute.1.forward"], acl_config='./dump.json') + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + - dump指定反向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./npu_dump", hook_name="dump", step=[0]) + # dump指定反向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + debugger.configure_hook(mode="acl", scope=["Functional.conv2d.1.backward"], acl_config="./dump.json", backward_input=["./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump/Functional.conv2d.1.backward_input.0.npy"]) + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + +6. (可选)重新比对。 + + 根据第4或5步的dump数据重新配置compare.py并执行比对,可以对单API模型进行问题复现。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 + +### 溢出检测场景 + +溢出检测是针对NPU的PyTorch API,检测是否存在溢出的情况。当前仅支持识别aicore浮点溢出。 + +溢出检测原理:针对溢出阶段,开启acl dump模式,重新对溢出阶段执行,落盘数据。 + +建议按照如下步骤操作: + +1. 在NPU环境下安装ptdbg_ascend工具。 + +2. 在NPU训练脚本内插入ptdbg_ascend工具溢出检测接口。 + + - 示例1:全量溢出检测 + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./overflow_dump", hook_name="overflow_check", step=[0]) + debugger.configure_hook(overflow_nums=-1) + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + 多卡使用时各卡单独计算溢出次数。 + + - 示例2:dump指定前向API的ACL级别溢出数据 + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./overflow_dump", hook_name="overflow_check", step=[0]) + debugger.configure_hook(mode="acl", acl_config="./dump.json") + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + - 示例3:dump指定反向API的ACL级别的溢出数据 + + 1. 进行全量溢出检测 + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./overflow_dump", hook_name="overflow_check", step=[0]) + debugger.configure_hook(overflow_nums=-1) + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + + + 2. 
dump指定反向API的ACL级别的溢出数据 + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./overflow_dump", hook_name="dump", step=[0]) + debugger.configure_hook(mode="acl", scope=["Functional.conv2d.1.backward"], acl_config="./dump.json", backward_input=["./overflow_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump/Functional.conv2d.1.backward_input.0.npy"]) + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + 针对前向溢出API,可以通过overflow_nums,配置允许的溢出次数,并将每次溢出API的全部ACL数据dump下来,到达指定溢出次数后停止,停止后会看到堆栈打印包含如下字段。 + + ```bash + ValueError: [overflow xxx times]: dump file is saved in '*.pkl'. + ``` + + 其中xxx times为用户设置的次数,*.pkl为文件生成路径。 + +3. NPU环境下执行训练dump溢出数据。 + + 针对输入正常但输出存在溢出的API,会训练执行目录下将溢出的API信息dump并保存为`forward_info_{pid}.json`和`backward_info_{pid}.json`,通过[Ascend模型精度预检工具](https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/api_accuracy_checker)对json文件进行解析,输出溢出API为正常溢出还是非正常溢出,从而帮助用户快速判断。 + + 精度预检工具执行命令如下: + + ```bash + # 下载att代码仓后执行如下命令 + export PYTHONPATH=$PYTHONPATH:$ATT_HOME/debug/accuracy_tools/ + cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut + python run_overflow_check.py -forward ./forward_info_0.json + ``` + + 反向过程溢出的API暂不支持精度预检功能。 + + 当重复执行溢出检测dump操作时,需要删除上一次dump目录下的溢出检测dump数据,否则将因重名而报错。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 +* 混合精度动态loss scale场景下,正常训练会有"Gradient overflow. SKipping step"日志,添加溢出检测后日志消失,可以通过设置环境变量export OVERFLOW_DEBUG_MODE_ENABLE=1,并将register_hook位置调整amp.initialize之前解决。此功能需要cann包配套支持,不支持版本执行报错EZ3003。 + +## 场景化示例 + +本章节主要介绍通过ptdbg_ascend工具进行精度比对和分析,主要使用“**CPU或GPU及NPU精度数据dump**”和“**CPU或GPU与NPU精度数据比对**”章节中介绍的ptdbg_ascend工具接口。 + +### 多卡场景精度比对 + +精度工具支持多卡场景的精度比对,多卡场景的dump步骤与单卡场景完全一致,请参见“**单卡场景精度比对**”章节,不同的是多卡数据精度比对时需要使用“compare_distributed”函数进行比对。 + +**大模型场景下dump推荐使用debugger方式的手动模式。** + +如下示例: + +说明:多机多卡场景需要每个设备单独执行比对操作。 + +假设NPU dump npy数据目录为npu_dump/ptdbg_dump_v4.0,GPU dump npy数据目录为gpu_dump/ptdbg_dump_v4.0。 + +1. 创建比对脚本,例如compare_distributed.py,拷贝如下代码。 + + ```python + from ptdbg_ascend import * + compare_distributed('./npu_dump/ptdbg_dump_v4.0/step0', './gpu_dump/ptdbg_dump_v4.0/step0', './output') + ``` + + dump数据目录须指定到step级。 + +2. 
执行比对: + + ```bash + python3 compare_distributed.py + ``` + +两次运行须用相同数量的卡,传入`compare_distributed`的两个文件夹下须有相同个数的rank文件夹,且不包含其他无关文件,否则将无法比对。 + +**多卡set_dump_path注意事项** + +多卡一般为多进程,须保证每个进程都正确调用PrecisionDebugger或set_dump_path,或把PrecisionDebugger或set_dump_path插入到import语句后,如: + +```python +from ptdbg_ascend import * +debugger = PrecisionDebugger(dump_path="./npu_dump", hook_name="dump", step=[0]) +``` + +或 + +```python +from ptdbg_ascend import * +seed_all() +set_dump_path('./dump_resnet') +``` + +如此可保证set_dump_path在每个进程都被调用。 + +**多卡register_hook注意事项** + +register_hook需要在set_dump_path之后调用,也需要在每个进程上被调用,建议在搬运模型数据到卡之后调用。识别方法如下: + +- 找到训练代码中遍历epoch的for循环或遍历数据集的for循环,把register_hook放到循环开始前即可。 +- 找到训练代码中调用DDP或者DistributedDataParallel的代码行,把register_hook放到该代码行所在的代码块之后。 +- 若代码中均无以上两种情况,需要保证register_hook在模型定义之后插入,并配置rank参数。rank参数获取rank_id请参见“**[rank_id获取方法](https://gitee.com/ascend/att/blob/master/debug/accuracy_tools/ptdbg_ascend/doc/rank_id获取方法.md)**”。 + +### NPU vs NPU精度比对 + +对于NPU vs NPU场景,是针对同一模型,进行迭代(模型、API版本升级或设备硬件升级)时存在的精度下降问题,对比相同模型在迭代前后版本的API计算数值,进行问题定位。 + +一般情况下迭代涉及NPU自定义算子,因此,可以仅dump NPU自定义算子进行比对。比对精度问题分析请参见“**单卡场景精度比对**”章节。 + +工具当前支持dump NPU自定义算子如下: + +| 序号 | NPU自定义算子 | +| :--- | ----------------------------------------------- | +| 1 | torch_npu.one_ | +| 2 | torch_npu.npu_sort_v2 | +| 3 | torch_npu.npu_transpose | +| 4 | torch_npu.npu_broadcast | +| 5 | torch_npu.npu_dtype_cast | +| 6 | torch_npu.empty_with_format | +| 7 | torch_npu.npu_one_hot | +| 8 | torch_npu.npu_stride_add | +| 9 | torch_npu.npu_ps_roi_pooling | +| 10 | torch_npu.npu_roi_align | +| 11 | torch_npu.npu_nms_v4 | +| 12 | torch_npu.npu_iou | +| 13 | torch_npu.npu_nms_with_mask | +| 14 | torch_npu.npu_pad | +| 15 | torch_npu.npu_bounding_box_encode | +| 16 | torch_npu.npu_bounding_box_decode | +| 17 | torch_npu.npu_batch_nms | +| 18 | torch_npu.npu_slice | +| 19 | torch_npu._npu_dropout | +| 20 | torch_npu.npu_indexing | +| 21 | torch_npu.npu_ifmr | +| 22 | torch_npu.npu_max | +| 23 | torch_npu.npu_scatter | +| 24 | torch_npu.npu_layer_norm_eval | +| 25 | torch_npu.npu_alloc_float_status | +| 26 | torch_npu.npu_confusion_transpose | +| 27 | torch_npu.npu_bmmV2 | +| 28 | torch_npu.fast_gelu | +| 29 | torch_npu.npu_sub_sample | +| 30 | torch_npu.npu_deformable_conv2d | +| 31 | torch_npu.npu_mish | +| 32 | torch_npu.npu_anchor_response_flags | +| 33 | torch_npu.npu_yolo_boxes_encode | +| 34 | torch_npu.npu_grid_assign_positive | +| 35 | torch_npu.npu_normalize_batch | +| 36 | torch_npu.npu_masked_fill_range | +| 37 | torch_npu.npu_linear | +| 38 | torch_npu.npu_bert_apply_adam | +| 39 | torch_npu.npu_giou | +| 40 | torch_npu.npu_ciou | +| 41 | torch_npu.npu_diou | +| 42 | torch_npu.npu_sign_bits_pack | +| 43 | torch_npu.npu_sign_bits_unpack | +| 44 | torch_npu.npu_flash_attention | +| 45 | torch_npu.npu_scaled_masked_softmax | +| 46 | torch_npu.npu_rotary_mul | +| 47 | torch_npu.npu_roi_align | +| 48 | torch_npu.npu_roi_alignbk | +| 49 | torch_npu.npu_ptiou | +| 50 | torch_npu.npu_fusion_attention | +| 51 | torch_npu.npu_dropout_with_add_softmax | +| 52 | torch_npu.npu_random_choice_with_mask | +| 53 | torch_npu.npu_rotated_iou | +| 54 | torch_npu.npu_conv2d | +| 55 | torch_npu.npu_conv3d | +| 56 | torch_npu.npu_softmax_cross_entropy_with_logits | +| 57 | torch_npu.npu_all_gather_base_mm | +| 58 | torch_npu.npu_swiglu | +| 59 | torch_npu.npu_rms_norm | +| 60 | torch_npu.npu_mm_reduce_scatter_base | +| 61 | torch_npu.npu_mm_all_reduce_base | +| 62 | torch_npu.npu_conv_transpose2d | +| 63 | torch_npu.npu_convolution | +| 64 | 
torch_npu.npu_convolution_transpose | +| 65 | torch_npu.npu_min | +| 66 | torch_npu.npu_nms_rotated | +| 67 | torch_npu.npu_reshape | +| 68 | torch_npu.npu_rotated_box_decode | +| 69 | torch_npu.npu_rotated_box_encode | +| 70 | torch_npu.npu_rotated_overlaps | +| 71 | torch_npu.npu_silu | +| 72 | torch_npu.npu_fused_attention_score | +| 73 | torch_npu.npu_multi_head_attention | +| 74 | torch_npu.npu_gru | +| 75 | torch_npu.npu_incre_flash_attention | +| 76 | torch_npu.npu_prompt_flash_attention | +| 77 | torch_npu.npu_lstm | +| 78 | torch_npu.npu_apply_adam | + +### 通信API的数据dump + +通信类API数据可以使用全量dump方式获取,若只dump通信类API数据,可以使用如下示例: + +```python +debugger.configure_hook(mode="api_list", api_list=["distributed"]) +``` + +或 + +```python +set_dump_switch("ON", mode="api_list", api_list=["distributed"]) +``` + +通信类API支持列表: + +| 序号 | Distributed | +| :--- | -------------------- | +| 1 | send | +| 2 | recv | +| 3 | broadcast | +| 4 | all_reduce | +| 5 | reduce | +| 6 | all_gather | +| 7 | gather | +| 8 | isend | +| 9 | irecv | +| 10 | scatter | +| 11 | reduce_scatter | +| 12 | _reduce_scatter_base | +| 13 | _all_gather_base | + +### 单卡场景精度比对(register_hook方式) + +**精度分析建议** + +PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: + +1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 +2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 +3. 范围比对:对不符合精度标准的API重新dump。 +4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 +5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 +6. 重复1~5步,直到不存在精度问题为止。 + +**精度分析示例** + +1. dump整网数据。 + + 分别dump CPU或GPU以及NPU数据,在PyTorch训练脚本插入dump接口,示例代码如下(下面以NPU为例,CPU或GPU dump基本相同): + + ```python + from ptdbg_ascend import * + + # 在main函数开始前固定随机数 + seed_all() + + # 配置dump数据目录路径和名称 + set_dump_path("./npu_dump", dump_tag='all') + + # 注册dump回调函数 + register_hook(model, acc_cmp_dump) + + ... + + # 在第一个迭代开始的位置开启dump和堆栈模式,同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="api_stack", filter_switch="OFF") + + ... + + # 在第一个迭代结束的位置关闭dump + set_dump_switch("OFF") + ``` + +2. 比对整网数据。 + + 第1步中的NPU dump数据文件为npu_dump.pkl,假设NPU dump npy数据目录为npu_dump,GPU dump数据文件为gpu_dump.pkl,GPU dump npy数据目录为gpu_dump。 + + 创建并配置精度比对脚本,以创建compare.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + dump_result_param={ + "npu_pkl_path": "./npu_dump/all_v4.0/step0/rank0/api_stack_dump.pkl", + "bench_pkl_path": "./gpu_dump/all_v4.0/step0/rank0/api_stack_dump.pkl", + "npu_dump_data_dir": "./npu_dump/all_v4.0/step0/rank0/api_stack_dump", + "bench_dump_data_dir": "./gpu_dump/all_v4.0/step0/rank0/api_stack_dump", + "is_print_compare_log": True + } + compare(dump_result_param, "./output", stack_mode=True) + ``` + + 执行比对: + + ```bash + python3 compare.py + ``` + + 在output目录下生成结果文件,包括:`compare_result_{timestamp}.csv`和`advisor_{timestamp}.txt` + +3. 找出存在问题的API。 + + 1. 根据`advisor_{timestamp}.txt`或打屏信息的提示,可找到存在精度问题的算子(Suspect Nodes)和专家建议(Expert Advice) + + ![auto_analyze_log](img/auto_analyze_log.png) + + 2. 根据第2步结果文件`compare_result_{timestamp}.csv`中的Accuracy Reached or No字段显示为NO的API,针对该API执行后续比对操作,分析该API存在的精度问题。 + +4. (可选)提取指定API的堆栈信息和dump数据统计信息。 + + 通过parse接口可以清晰的显示特定API的堆栈信息和dump数据统计信息,结合堆栈信息分析代码中可能存在的精度问题。 + + 创建并配置提取脚本,以创建parse.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + + # 提取dump信息中第1次调用的API:Torch.batch.normal的堆栈信息及数据统计信息 + parse("./npu_dump/all_v4.0/step0/rank0/api_stack_dump.pkl", "Torch.batch.normal.1.forward") + ``` + + 执行提取: + + ```bash + python3 parse.py + ``` + +5. 
(可选)指定API对其底层ACL数据进行dump。 + + - dump指定前向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + + # 固定随机数,开启确定性计算 + seed_all(mode=True) + set_dump_path("./dump_path", dump_tag='forward') + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + + # dump指定前向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="acl", scope=["Tensor.permute.1.forward"], filter_switch="OFF") + + ... + + set_dump_switch("OFF") + ``` + + - dump指定反向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + + # 固定随机数,开启确定性计算 + seed_all(mode=True) + set_dump_path("./dump_path", dump_tag='backward') + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + + # dump指定反向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="acl", scope=["Functional.conv2d.1.backward"], filter_switch="OFF") + set_backward_input(["./npu_dump/all_v4.0/step0/rank0/api_stack_dump/Functional.conv2d.1.backward.input.0.npy"]) + + ... + + set_dump_switch("OFF") + ``` + +6. (可选)重新比对。 + + 根据第4或5步的dump数据重新配置compare.py并执行比对,可以对单API模型进行问题复现。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 + +### 溢出检测场景(register_hook方式) + +溢出检测是针对NPU的PyTorch API,检测是否存在溢出的情况。当前仅支持识别aicore浮点溢出。 + +溢出检测原理:针对溢出阶段,开启acl dump模式,重新对溢出阶段执行,落盘数据。 + +建议按照如下步骤操作: + +1. 在NPU环境下安装ptdbg_ascend工具。 + +2. 在NPU训练脚本内插入ptdbg_ascend工具溢出检测接口。 + + - 示例1:全量溢出检测 + + ```python + from ptdbg_ascend import * + seed_all() + # 配置溢出数据目录路径和名称 + set_dump_path("./overflow_dump") + ... + # 设置检测到3次溢出后退出训练 + register_hook(model, overflow_check, overflow_nums=3) + + ... + ``` + + 多卡使用时各卡单独计算溢出次数。 + + - 示例2:dump指定API的ACL级别溢出数据 + + ```python + from ptdbg_ascend import * + seed_all() + # 配置溢出数据目录路径和名称 + set_dump_path("./overflow_dump") + ... + # dump指定API的ACL级别溢出数据 + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + + # 在期望溢出检测的step位置开始前打开溢出检测开关 + set_overflow_check_switch("ON") + + ... + + # 在step结束的位置关闭溢出检测开关 + set_overflow_check_switch("OFF") + + ... + ``` + + - 示例3:dump指定反向API的ACL级别的溢出数据 + + 1. 进行全量溢出检测 + + ```python + from ptdbg_ascend import * + seed_all() + # 配置溢出数据目录路径和名称 + set_dump_path("./overflow_dump") + ... + # 设置检测到3次溢出后退出训练 + register_hook(model, overflow_check) + + ... + ``` + + 2. dump指定反向API的ACL级别的溢出数据 + + ```python + from ptdbg_ascend import * + seed_all() + # 配置溢出数据目录路径和名称 + set_dump_path("./overflow_dump") + ... + # dump指定反向API的ACL级别溢出数据 + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Functional.conv2d.1.backward"]) + set_backward_input(["./npu_dump/ptdbg_dump_v4.0/step0/rank0/dump/Functional.conv2d.1.backward.input.0.npy"]) + ``` + + 针对前向溢出API,可以通过overflow_nums,配置允许的溢出次数,并将每次溢出API的全部ACL数据dump下来,到达指定溢出次数后停止,停止后会看到堆栈打印包含如下字段。 + + ```bash + ValueError: [overflow xxx times]: dump file is saved in '*.pkl'. + ``` + + 其中xxx times为用户设置的次数,*.pkl为文件生成路径。 + +3. 
NPU环境下执行训练dump溢出数据。 + + 针对输入正常但输出存在溢出的API,会训练执行目录下将溢出的API信息dump并保存为`forward_info_{pid}.json`和`backward_info_{pid}.json`,通过 [Ascend模型精度预检工具](https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/api_accuracy_checker)对json文件进行解析,输出溢出API为正常溢出还是非正常溢出,从而帮助用户快速判断。 + + 精度预检工具执行命令如下: + + ```bash + # 下载att代码仓后执行如下命令 + export PYTHONPATH=$PYTHONPATH:$ATT_HOME/debug/accuracy_tools/ + cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut + python run_overflow_check.py -forward ./forward_info_0.json + ``` + + 反向过程溢出的API暂不支持精度预检功能。 + + 当重复执行溢出检测dump操作时,需要删除上一次dump目录下的溢出检测dump数据,否则将因重名而报错。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 +* 混合精度动态loss scale场景下,正常训练会有"Gradient overflow. SKipping step"日志,添加溢出检测后日志消失,可以通过设置环境变量export OVERFLOW_DEBUG_MODE_ENABLE=1,并将register_hook位置调整amp.initialize之前解决。此功能需要cann包配套支持,不支持版本执行报错EZ3003。 + +## debugger方式dump和溢出检测(推荐) + +### PrecisionDebugger模块 + +**功能说明** + +PrecisionDebugger模块包含dump和溢出检测功能的总体配置项。可以指定dump目录,设置dump或溢出检测功能,指定dump的卡和迭代。 + +可以在from ptdbg_ascend import *和模型初始化之间的任意位置添加该模块。 + +**原型** + +```python +PrecisionDebugger(dump_path=None, hook_name=None, rank=None, step=[], enable_dataloader=False, model=None): +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| dump_path | 设置dump数据目录路径,参数示例:"./dump_path"。数据类型:str。
默认在dump_path目录下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当**configure_hook**函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。
未配置dump_path时,也可以通过环境变量ASCEND_WORK_PATH配置dump路径,此时dump数据将落盘在${ASCEND_WORK_PATH}/dump_data下,自定义配置dump_path优先级高于环境变量,dump_path和环境变量需要二选一。 | 否 |
+| hook_name         | dump模式,可取值"dump"和"overflow_check",表示dump和溢出检测功能,二选一。参数示例:hook_name="dump"。数据类型:str。 | 是       |
+| rank              | 指定对某张卡上的数据进行dump或溢出检测,默认未配置(表示dump所有卡的数据)。应配置为大于等于0的整数,且须根据实际卡的Rank ID配置,若所配置的值大于实际训练所运行卡的最大Rank ID,则dump数据为空,比如当前环境Rank ID为0到7,实际训练运行0到3卡,此时若配置Rank ID为4或不存在的10等其他值,dump数据为空。数据类型:int。 | 否       |
+| step              | 指定dump某个step的数据,默认未配置,表示dump所有step数据。dump特定step时,须指定为训练脚本中存在的step。step为list格式,可配置逐个step,例如:step=[0,1,2];也可以配置step范围,例如:step=list(range(0,9)),表示dump第0到第8个step。数据类型:List[int]。 | 否       |
+| enable_dataloader | 自动控制开关,可取值True(开启)或False(关闭),默认为False。配置为True后自动识别dump step参数指定的迭代,并在该迭代执行完成后退出训练,此时start和stop函数可不配置,开启该开关要求训练脚本是通过torch.utils.data.dataloader方式加载数据;配置为False则需要配置start和stop函数,并在最后一个stop函数后或一个step结束的位置添加debugger.step()。数据类型:bool。 | 否       |
+| model             | 开启init dump模式,传入网络模型实例化的对象,配置该参数后,dump操作仅dump网络中init方法里调用的方法(nn.Module类),不会对所有API进行dump。参数示例: model=net,net为网络模型实例化的对象名称。默认未配置。<br>
配置该参数时,PrecisionDebugger模块请在模型实例化之后调用。数据类型:torch.nn.Module。
该模式不支持“溢出检测”、”ACL级别数据dump“和“模块级精度数据dump”。此模式下dump文件名前缀为网络中定义的模块名或层名。 | 否 | + +#### init dump模式示例代码和数据落盘说明 + +**示例代码** + +```python +import os +import torch +import torch.nn as nn +import torch_npu +from ptdbg_ascend import * + +torch.npu.set_device("npu:0") + + +class Net(nn.Module): + + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=2) + self.relu1 = nn.ReLU() + self.bn1 = nn.BatchNorm2d(16) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + output = self.relu1(x) + return output + +if __name__ == "__main__": + net = Net().npu() + # model参数传入net, 开启init dump 功能 + debugger = PrecisionDebugger(dump_path="./dump", hook_name="dump", model=net) + debugger.configure_hook(mode="api_stack") + debugger.start() + x = torch.randn(1, 1, 28, 28).npu() + out = net(x) + loss = out.sum() + loss.backward() + debugger.stop() +``` + +**落盘数据说明** + +该模式下dump数据命名格式为:`{Layer_name}.{Module_name}.{call_num}.{forward/backward}.{input/output}.npy` + +``` +# 按照上述用例代码进行dump,落盘数据命名示例如下: +conv1.Conv2d.0.forward.input.0.npy +conv1.Conv2d.0.forward.output.npy +relu1.ReLU.0.forward.input.0.npy +....... +bn1.BatchNorm2d.0.backward.output.2.npy +``` + +### configure_hook函数(可选) + +**功能说明** + +设置dump范围。 + +建议在**PrecisionDebugger**模块与模型初始化之间的任意位置添加,不添加此函数时默认使用mode="api_stack" dump整网数据。 + +**原型** + +dump: + +```python +debugger.configure_hook(mode="api_stack", scope=[], api_list=[], filter_switch="OFF", acl_config=None, backward_input=[], input_output_mode=["all"], summary_only=False, summary_mode="all") +``` + +溢出检测: + +```python +debugger.configure_hook(mode=None, acl_config=None, overflow_nums=1, need_replicate=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为"api_stack"。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。数据类型:str。 | 否 | +| scope或api_list | dump范围。根据model配置的模式选择dump的API范围,mode="api_list"时,需要配置api_list=[],其他模式有需要时配置scope=[]。参数示例:scope=["Tensor.permute.1.forward", "Tensor.transpose.2.forward"]、api_list=["relu"]。默认为空。数据类型:List[str]。 | 否 | +| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"(表示开启过滤,即不dump)或"OFF"(表示关闭过滤)。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。数据类型:str。 | 否 | +| acl_config | acl dump的配置文件。mode="acl"时,该参数必选;mode为其他值时,该参数不选。参数示例:acl_config='./dump.json'。dump.json配置文件详细介绍请参见“**dump.json配置文件说明**”。数据类型:str。 | 否 | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional.conv2d.1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional.conv2d.1、backward和input字段的.npy文件。数据类型:str。 | 否 | +| input_output_mode | dump数据过滤。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的.npy文件。参数示例input_output_mode=["backward"]或input_output_mode=["forward", "backward"]。默认为["all"],即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。数据类型:list。 | 否 | +| summary_only | dump npy文件过滤,可取值True或False,配置为True后仅dump保存API统计信息的pkl文件,参数示例:summary_only=False,默认为False。数据类型:bool。 | 否 | +| summary_mode | 控制dump文件输出的模式,可取值md5(dump仅输出包含md5值的pkl文件,用于验证数据的完整性)、summary(dump仅输出包含API统计信息的pkl文件)、all(dump输出包含API统计信息的pkl文件以及具体的npy文件),参数示例:summary_mode="md5",默认为"all"。summary_only=True时,不允许配置该参数。数据类型:str。 | 否 | +| overflow_nums | 
控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止,配置为-1时,表示持续检测溢出直到训练结束。数据类型:int。 | 否 | +| need_replicate | 过程dump数据生成开关,执行溢出检测时,dump目录下会生成forward_real_data和backward_real_data的过程dump数据目录,可取值True(生成)或False(不生成),默认不生成。数据类型:bool。 | 否 | + +**函数示例** + +configure_hook可配置多种dump模式,示例如下: + +说明: + +以下均以dump部分API数据为例,API名可以从首次dump整网数据的结果csv文件中的NPU Name或Bench Name列获取。 + +以下仅为该函数配置示例,完整代码请参见“**示例代码**”章节。 + +- 示例1:dump指定API列表 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + debugger.configure_hook(mode="list", scope=["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]) + ``` + +- 示例2:dump指定范围 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + debugger.configure_hook(mode="range", scope=["Tensor.abs.1.forward", "Tensor.transpose.3.forward"]) + ``` + +- 示例3:STACK模式,只dump堆栈信息 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + debugger.configure_hook(mode="stack", scope=["Tensor.abs.1.forward", "Tensor.transpose.3.forward"]) + ``` + +- 示例4:dump指定前向API的ACL级别数据 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + debugger.configure_hook(mode="acl", scope=["Tensor.permute.1.forward"], acl_config="./dump.json") + ``` + +- 示例5:dump指定反向API的ACL级别数据 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + debugger.configure_hook(mode="acl", scope=["Functional.conv2d.1.backward"], acl_config="./dump.json", backward_input=["./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump/Functional.conv2d.1.backward.input.0.npy"]) + ``` + +- 示例6:dump指定某一类API的API级别输入输出数据 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + debugger.configure_hook(mode="api_list", api_list=["relu"]) + ``` + + mode="api_list"时不配置scope。 + +- 示例7:dump全部API级别输入输出数据以及相应堆栈信息 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + debugger.configure_hook(mode="api_stack") + ``` + + mode="api_stack"时不配置scope。 + +- 示例8: dump全部API级别输入输出数据并包含bool和整型的tensor以及浮点、bool和整型的标量,配置为OFF,会dump bool和整型数据 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + debugger.configure_hook(filter_switch="OFF") + ``` + + 配置filter_switch="OFF"同时也可以配置mode、scope和api_list,除dump ACL级别数据。 + +- 示例9:仅保存dump的数据文件名包含“backward”的反向.npy文件 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + debugger.configure_hook(input_output_mode=["backward"]) + ``` + +- 示例10:仅dump pkl文件 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + debugger.configure_hook(summary_only=True) + ``` + +- 示例11:溢出检测dump + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="overflow_check", step=[0]) + debugger.configure_hook(overflow_nums=1) + ``` + + dump执行时会在**PrecisionDebugger**模块的dump_path参数指定的目录下生成ptdbg_dump_{version}目录,保存溢出数据。 + + 多卡场景时,需要检测到至少有一张卡溢出次数达到overflow_nums时,训练结束。 + + 仅支持NPU环境。 + +- 示例11:dump溢出API的ACL级别数据 + + ```python + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="overflow_check", step=[0]) + debugger.configure_hook(mode="acl", acl_config="./dump.json") + ``` + + 该场景会在原有数据基础上,额外在dump.json文件配置的dump_path目录下生成一份ACL算子数据,该数据可通过“**ptdbg_ascend.parse**”工具进行解析。 + + 仅支持NPU环境。 + +### start函数(可选) + +**功能说明** + 
+dump或溢出检测启动函数。 + +在模型初始化之后的任意位置添加。 + +**原型** + +```python +debugger.start() +``` + +该函数为类函数,可以使用debugger.start()也可以使用PrecisionDebugger.start()。 + +### stop函数(可选) + +**功能说明** + +dump或溢出检测停止函数。 + +在**start**函数之后的任意位置添加。 + +**原型** + +```python +debugger.stop() +``` + +该函数为类函数,可以使用debugger.stop()也可以使用PrecisionDebugger.stop()。 + +### 示例代码(自动模式) + +**需要保证用户训练代码是通过torch.utils.data.dataloader方式加载数据。** + +- 示例1:开启dump + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0,2], enable_dataloader=True) + # 请勿将以上初始化流程插入到循环代码中 + ``` + +- 示例2:开启溢出检测dump + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="overflow_check", step=[0,2], enable_dataloader=True) + # 请勿将以上初始化流程插入到循环代码中 + ``` + +### 示例代码(手动模式) + +一般情况下使用自动模式可以快速方便进行dump操作,但个别大模型可能在部分卡的训练操作中没有调用dataloader,这会导致自动模式无法dump指定迭代的数据,此时需要关闭自动模式手动在迭代前后插入start()和stop()函数,并在最后一个stop函数后或一个step结束的位置添加debugger.step()以标识dump结束。 + +- 示例1:开启dump + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0]) + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + +- 示例2:开启溢出检测dump + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="overflow_check", step=[0]) + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + +## register_hook方式dump和溢出检测 + +### 总体说明 + +- 本节主要介绍CPU或GPU及NPU精度数据dump和溢出检测所需要的函数以及示例。 + +- ptdbg_ascend工具默认情况下仅dump PyTorch模型的API输入输出数据进行精度比对,若在比对结果中发现某个API下可能存在ACL的精度问题,那么可以选择dump该API的ACL级别数据进行精度分析。 + +- 某些torch api的输出不是Tensor类型的数据。对于此类API的反向过程进行ACL dump,工具会在运行日志中给出对应的Warning(is not of tensor type and cannot be automatically derived)提示。如若想要进行该类API反向ACL dump,可以通过手动构建单API用例的方式进行ACL dump,具体用例可参见“**[反向ACL dump用例说明](https://gitee.com/ascend/att/blob/master/debug/accuracy_tools/ptdbg_ascend/doc/%E5%8F%8D%E5%90%91ACL%20dump%E7%94%A8%E4%BE%8B%E8%AF%B4%E6%98%8E.md)**”。 + +- 工具性能:dump数据量较小时(小于5G),参考dump速度0.1GB/s;dump数据量较大时,参考dump速度0.2GB/s。 + 推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。 + + 用户环境性能弱于标准约束或非独占使用的比对速度酌情向下浮动。Dump速度的计算方式:Dump数据量/(单个step添加Dump耗时-原始单个step耗时)。 + +### 约束 +- 进行CPU或GPU数据dump时,请安装torch包而非torch_npu包,避免工具无法识别使用场景,导致失败。 + +- TASK_QUEUE_ENABLE环境变量会导致API下发和执行异步进行,因此在ACL dump前需要将TASK_QUEUE_ENABLE关闭,即export TASK_QUEUE_ENABLE=0。 + +- 不建议在PyTorch训练脚本中同时添加dump接口和性能数据采集(如Ascend PyThon Profiler)接口,二者可能相互影响导致数据不准确。 + +### seed_all + +**功能说明** + +固定随机数。通过固定随机数保证模型的输入或输出一致。在训练主函数开始前调用,避免随机数固定不全。 + +使用form ptdbg import *后自动导入该函数,代码无需再次添加,若需要修改随机数种子和确定性计算模式,则需要通过添加该函数修改。 + +**函数原型** + +```python +seed_all(seed=1234, mode=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------ | ------------------------------------------------------------ | -------- | +| seed | 随机数种子。参数示例:seed=1000。默认值为:1234。数据类型:int。 | 否 | +| mode | 确定性计算模式。可配置True或False。参数示例:mode=True。默认为False。数据类型:bool。
即使在相同的硬件和输入下,API多次执行的结果也可能不同,开启确定性计算是为了保证在相同的硬件和输入下,API多次执行的结果相同。
确定性计算会导致API执行性能降低,建议在发现模型多次执行结果不同的情况下开启。
rnn类算子、ReduceSum、ReduceMean等算子可能与确定性计算存在冲突,若开启确定性计算后多次执行的结果不相同,则考虑存在这些算子。 | 否 | + +**函数示例** + +seed_all函数的随机数种子,取默认值即可,无须配置;第二个参数默认关闭,不开启确定性计算时也无须配置。 + +- 示例1:仅固定随机数,不开启确定性计算 + + ```python + seed_all() + ``` + +- 示例2:固定随机数,开启确定性计算 + + ```python + seed_all(mode=True) + ``` + +**固定随机数范围** + +seed_all函数可固定随机数的范围如下表。 + +| API | 固定随机数 | +| ---------------------------------------- | --------------------------- | +| os.environ['PYTHONHASHSEED'] = str(seed) | 禁止Python中的hash随机化 | +| random.seed(seed) | 设置random随机生成器的种子 | +| np.random.seed(seed) | 设置numpy中随机生成器的种子 | +| torch.manual_seed(seed) | 设置当前CPU的随机种子 | +| torch.cuda.manual_seed(seed) | 设置当前GPU的随机种子 | +| torch.cuda.manual_seed_all(seed) | 设置所有GPU的随机种子 | +| torch_npu.npu.manual_seed(seed) | 设置当前NPU的随机种子 | +| torch_npu.npu.manual_seed_all(seed) | 设置所有NPU的随机种子 | +| torch.backends.cudnn.enable=False | 关闭cuDNN | +| torch.backends.cudnn.benchmark=False | cuDNN确定性地选择算法 | +| torch.backends.cudnn.deterministic=True | cuDNN仅使用确定性的卷积算法 | + +需要保证CPU或GPU以及NPU的模型输入完全一致,dump数据的比对才有意义,seed_all并不能保证模型输入完全一致,如下表所示场景需要保证输入的一致性。 + +| 场景 | 固定方法 | +| --------------- | ------------- | +| 数据集的shuffle | 关闭shuffle。 | +| dropout | 关闭dropout。 | + +关闭shuffle示例: + +```python +train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size = batch_size, + shuffle = False, + num_workers = num_workers +) +``` + +关闭dropout: + +在使用from ptdbg import *后,工具会自动将torch.nn.functional.dropout、torch.nn.functional.dropout2d、torch.nn.functional.dropout3d、torch.nn.Dropout、torch.nn.Dropout2d、torch.nn.Dropout3d的接口参数p置为0。 + +### set_dump_path + +**功能说明** + +设置数据保存目录。建议在seed_all函数之后调用且需要保证训练进程能够调用该函数;多卡时须保证每个进程都能调用该函数。 + +**函数原型** + +```python +set_dump_path(fpath=None, dump_tag='ptdbg_dump') +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------- | ------------------------------------------------------------ | -------- | +| fpath | 设置数据目录路径。参数示例:'./dump_path'。数据类型:str。
默认在dump_path目录下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当set_dump_switch函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。
未配置fpath时,也可以通过环境变量ASCEND_WORK_PATH配置dump路径,此时数据将落盘在${ASCEND_WORK_PATH}/dump_data下,自定义配置dump_path优先级高于环境变量,fpath和环境变量需要二选一。 | 否 | +| dump_tag | 设置数据目录名称。参数示例:dump_tag='dump_conv2d'。默认数据目录命名为ptdbg_dump_{version}。数据类型:str。
{version}为当前安装ptdbg_ascend工具版本。目录结构参见“**dump数据存盘说明**”。
配置该参数会将生成的`ptdbg_dump_{version}`目录名称变更为dump_tag配置的值,如`dump_conv2d_{version}`。 | 否 | + +**函数示例** + +- 示例1:设置数据目录路径 + + ```python + set_dump_path('./dump_path') + ``` + +- 示例2:设置数据目录名称 + + ```python + set_dump_path('./dump_path', dump_tag='dump_conv2d') + ``` + + +若以相同的数据目录多次dump,则会因同名导致覆盖;多次dump建议配置不同的dump_tag。 + +### register_hook + +**功能说明** + +注册工具钩子函数。在set_dump_path之后调用。 + +dump操作必选。 + +**函数原型** + +```python +register_hook(model, hook, overflow_nums=overflow_nums, dump_mode=dump_mode, dump_config=dump_config_file) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------- | ------------------------------------------------------------ | -------- | +| model | 传入网络模型实例化的对象。参数示例: model=net,net为网络模型实例化的对象名称。数据类型:torch.nn.Module。 | 是 | +| hook | 注册工具的dump和溢出检测钩子。可取值overflow_check(表示溢出检测)和acc_cmp_dump(表示dump数据),二选一。数据类型:Callable。 | 是 | +| overflow_nums | 控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止,配置为-1时,表示持续检测溢出直到训练结束。数据类型:int。 | 否 | +| dump_mode | 控制针对溢出API的dump模式,可取值"acl"或"api"。配置acl时,表示dump ACL级别的溢出数据,此时set_dump_path参数不生效,dump数据目录由dump_config的.json文件配置。参数示例:dump_mode="acl"。默认不配置,即dump API级别的溢出数据。数据类型:str。 | 否 | +| dump_config | acl dump的配置文件。dump_mode="acl"时,该参数必选;dump_mode="api"时,该参数不选。参数示例:dump_config='./dump.json'。数据类型:str。 | 否 | + +**函数示例** + +- 示例1:注册工具钩子函数 + + ```python + register_hook(model, acc_cmp_dump) + ``` + +- 示例2:dump指定API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + ``` + + 需要配置set_dump_switch的mode="acl"以及scope指定为前向或反向API,请参见“**set_dump_switch”**的示例。 + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置dump数据目录。 + +- 示例3:溢出检测dump + + ```python + register_hook(model, overflow_check, overflow_nums=3) + ``` + + dump执行时会在set_dump_path的fpath参数指定的目录下生成ptdbg_dump_{version}目录,保存溢出数据。 + + 多卡场景时,需要检测到至少有一张卡溢出次数达到overflow_nums时,训练结束。 + + 仅支持NPU环境。 + +- 示例4:dump指定API的ACL级别溢出数据 + + ```python + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + ``` + + 该场景会在原有数据基础上,额外在dump.json文件配置的dump_path目录下生成一份ACL算子数据,该数据可通过“**ptdbg_ascend.parse**”工具进行解析。 + + 仅支持NPU环境。 + +### set_dump_switch + +**功能说明** + +设置dump范围。建议在register_hook函数之后的脚本内任意位置插入,但进行精度问题排查建议参照“场景化示例 > 单卡场景精度比对”章节的顺序,先从第一个迭代开始的位置调用并dump整网数据。 + +dump操作必选。 + +**函数原型** + +```python +def set_dump_switch(switch, mode="all", scope=[], api_list=[], filter_switch="OFF", dump_mode=["all"], summary_only=False): +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| --------------- | ------------------------------------------------------------ | -------- | +| switch | dump开关。可取值"ON"或"OFF"。须在选定dump开始的位置配置set_dump_switch("ON");dump结束的位置设置set_dump_switch("OFF")。数据类型:str。 | 是 | +| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为"all"。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。数据类型:str。 | 否 | +| scope或api_list | dump范围。根据model配置的模式选择dump的API范围。参数示例:scope=["Tensor.permute.1.forward", "Tensor.transpose.2.forward"]、api_list=["relu"]。默认为空。数据类型:List[str]。 | 否 | +| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"或"OFF"。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。数据类型:str。 | 否 | +| dump_mode | dump数据过滤。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的.npy文件。参数示例dump_mode=["backward"]或dump_mode=["forward", "backward"]。默认为all,即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。数据类型:List[str]。 | 否 | +| summary_only | dump 
npy文件过滤,可取值True或False,配置为True后仅dump保存API统计信息的pkl文件,参数示例:summary_only=False,默认为False。数据类型:bool。 | 否 | + +**推荐配置** + +```python +set_dump_switch("ON", mode="api_stack", filter_switch="OFF") +``` + +开启dump数据和堆栈模式,同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量。 + +**函数示例** + +set_dump_switch可配置多种dump模式,示例如下: + +说明:以下均以dump部分API数据为例,API名可以从首次dump整网数据的结果csv文件中的NPU Name或Bench Name列获取。 + +- 示例1:dump指定API列表 + + ```python + set_dump_switch("ON", mode="list", scope=["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]) + ``` + +- 示例2:dump指定范围 + + ```python + set_dump_switch("ON", mode="range", scope=["Tensor.abs.1.forward", "Tensor.transpose.3.forward"]) + ``` + +- 示例3:STACK模式,只dump堆栈信息 + + ```python + set_dump_switch("ON", mode="stack", scope=["Tensor.abs.1.forward", "Tensor.transpose.3.forward"]) + ``` + +- 示例4:dump指定前向API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Tensor.permute.1.forward"]) + ``` + + 需要配置register_hook的dump_mode='acl'和dump_config配置文件。 + +- 示例4:dump指定反向API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Functional.conv2d.1.backward"]) + set_backward_input(["./npu_dump/dump_conv2d_v4.0/step0/rank0/dump/Functional.conv2d.1.backward.input.0.npy"]) + ``` + + 需要配置register_hook的dump_mode='acl'和dump_config配置文件,并通过set_backward_input设置反向API输入的.npy文件。 + +- 示例5:dump指定某一类API的API级别输入输出数据 + + ```python + set_dump_switch("ON", mode="api_list", api_list=["relu"]) + ``` + + mode="api_list"时不配置scope。 + +- 示例6:dump全部API级别输入输出数据以及相应堆栈信息 + + ```python + set_dump_switch("ON", mode="api_stack") + ``` + + mode="api_stack"时不配置scope。 + +- 示例7: dump全部API级别输入输出数据并包含bool和整型的tensor以及浮点、bool和整型的标量,配置为OFF,会dump bool和整型数据 + + ```python + set_dump_switch("ON", filter_switch="OFF") + ``` + + 配置filter_switch="OFF"同时也可以配置mode、scope和api_list,除dump ACL级别数据。 + +- 示例8:仅保存dump的数据文件名包含“backward”的反向.npy文件 + + ```python + set_dump_switch("ON", dump_mode=["backward"]) + ``` + +- 示例9:仅dump pkl文件 + + ```python + set_dump_switch("ON", summary_only=True) + ``` + +以上示例均需要在结束dump的位置插入set_dump_switch("OFF")。 + +set_dump_switch配置mode为all或api_stack时,结束dump后,在dump目录下会自动生成compare_data.py比对脚本模板,示例如下: + +```python +from ptdbg_ascend import compare +from ptdbg_ascend.common.file_check_util import FileChecker +import argparse +import os.path + +pkl_path = "%s" +dump_data_dir = "%s" + +parser = argparse.ArgumentParser(description="compare data") +parser.add_argument("--npu_pkl_path", type=str, default=pkl_path, help="npu保存数据的pkl路径") +parser.add_argument("--bench_pkl_path", type=str, default=pkl_path, help="对比数据的pkl路径") +parser.add_argument("--output_path", type=str, default="./", help="导出对比数据的路径") + +args = parser.parse_args() +npu_pkl_path = args.npu_pkl_path +bench_pkl_path = args.bench_pkl_path +output_path = args.output_path + +suffix = ".pkl" +npu_path_checker = FileChecker(npu_pkl_path, "file", "read", suffix) +npu_path_checker.common_check() +bench_path_checker = FileChecker(bench_pkl_path, "file", "read", suffix) +bench_path_checker.common_check() + +npu_dump_data_dir = npu_pkl_path[:-len(suffix)] +bench_dump_data_dir = bench_pkl_path[:-len(suffix)] +if not os.path.exists(npu_dump_data_dir) or not os.path.exists(bench_dump_data_dir): + npu_dump_data_dir = "" + bench_dump_data_dir = "" + +dump_path_param = { + "npu_pkl_path": npu_pkl_path, + "bench_pkl_path": bench_pkl_path, + "npu_dump_data_dir": 
npu_dump_data_dir, + "bench_dump_data_dir": bench_dump_data_dir, + "is_print_compare_log": True +} + +compare(dump_path_param, output_path=output_path, stack_mode=%s) +``` + +compare_data.py比对脚本模板可以直接使用命令行配置比对参数,不需要通过编辑compare_data.py文件来修改,示例如下: + +```bash +python3 compare_data.py --npu_pkl_path "./npu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl" --bench_pkl_path "./gpu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl" --output_path "./output_path" +``` + +该命令行支持--npu_pkl_path、--bench_pkl_path和--output_path三个比对参数,其中pkl_path两个参数配置后,脚本可以自动识别同级目录下的dump_data目录,若同级目录下不存在dump_data目录,则直接执行“**pkl文件比对**”。仅ptdbg_ascend 6.0或更高版本支持通过命令行配置比对参数。更多介绍请参见“**执行比对操作**”。 + +### set_overflow_check_switch + +**功能说明** + +设置溢出检测范围。默认不配置该函数,全量进行溢出检测。 + +仅支持NPU环境。 + +**函数原型** + +```python +set_overflow_check_switch(switch, filter_switch='OFF') +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------- | ------------------------------------------------------------ | -------- | +| switch | 检测开关。可取值"ON"或"OFF"。如果只在特定的step溢出检测,则在期望溢出检测的step位置开始前插入set_overflow_check_switch("ON"),在step结束的位置插入set_overflow_check_switch("OFF")。数据类型:str。 | 是 | +| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"或"OFF"。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。数据类型:str。 | 否 | + +**函数示例** + +- 示例1:指定范围溢出检测 + + ```python + register_hook(model, overflow_check) + set_overflow_check_switch("ON") + + ... + + set_overflow_check_switch("OFF") + ``` + + 该场景set_dump_path不生效,dump执行时会在当前目录自动生成ptdbg_dump_{version}目录,保存溢出数据。 + +- 示例2:前向API的ACL级别范围溢出检测 + + ```python + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + set_overflow_check_switch("ON") + + ... + + set_overflow_check_switch("OFF") + ``` + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置溢出数据目录。 + +### set_backward_input + +**功能说明** + +设置反向ACL级别dump时需要的反向输入的.npy文件。 + +**函数原型** + +```python +set_backward_input(backward_input) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional.conv2d.1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional.conv2d.1、backward和input字段的.npy文件。数据类型:str。 | 是 | + +**函数示例** + +```python +register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') +set_dump_switch("ON", mode="acl", scope=["Functional.conv2d.1.backward"]) +set_backward_input(["./npu_dump/dump_conv2d_v4.0/step0/rank0/dump/Functional.conv2d.1.backward.input.0.npy"]) +``` + +## dump.json配置文件说明 + +**dump.json配置示例** + +```python +{ + "dump": + { + "dump_list":[], + "dump_path":"./dump/output", + "dump_mode":"all", + "dump_op_switch":"on" + } +} +``` + +**dump.json参数说明** + +| 字段名 | 说明 | +| -------------- | ------------------------------------------------------------ | +| dump_list | 待dump数据的API模型。保持为空即可,无需配置。 | +| dump_path | dump数据文件存储到运行环境的目录,主要用于指定ACL dump数据路径。支持配置绝对路径或相对路径。dump_path须为已存在目录。 | +| dump_mode | dump数据模式,配置如下:
output:dump API的输出数据。默认值。
input:dump API的输入数据。
all:dump API的输入、输出数据。 | +| dump_op_switch | 单API模型dump数据开关,配置如下: * off:关闭单API模型dump,默认值。 * on:开启单API模型dump。 | + +**dump目录说明** + +配置register_hook的dump_config后,采集的dump数据会在{dump_path}/{time}/{deviceid}/{model_id}目录下生成,例如“/home/HwHiAiUser/output/20200808163566/0/0” + +```bash +├── 20230131172437 +│   └── 1 +│   ├── 0 +│   │   ├── Add.Add.45.0.1675157077183551 +│   │   ├── Cast.trans_Cast_0.31.0.1675157077159449 +│   │   ├── Cast.trans_Cast_5.43.0.1675157077180129 +│   │   ├── MatMul.MatMul.39.0.1675157077172961 +│   │   ├── Mul.Mul.29.0.1675157077155731 +│   │   ├── NPUAllocFloatStatus.NPUAllocFloatStatus.24.0.1675157077145262 +│   │   ├── TransData.trans_TransData_1.33.0.1675157077162791 +│   │   └── TransData.trans_TransData_4.41.0.1675157077176648 +│   ├── 1701737061 +│   │   └── Cast.trans_Cast_2.35.0.1675157077166214 +│   ├── 25 +│   │   └── NPUClearFloatStatus.NPUClearFloatStatus.26.0.1675157077150342 +│   └── 68 +│   └── TransData.trans_TransData_3.37.0.1675157077169473 +``` + +## 模块级精度数据dump + +### 总体说明 + +大模型场景下,通常不是简单的利用自动迁移能力实现GPU到NPU的训练脚本迁移,而是会对NPU网络进行一系列针对性的适配,因此,常常会造成迁移后的NPU模型存在部分子结构不能与GPU原始模型完全对应。模型结构不一致导致API调用类型及数量不一致,若直接按照API粒度进行精度数据dump和比对,则无法完全比对所有的API。 + +本节介绍的功能是对模型中的大粒度模块进行数据dump,使其比对时,对于无法以API粒度比对的模块可以直接以模块粒度进行比对。 + +模块指的是继承自nn.Module类模块,通常情况下这类模块就是一个小模型,可以被视为一个整体,dump数据时以模块为粒度进行dump。 + +### module_dump + +**功能说明** + +开启模块级精度数据dump。 + +模块级精度数据dump时必选。 + +**函数原型** + +```python +module_dump(module, module_name) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ----------- | ------------------------------------------------------------ | -------- | +| module | 网络中实例化好的nn.Module类对象。数据类型:torch.nn.Module。 | 是 | +| module_name | 用户自定义的该model名称。主要用于dump数据文件的命名,便于在比对时识别模块级数据。数据类型:str。 | 是 | + +### module_dump_end + +**功能说明** + +结束模块级精度数据dump。 + +模块级精度数据dump时必选。 + +**函数原型** + +```python +module_dump_end() +``` + +### 示例代码 + +```python +# 根据需要import包 +import os +import torch +import torch.nn as nn +import torch_npu +import torch.nn.functional as F +from ptdbg_ascend import * + +torch.npu.set_device("npu:0") +# 定义一个简单的网络 +class ModuleOP(nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear_1 = nn.Linear(in_features=8, out_features=4) + self.linear_2 = nn.Linear(in_features=4, out_features=2) + def forward(self, x): + x1 = self.linear_1(x) + x2 = self.linear_2(x1) + r1 = F.relu(x2) + return r1 + +if __name__ == "__main__": + module = ModuleOP() + + # 注册工具 + pdbg = PrecisionDebugger("./dump_data/npu", hook_name="dump") + pdbg.start() + + x = torch.randn(10, 8) + module_dump(module, "MyModuleOP") # 开启模块级精度数据dump + out = module(x) + module_dump_end() # 结束模块级精度数据dump + loss = out.sum() + loss.backward() + pdbg.stop() +``` + +## dump数据存盘说明 + +dump结果目录结构示例如下: + +```bash +├── dump_path +│ └── ptdbg_dump_{version} +│ ├── step0 +│ | ├── rank0 +│ | │ ├── dump +| | | | ├── Tensor.permute.1.forward.npy +| | | | ├── MyModule.0.forward.input.npy # 开启模块级精度数据dump时存在模块级的dump数据文件 +| | | | ... +| | | | └── Fcuntion.linear.5.backward.output.npy +│ | │ └── dump.pkl +│ | ├── rank1 +| | | ├── dump +| | | | └── ... +| | | └── dump.pkl +│ | ├── ... +│ | | +| | └── rank7 +│ ├── step1 +│ | ├── ... 
+│ ├── step2 +``` + +dump过程中,npy文件在对应算子或者模块被执行后就会落盘,而pkl文件则需要在正常执行PrecisionDebugger.stop()或set_dump_switch("OFF")后才会被落盘保存,异常的程序终止会保存终止前被执行算子的相关npy文件,但是不会生成pkl文件。 + +其中`ptdbg_dump_{version}`为默认命名,debugger方式dump不支持修改该文件夹名称,使用set_dump_path函数则支持通过dump_tag参数修改文件夹名称;rank为设备上各卡的ID,每张卡上dump的数据会生成对应dump目录。 + +**精度比对dump场景** + +精度比对dump场景的结果如下: + +* dump.pkl文件:包含dump数据的API名称(命名格式为:`{api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号}`)、dtype、 shape、各数据的max、min、mean、L2norm统计信息以及当配置summary_mode="md5"时的md5数据。 + + 其中,“参数序号”表示该API下的第n个参数,例如1,则为第一个参数,若该参数为list格式,则根据list继续排序,例如1.1,表示该API的第1个参数的第1个子参数;L2norm表示2范数(平方根)。 + +* dump目录:目录下为npy格式的dump数据。 + + npy文件保存的前缀和PyTorch对应关系如下 + + | 前缀 | Torch模块 | + | ----------- | ------------------- | + | Tensor | torch.Tensor | + | Torch | torch | + | Functional | torch.nn.functional | + | NPU | NPU亲和算子 | + | VF | torch._VF | + | Aten | torch.ops.aten | + | Distributed | torch.distributed | + +当configure_hook或set_dump_switch配置mode参数(例如:mode="api_stack" )时,dump结果的文件名会添加api_stack前缀,dump结果如下: + +* api_stack_dump.pkl +* api_stack_dump目录 + +**溢出检测dump场景** + +PrecisionDebugger模块的hook_name参数或register_hook函数设置了overflow_check时,检测API溢出,dump结果的文件名格式为:`{api_type}.{api_name}.{API调用次数}.{前向反向}.{当前溢出次数}`,dump结果示例如下: + +* `Tensor_add_1_forward_1.pkl` +* `Tensor_add_1_forward_1`目录 + +## 工具支持的API列表 + +ptdbug_ascend工具维护固定的API支持列表,若需要删除或增加dump的API,可以在[support_wrap_ops.yaml](../src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml)文件内手动修改,如下示例: + +```bash +functional: # functional为算子类别,找到对应的类别,在该类别下按照下列格式删除或添加API + - conv1d + - conv2d + - conv3d +``` + +## CPU或GPU与NPU精度数据比对 + +### 总体说明 + +- 本节主要介绍CPU或GPU与NPU精度数据比对的函数以及示例。 + +- 比对函数均通过单独创建精度比对脚本执行,可支持单卡和多卡场景的精度数据比对。 + +- 工具性能:比对数据量较小时(参考值单份文件小于10GB),参考比对速度0.1GB/s;比对数据量较大时,参考比对速度0.3GB/s。 + 推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。 + + 用户环境性能弱于标准约束或非独占使用的比对速度酌情向下浮动。比对速度的计算方式:两份比对文件大小/比对耗时。 + +### 约束 + +- NPU自研API,在CPU或GPU若没有对应的API,该API的dump数据不比对。 + +- NPU与CPU或GPU的计算结果误差可能会随着模型的执行不断累积,最终会出现同一个API因为输入的数据差异较大而无法比对的情况。 + +- CPU或GPU与NPU中两个相同的API会因为调用次数不同导致无法比对或比对到错误的API,不影响整体运行,该API忽略。 + +### compare_distributed + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,支持单卡和多卡,可同时比对多卡的dump数据。多机场景需要每个设备单独执行比对操作。可自动检索和匹配对应卡和进程所dump的数据文件,再调用compare进行比对。单机单卡时与compare函数二选一。 + +**函数原型** + +```python +compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| npu_dump_dir | 配置NPU环境下的dump目录。dump数据目录须指定到step级。参数示例:'./npu_dump/ptdbg_dump_v4.0/step0'。register_hook方式可通过set_dump_path函数的dump_tag参数修改该目录名称。数据类型:str。 | 是 | +| bench_dump_dir | 配置CPU、GPU或NPU环境下的dump目录。参数示例:'./gpu_dump/ptdbg_dump_v4.0/step0'。register_hook方式可通过set_dump_path函数的dump_tag参数修改该目录名称。数据类型:str。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。需要预先创建output_path目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_rank{npu_ID}-rank{cpu/gpu/npu_ID}_{timestamp}.csv`。数据类型:str。 | 是 | +| **kwargs | 支持compare的所有可选参数。 | 否 | + +**函数示例** + +创建比对脚本,例如compare_distributed.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +compare_distributed('./npu_dump/ptdbg_dump_v4.0/step0', './gpu_dump/ptdbg_dump_v4.0/step0', './output') +``` + +dump数据目录须指定到step级。 + +### compare + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,仅支持单机单卡。 + +**函数原型** + +```python +compare(input_param, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------ | 
------------------------------------------------------------ | -------- | +| input_param | 配置dump数据文件及目录。数据类型:dict。配置参数包括:
"npu_pkl_path":指定NPU dump目录下的.pkl文件。参数示例:"npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl"。必选。
"bench_pkl_path":指定CPU、GPU或NPU dump目录下的.pkl文件。参数示例:"bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl"。必选。
"npu_dump_data_dir":"指定NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump"。可选,仅比对pkl文件时不选。
"bench_dump_data_dir":"指定CPU、GPU或NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump"。可选,仅比对pkl文件时不选。
"is_print_compare_log":配置是否开启日志打屏。可取值True或False。可选。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。参数示例:"./output_path",默认为"./"。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.csv`。数据类型:str。 | 否 | +| stack_mode | 配置stack_mode的开关。仅当dump数据时配置debugger.configure_hook或set_dump_switch的mode="api_stack"时需要开启。可取值True或False,参数示例:stack_mode=True,默认为False。数据类型:bool。 | 否 | +| auto_analyze | 自动精度分析,开启后工具自动针对比对结果进行分析,识别到第一个精度不达标节点(在比对结果文件中的“Accuracy Reached or Not”列显示为No),并给出问题可能产生的原因(打屏展示并生成advisor_{timestamp}.txt文件)。可取值True或False,参数示例:auto_analyze=False,默认为True。数据类型:bool。 | 否 | +| fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。可取值True或False,参数示例:fuzzy_match=True,默认为False。数据类型:bool。 | 否 | + +**函数示例** + +单机单卡场景下创建比对脚本,例如compare.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import compare +dump_result_param={ +"npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", +"bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", +"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", +"bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", +"is_print_compare_log": True +} +compare(dump_result_param, output_path="./output_path", stack_mode=True) +``` + +### pkl文件比对 + +若使用**compare**或**compare_distributed**函数创建的比对脚本中,input_param参数只配置了npu_pkl_path和bench_pkl_path或使用summary_only、summary_mode(取值为md5或summary)方式dump时,可以进行pkl文件的比对,此时比对dump.pkl文件中的统计信息,开启后的比对结果文件生成Max diff、Min diff、Mean diff和L2norm diff,表示NPU dump数据中API的输入或输出与标杆数据输入或输出的最大值、最小值、平均值以及L2范数的差。可以通过该值判断API是否存在精度问题:当某个API的输入和输出的Max diff、Min diff、Mean diff和L2norm diff均为0或无限趋于0,那么可以判断该API无精度问题,反之则可能存在精度问题。 + +**比对脚本示例** + +以compare.py为例。 + +```python +from ptdbg_ascend import compare +dump_result_param={ +"npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", +"bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", +"is_print_compare_log": True +} +compare(dump_result_param, output_path="./output_path", stack_mode=True) +``` + +**比对结果** + +pkl文件比对同样生成`compare_result_{timestamp}.csv`和`advisor_{timestamp}.txt`文件。其中`advisor_{timestamp}.txt`主要对`compare_result_{timestamp}.csv`中可能存在精度问题(Result为Waring)的API提出定位建议;`compare_result_{timestamp}.csv`主要有如下两种情况: + +- configure_hook配置summary_only=True、summary_mode=summary或不配置前面两个参数直接比对pkl文件: + + ![compare_result_pkl](./img/compare_result_pkl.png) + + 上图是对pkl文件中NPU及标杆API的统计信息进行比对,判断可能存在精度问题的API,文件中记录NPU及标杆API的基本信息和统计信息,其中需要关注Result列,包含结果:Waring(NPU与标杆统计信息的比对中存在相对误差大于0.5,则需要重点检查该API);为空(相对误差小于等于0.5,可以不需要重点关注,但不代表不存在精度问题);Nan(表示统计信息数据没有匹配上)。 + +- configure_hook配置summary_mode=md5: + + ![compare_result_pkl_md5.png](./img/compare_result_pkl_md5.png.png) + + 上图是对pkl文件中NPU及标杆API的MD5信息进行比对,判断API数据的完整性,文件中记录NPU及标杆API的基本信息和MD5信息,其中需要关注Result列,包含结果:Pass(表示NPU与标杆的MD5值一致,即API数据完整);Different(表示NPU与标杆的MD5值不一致,即API数据不完全一致,可以通过NPU_Stack_Info列API调用栈查询该API的详细信息);Nan(表示MD5信息数据没有匹配上)。 + +### parse + +**功能说明** + +解析并提取dump信息中的堆栈信息及数据统计信息。 + +**函数原型** + +```python +parse(pkl_file, module_name_prefix) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------------ | ------------------------------------------------------------ | -------- | +| pkl_file | 指定dump数据文件中的pkl文件名。参数示例:"./npu_dump/ptdbg_dump_v4.0/step0/rank0/dump.pkl"。数据类型:str。 | 是 | +| module_name_prefix | 指定待提取的API接口前缀。参数示例:"Torch.norm.1.forward"。数据类型:str。 | 是 | + +**函数示例** + +创建堆栈信息及数据统计信息提取脚本,例如parse.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +parse("./npu_dump/ptdbg_dump_v4.0/step0/rank0/dump.pkl", "Torch.batch.normal.1.forward") 
+``` + +### 执行比对操作 + +比对操作通过执行比对脚本启动,根据不同的比对脚本分为如下场景: + +- dump数据时自动生成比对脚本模板,脚本名为compare_data.py,该脚本模板也可以直接手动创建: + + ```python + from ptdbg_ascend import compare + from ptdbg_ascend.common.file_check_util import FileChecker + import argparse + import os.path + + pkl_path = "%s" + dump_data_dir = "%s" + + parser = argparse.ArgumentParser(description="compare data") + parser.add_argument("--npu_pkl_path", type=str, default=pkl_path, help="npu保存数据的pkl路径") + parser.add_argument("--bench_pkl_path", type=str, default=pkl_path, help="对比数据的pkl路径") + parser.add_argument("--output_path", type=str, default="./", help="导出对比数据的路径") + + args = parser.parse_args() + npu_pkl_path = args.npu_pkl_path + bench_pkl_path = args.bench_pkl_path + output_path = args.output_path + + suffix = ".pkl" + npu_path_checker = FileChecker(npu_pkl_path, "file", "read", suffix) + npu_path_checker.common_check() + bench_path_checker = FileChecker(bench_pkl_path, "file", "read", suffix) + bench_path_checker.common_check() + + npu_dump_data_dir = npu_pkl_path[:-len(suffix)] + bench_dump_data_dir = bench_pkl_path[:-len(suffix)] + if not os.path.exists(npu_dump_data_dir) or not os.path.exists(bench_dump_data_dir): + npu_dump_data_dir = "" + bench_dump_data_dir = "" + + dump_path_param = { + "npu_pkl_path": npu_pkl_path, + "bench_pkl_path": bench_pkl_path, + "npu_dump_data_dir": npu_dump_data_dir, + "bench_dump_data_dir": bench_dump_data_dir, + "is_print_compare_log": True + } + + compare(dump_path_param, output_path=output_path, stack_mode=%s) + ``` + + 执行如下命令启动比对操作: + + ```bash + python3 compare_data.py --npu_pkl_path "npu_pkl_path" --bench_pkl_path "bench_pkl_path" --output_path "output_path" + ``` + + 命令行示例:python3 compare_data.py --npu_pkl_path "./npu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl" --bench_pkl_path "./gpu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl" --output_path "./output" + + - 该命令行支持--npu_pkl_path、--bench_pkl_path和--output三个**命令行比对参数**,其中pkl_path两个参数配置后,脚本可以自动识别同级目录下的dump_data目录,若同级目录下不存在dump_data目录,则直接执行“**pkl文件比对**”。 + - **命令行比对参数**的优先级高于compare.py比对脚本内的参数,配置命令行比对参数后,不需要通过编辑compare_data.py文件来修改比对参数。 + - **命令行比对参数**均为可选,但若未配置pkl_path两个参数,则需要在比对脚本中配置。 + - 仅ptdbg_ascend 6.0或更高版本支持**命令行比对参数**。 + + | 参数 | 说明 | 是否必选 | + | ---------------- | ------------------------------------------------------------ | -------- | + | --npu_pkl_path | 指定NPU dump目录下的.pkl文件。参数示例:--npu_pkl_path "./npu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl"。 | 否 | + | --bench_pkl_path | 指定CPU、GPU或NPU dump目录下的.pkl文件。参数示例:--bench_pkl_path "./gpu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl" | 否 | + | --output_path | 配置比对结果csv文件存盘目录。参数示例:--output_path "./output",默认为"./"。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.csv`。 | 否 | + +- 手动创建比对脚本,自定义脚本名为compare.py: + + ```python + from ptdbg_ascend import compare + dump_result_param={ + "npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", + "bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", + "npu_dump_data_dir": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", + "bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", + "is_print_compare_log": True + } + compare(dump_result_param, output_path="./output_path", stack_mode=True) + ``` + + 执行如下命令启动比对操作: + + ```bash + python3 compare.py + ``` + +### 计算精度评价指标 + +PyTorch精度比对是以CPU或GPU的计算结果为标杆,通过计算精度评价指标判断API在运行时是否存在精度问题。 + +计算精度评价指标: + +1. Cosine:通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0。 + +2. 
MaxAbsErr:当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001。 + +3. MaxRelativeErr:当最大相对误差越接近0表示其计算的误差越小。 + + 当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象。 + +4. One Thousandth Err Ratio(双千分之一)、Five Thousandths Err Ratio(双千分之五)精度指标:是指NPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差大于千分之一、千分之五的比例占总元素个数的比例小于千分之一、千分之五。该数据仅作为精度下降趋势的参考,并不参与计算精度是否通过的判定。 + +精度比对结果csv文件中只需要通过Accuracy Reached or Not来判断计算精度是否达标,判断标准如下: + +1. Cosine < 0.99 且 MaxAbsError > 0.001时,精度不达标,标记为“No”。 +2. Cosine < 0.9,精度不达标,标记为“No”。 +3. MaxAbsError > 1,精度不达标,标记为“No”。 +5. 其余情况下记为精度达标,标记为“Yes”。 + +## ptdbg_ascend.parse数据解析功能 + +ptdbg_ascend.parse为命令行交互式界面解析工具,提供更多的数据解析功能并且展示结果。 + +使用场景:本工具主要用于比对前后两次NPU ACL层级dump数据的一致性。 + +### 进入parse交互式界面 + +安装ptdbg_ascend工具后,可以通过使用命令 **python -m ptdbg_ascend.parse** 进入交互式界面,如下所示: + +```bash +python -m ptdbg_ascend.parse +Parse >>> +``` + +可在parse的界面中执行Shell命令,以及如下场景的相关解析命令: + +- 支持指定ACL层级算子数据比对。 +- 支持指定ACL层级算子数据转换及展示。 +- 支持交互式指定pkl文件中API对应dump数据查看。 +- 支持API进行可选层级比对和打印(统计级和像素级)。 + +Ctrl+C可以退出parse交互式界面。不退出parse交互式界面若需要执行非该界面下的内置Shell命令,且命令与parse交互式界面命令冲突时,非该界面命令需要使用run命令,在相关命令前加上run前缀,如下示例: + +```bash +python -m ptdbg_ascend.parse +Parse >>> run vim cli.py +Parse >>> vim cli.py +``` + +以上各场景详细介绍请参见下文章节。 + +### ACL层级算子数据批量转换 + +本功能会将原有待比对dump数据目录下的dump数据按照算子名和时间戳进行梳理并分类,之后再将dump数据转为为npy文件。 + +依赖:CANN包中的msaccucmp工具,需要安装Ascend-CANN-toolkit,详见《[CANN 软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)》。 + +输入以下比对命令进行数据转换。 + +```bash +cad -m my_dump_path [-out output_path] [-asc msaccucmp_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| -------- | ------------------------------------------------------------ | -------- | +| -m | 待转换ACL dump数据目录。需要指定到ACL dump数据的deviceid级目录。 | 是 | +| -out | 结果输出目录,须指定已存在的目录,默认为./parse_data/acl_batch_convert。未指定时保存在默认路径下,比对结束后会打印log提示输出结果存放路径。 | 否 | +| -asc | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py。 | 否 | + +**示例** + +```bash +# 传入待比对数据目录 +Parse >>> cad -m /home/xxx/my_dump_path/20000124003856/0 +# 转换结果打印 +...... +╭──────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +# 转换前的dump文件 +│ SrcFile: /home/xxx/my_dump_path/20000124003856/0/272/TransData.trans_TransData_22.112.21.948645536672764 │ +# 转换后的npy文件 +│ - TransData.trans_TransData_22.112.21.948645536672764.output.0.npy │ +│ - TransData.trans_TransData_22.112.21.948645536672764.input.0.npy │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +...... +[INFO] The comparison result have been written to "./parse_data/acl_batch_convert". +``` + +输出结果: + +原dump数据目录: + +```bash +├── /home/xxx/my_dump_path/20000124003856/0/ +│ ├── 272 +│ │ ├── {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp} +│ │ ... +│ ├── 512 +│ ... +``` + +转换后: + +```bash +├── ./parse_data/acl_batch_convert/{timestamp} +│ ├── {op_name1} +│ │ ├── {timestamp1} +│ │ | ├── {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input/output}.{参数序号}.npy +│ │ | │ ... +│ │ ├── {timestamp2} +│ │ | ... +│ ├── {op_name2} +│ ├── ... 
+``` + +### ACL层级算子数据比对 + +本功能主要用于比对前后两次NPU ACL层级dump数据的一致性。 + +本功能支持批量比对,若需要进行批量比对,需要先将两份待比对的NPU ACL层级dump数据进行“**ACL层级算子数据批量转换**”,可以使两份数据更好的匹配;若直接进行dump数据的比对,建议只比对单个dump数据文件。 + +输入以下比对命令进行数据比对。 + +```bash +vc -m my_dump_path -g golden_dump_path [-out output_path] [-cmp_path msaccucmp_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| --------- | ------------------------------------------------------------ | -------- | +| -m | 待比对ACL dump数据目录。如果比对单个算子,需要指定到ACL dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | +| -g | 标杆ACL dump数据目录。如果比对单个算子,需要指定到ACL dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | +| -out | 结果输出目录,须指定已存在的目录,默认为./parse_data/acl_batch_comapre。未指定时保存在默认路径下,比对结束后会打印log提示输出结果存放路径。 | 否 | +| -cmp_path | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py | 否 | + +输出结果:batch_compare_{timestamp}.csv文件。 + +**示例** + +```bash +# 传入待比对数据目录以及标杆数据目录 +Parse >>> vc -m ./my_dump_path -g ./golden_data_path +[INFO]Compare result is saved in : parse_data/acl_batch_comapre/batch_compare_1707271118.csv +``` + +### ACL算子数据的npy转换 + +依赖:CANN包中的msaccucmp工具,需要安装Ascend-CANN-toolkit,详见《[CANN 软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)》。 + +输入以下转换命令进行数据转换, 将ACL级别dump数据转为npy文件。 + +```bash +dc -n file_name/file_path [-f format] [-out output_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| --------- | ------------------------------------------------------------ | -------- | +| -n | 需转换的dump数据文件或dump数据文件目录。 | 是 | +| -f | 开启format转换,指定该参数时需要配置format格式。当前内置的Format转换支持如下类型:
FRACTAL_NZ转换成NCHW
FRACTAL_NZ转换成NHWC
FRACTAL_NZ转换成ND
HWCN转换成FRACTAL_Z
HWCN转换成NCHW
HWCN转换成NHWC
NC1HWC0转换成HWCN
NC1HWC0转换成NCHW
NC1HWC0转换成NHWC
NCHW转换成FRACTAL_Z
NCHW转换成NHWC
NHWC转换成FRACTAL_Z
NHWC转换成HWCN
NHWC转换成NCHW
NDC1HWC0转换成NCDHW | 否 | +| -out | 结果输出目录。 | 否 | +| -cmp_path | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py | 否 | + +[^]: 若传入单个dump文件,则转换单个文件,若传入dump文件目录则转换目录下所有dump文件。 + +- 输出结果:npy文件。 +- 若指定-out参数需要用户传入输出路径,并且路径需要已存在。 +- 若未指定输出目录, 则比对结束后将结果保存在默认目录 “./parse_data/convert_result”中,比对结束后会打印log提示输出结果存放路径及转换结果。 + +- 输入以下命令,展示npy数据统计信息。 + + ```bash + pt -n file_path + ``` + + | 参数名称 | 说明 | 是否必选 | + | -------- | ------------- | -------- | + | -n | npy文件路径。 | 是 | + + 打印统计信息:shape, dtype, max, min和mean。默认在npy文件路径下将该数据保存为txt文件。 + +**示例1** + +```bash +# 传入需转换的dump文件目录 +Parse >>> dc -n ./dump_data/ +...... +# 转换结果 +╭──────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ SrcFile: ./dump_data/ +│ - Add.fp32.vars.add.2fp32.vars.Relu.9.31.5.1636595794731103.input.0.npy │ +│ - Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.output.0.npy │ +│ - Add.fp32.vars.add.2fp32.vars.Relu.9.31.5.1636595794731103.input.1.npy │ +│ - Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.input.1.npy │ +│ - Add.fp32.vars.add.3fp32.vars.Relu.12.40.5.1636595794846124.input.1.npy │ +│ - Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.input.0.npy │ +│ - Add.fp32.vars.add.3fp32.vars.Relu.12.40.5.1636595794846124.input.0.npy │ +│ - Add.fp32.vars.add.2fp32.vars.Relu.9.31.5.1636595794731103.output.0.npy │ +│ - Add.fp32.vars.add.3fp32.vars.Relu.12.40.5.1636595794846124.output.0.npy │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + +**示例2** + +```bash +# 查看某个dump数据块的数据信息 +# 默认会将数据中的tensor保存成 txt +Parse >>> pt -n ./parse_data/dump_convert/Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.output.0.npy +...... +# 打印统计信息 +[Shape: (1, 16, 56, 56, 16)] [Dtype: float16] [Max: 452.0] [Min: -408.5] [Mean: -3.809] +Path: ./parse_data/dump_convert/Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.input.0.npy +TextFile:./parse_data/dump_convert/Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.input.0.npy.txt +``` + +### pkl文件中指定API的dump数据信息查看 + +输入以下命令,解析并输出pkl文件中指定api的统计信息。 + +```bash +pk -f pkl_path -n api_name +``` + +| 参数名称 | 说明 | 是否必选 | +| -------- | ----------------- | -------- | +| -f | 指定pkl文件路径。 | 是 | +| -n | 指定API名称。 | 是 | + +- 输出结果:打印统计信息(shape, dtype, max和min mean)。 +- 若pkl文件中存在相应的堆栈信息,则会打印堆栈信息。 + +**示例** + +```bash +# 传入pkl文件及api名称 +Parse >>> pk -f ./torch_dump/ptdbg_v3.2/rank0/api_stack_dump.pkl -n Functional.conv2d.0.forward +...... 
+# 打印统计信息及堆栈(pkl文件不包含堆栈则不会打印堆栈) + +Statistic Info: + [Functional.conv2d.0.forward.input.0][dtype: torch.float32][shape: [2, 1, 2, 2]][max: 1.576936960220337][min: -0.9757485389709473][mean: 0.4961632490158081] + [Functional.conv2d.0.forward.input.1][dtype: torch.float32][shape: [2, 1, 2, 2]][max: 0.20064473152160645][min: -0.47102075815200806][mean: -0.20796933770179749] + [Functional.conv2d.0.forward.input.2][dtype: torch.float32][shape: [2]][max: 0.17380613088607788][min: -0.16853803396224976][mean: 0.0026340484619140625] + [Functional.conv2d.0.forward.output][dtype: torch.float32][shape: [2, 2, 1, 1]][max: 0.02364911139011383][min: -1.762906551361084][mean: -0.6710853576660156] +``` + +### API可选层级比对 + +输入以下命令, 进行统计级和像素级比对。 + +```bash +cn -m my_data*.npy -g gloden*.npy [-p num] [-al atol] [-rl rtol] +``` + +- 统计级比对:对tensor整体进行余弦值及相对误差的计算。 +- 像素级比对:对输入的两个npy文件进行逐元素比对。若两个tensor对应元素的相对误差或绝对误差大于**误差阈值**(-al和-rl配置)则被标记为错误数据。 + +| 参数名称 | 说明 | 是否必选 | +| -------- | ----------------------------------------------- | -------- | +| -m | 待比对数据。 | 是 | +| -g | 标杆数据。 | 是 | +| -p | 设置比对结束后打印错误元素的个数,默认值20。 | 否 | +| -al | 判定数据存在精度问题的绝对误差阈值,默认0.001。 | 否 | +| -rl | 判定数据存在精度问题的相对误差阈值,默认0.001。 | 否 | +| -s | 将npy文件保存成txt文件,用于查看,默认开启。 | 否 | + +输出结果: + +- 统计级比对结果。 +- 两个文件的统计信息(shape, dtype, max, min和mean)。 +- 错误数据打印表格。 + +**示例** + +```bash +# 对比两个tensor的数据 +Parse >>> cn -m Add.InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.323.1619494134703053.output.0.npy -g InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.0.1619492699305998.npy -p 10 -s -al 0.002 -rl 0.005 + Error Item Table Top Item Table +┏━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┏━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ +┃ Index ┃ Left ┃ Right ┃ Diff ┃ ┃ Index ┃ Left ┃ Right ┃ Diff ┃ +┡━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ ┡━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ +│ 155 │ 0.024600908 │ 0.022271132 │ 0.002329776 │ │ 0 │ -0.9206961 │ -0.9222216 │ 0.0015255213 │ +│ 247 │ 0.015752593 │ 0.017937578 │ 0.0021849852 │ │ 1 │ -0.6416973 │ -0.64051837 │ 0.0011789203 │ +│ 282 │ -0.0101207765 │ -0.007852031 │ 0.0022687456 │ │ 2 │ -0.35383835 │ -0.35433492 │ 0.0004965663 │ +│ 292 │ 0.019581757 │ 0.02240482 │ 0.0028230622 │ │ 3 │ -0.18851271 │ -0.18883198 │ 0.00031927228 │ +│ 640 │ -0.06593232 │ -0.06874806 │ 0.0028157383 │ │ 4 │ -0.43508735 │ -0.43534422 │ 0.00025686622 │ +│ 1420 │ 0.09293677 │ 0.09586689 │ 0.0029301196 │ │ 5 │ 1.4447614 │ 1.4466647 │ 0.0019032955 │ +│ 1462 │ -0.085207745 │ -0.088047795 │ 0.0028400496 │ │ 6 │ -0.3455438 │ -0.3444429 │ 0.0011008978 │ +│ 1891 │ -0.03433288 │ -0.036525503 │ 0.002192624 │ │ 7 │ -0.6560242 │ -0.6564579 │ 0.0004336834 │ +│ 2033 │ 0.06828873 │ 0.07139922 │ 0.0031104907 │ │ 8 │ -2.6964858 │ -2.6975214 │ 0.0010356903 │ +│ 2246 │ -0.06376442 │ -0.06121233 │ 0.002552092 │ │ 9 │ -0.73746175 │ -0.73650354 │ 0.00095820427 │ +└───────┴───────────────┴──────────────┴──────────────┘ └───────┴─────────────┴─────────────┴───────────────┘ +╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ Left: | +│ |- NpyFile: ./dump/temp/decode/Add.InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.323.1619494134703053.output.0.npy | +│ |- TxtFile: ./dump/temp/decode/Add.InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.323.1619494134703053.output.0.npy.txt | +│ |- NpySpec: [Shape: (32, 8, 8, 320)] [Dtype: float32] [Max: 5.846897] [Min: -8.368301] [Mean: -0.72565556] | +│ DstFile: │ +│ |- NpyFile: 
./dump/cpu/InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.0.1619492699305998.npy | +│ |- TxtFile: ./dump/cpu/InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.0.1619492699305998.npy.txt | +│ |- NpySpec: [Shape: (32, 8, 8, 320)] [Dtype: float32] [Max: 5.8425903] [Min: -8.374472] [Mean: -0.7256237] │ +│ NumCnt: 655360 │ +│ AllClose: False │ +│ CosSim: 0.99999493 │ +│ ErrorPer: 0.023504638671875 (rl= 0.005, al= 0.002) │ +╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + +## FAQ + +[FAQ](https://gitee.com/ascend/att/blob/master/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md) diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v6.0.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v6.0.md" new file mode 100644 index 0000000000000000000000000000000000000000..6a014a1e010f10392a729589aaa6d30dd7019124 --- /dev/null +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v6.0.md" @@ -0,0 +1,2332 @@ +# **PyTorch精度工具使用指南** + +本文主要介绍PyTorch精度工具ptdbg_ascend的使用以及精度比对场景示例。 + +ptdbg_ascend工具的原理及安装请参见《[PyTorch精度工具](https://gitee.com/ascend/att/blob/master/debug/accuracy_tools/ptdbg_ascend/README.md)》。 + +ptdbg_ascend工具主要支持PyTorch API精度数据dump、溢出检测、精度比对以及parse数据解析功能。其中dump和溢出检测功能支持使用debugger和register_hook方式进行精度数据的dump和溢出检测,推荐使用debugger方式。 + +## PyTorch精度比对总体流程 + +1. 准备CPU或GPU训练工程。 + +2. 在环境下安装ptdbg_ascend工具。 + +3. 在训练脚本内插入ptdbg_ascend工具dump接口。 + +4. 执行训练dump数据。 + +5. 将CPU或GPU训练工程迁移为NPU训练工程。 + + 请参见《[PyTorch模型迁移和训练指南](https://www.hiascend.com/document/detail/zh/canncommercial/63RC1/modeldevpt/ptmigr/ptmigr_0001.html)》。 + +6. 在NPU环境下安装ptdbg_ascend工具。 + +7. 在NPU训练脚本内插入ptdbg_ascend工具dump接口。 + +8. NPU环境下执行训练dump数据。 + +9. 创建并配置精度比对脚本,例如compare.py。 + +10. 执行CPU或GPU dump与NPU dump数据的精度比对。 + +11. 比对结果分析。 + +## 快速入门(debugger方式) + +本章节主要介绍通过ptdbg_ascend工具进行精度比对和分析,主要使用“**debugger方式dump和溢出检测**”和“**CPU或GPU与NPU精度数据比对**”章节中介绍的ptdbg_ascend工具接口。 + +### 单卡场景精度比对 + +**精度分析建议** + +PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: + +1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 + + 对于模型数据庞大(比如达到T级别)的场景,不推荐直接dump整网比对,整网dump可能导致磁盘不足,需要预留足够的存储空间或者分多次dump。 + +2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 + +3. 范围比对:对不符合精度标准的API重新dump详细信息。 + +4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 + +5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 + +6. 重复1~5步,直到不存在精度问题为止。 + +**精度分析示例** + +1. dump整网数据。 + + 分别dump CPU或GPU以及NPU数据,在PyTorch训练脚本插入dump接口,示例代码如下(下面以NPU为例,CPU或GPU dump基本相同): + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./npu_dump", hook_name="dump", step=[0]) + debugger.configure_hook(mode="api_stack") + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + +2. 
比对整网数据。 + + 第1步中的NPU dump数据目录为npu_dump,假设GPU dump数据目录为gpu_dump;dump将生成pkl数据文件api_stack_dump.pkl和npy数据目录api_stack_dump。 + + 创建并配置精度比对脚本,以创建compare.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + dump_result_param={ + "npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", + "bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", + "npu_dump_data_dir": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", + "bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", + "is_print_compare_log": True + } + compare(dump_result_param, "./output", stack_mode=True) + ``` + + 执行比对: + + ```bash + python3 compare.py + ``` + + 在output目录下生成结果文件,包括:`compare_result_{timestamp}.xlsx`和`advisor_{timestamp}.txt` + +3. 找出存在问题的API。 + + 1. 根据`advisor_{timestamp}.txt`或打屏信息的提示,可找到存在精度问题的算子(Suspect Nodes)和专家建议(Expert Advice)。 + + ![auto_analyze_log](img/auto_analyze_log.png) + + 2. 根据第2步结果文件`compare_result_{timestamp}.xlsx`中的Accuracy Reached or No字段显示为NO的API,针对该API执行后续比对操作,分析该API存在的精度问题。 + +4. (可选)提取指定API的堆栈信息和dump数据统计信息。 + + 通过parse接口可以清晰的显示特定API的堆栈信息和dump数据统计信息,结合堆栈信息分析代码中可能存在的精度问题。 + + 创建并配置提取脚本,以创建parse.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + + # 提取dump信息中第1次调用的API:Torch.batch.normal的堆栈信息及数据统计信息 + parse("./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", "Torch.batch.normal.1.forward") + ``` + + 执行提取: + + ```bash + python3 parse.py + ``` + + + +5. (可选)指定API对其底层ACL数据进行dump。 + + - dump指定前向API的ACL级别数据 + + ```python + debugger = PrecisionDebugger(dump_path="./npu_dump", hook_name="dump", step=[0]) + debugger.configure_hook(mode="acl", scope=["Tensor.permute.1.forward"], acl_config='./dump.json') + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + - dump指定反向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./npu_dump", hook_name="dump", step=[0]) + # dump指定反向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + debugger.configure_hook(mode="acl", scope=["Functional.conv2d.1.backward"], acl_config="./dump.json", backward_input=["./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump/Functional.conv2d.1.backward_input.0.npy"]) + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + +6. (可选)重新比对。 + + 根据第4或5步的dump数据重新配置compare.py并执行比对,可以对单API模型进行问题复现。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 + +### 溢出检测场景 + +溢出检测是针对NPU的PyTorch API,检测是否存在溢出的情况。当前仅支持识别aicore浮点溢出。 + +溢出检测原理:针对溢出阶段,开启acl dump模式,重新对溢出阶段执行,落盘数据。 + +建议按照如下步骤操作: + +1. 在NPU环境下安装ptdbg_ascend工具。 + +2. 
在NPU训练脚本内插入ptdbg_ascend工具溢出检测接口。 + + - 示例1:全量溢出检测 + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./overflow_dump", hook_name="overflow_check", step=[0]) + debugger.configure_hook(overflow_nums=-1) + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + 多卡使用时各卡单独计算溢出次数。 + + - 示例2:dump指定前向API的ACL级别溢出数据 + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./overflow_dump", hook_name="overflow_check", step=[0]) + debugger.configure_hook(mode="acl", acl_config="./dump.json") + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + - 示例3:dump指定反向API的ACL级别的溢出数据 + + 1. 进行全量溢出检测 + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./overflow_dump", hook_name="overflow_check", step=[0]) + debugger.configure_hook(overflow_nums=-1) + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + + + 2. dump指定反向API的ACL级别的溢出数据 + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./overflow_dump", hook_name="dump", step=[0]) + debugger.configure_hook(mode="acl", scope=["Functional.conv2d.1.backward"], acl_config="./dump.json", backward_input=["./overflow_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump/Functional.conv2d.1.backward_input.0.npy"]) + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + 针对前向溢出API,可以通过overflow_nums,配置允许的溢出次数,并将每次溢出API的全部ACL数据dump下来,到达指定溢出次数后停止,停止后会看到堆栈打印包含如下字段。 + + ```bash + ValueError: [overflow xxx times]: dump file is saved in '*.pkl'. + ``` + + 其中xxx times为用户设置的次数,*.pkl为文件生成路径。 + +3. NPU环境下执行训练dump溢出数据。 + + 针对输入正常但输出存在溢出的API,会训练执行目录下将溢出的API信息dump并保存为`forward_info_{pid}.json`和`backward_info_{pid}.json`,通过[Ascend模型精度预检工具](https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/api_accuracy_checker)对json文件进行解析,输出溢出API为正常溢出还是非正常溢出,从而帮助用户快速判断。 + + 精度预检工具执行命令如下: + + ```bash + # 下载att代码仓后执行如下命令 + export PYTHONPATH=$PYTHONPATH:$ATT_HOME/debug/accuracy_tools/ + cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut + python run_overflow_check.py -forward ./forward_info_0.json + ``` + + 反向过程溢出的API暂不支持精度预检功能。 + + 当重复执行溢出检测dump操作时,需要删除上一次dump目录下的溢出检测dump数据,否则将因重名而报错。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 +* 混合精度动态loss scale场景下,正常训练会有"Gradient overflow. 
SKipping step"日志,添加溢出检测后日志消失,可以通过设置环境变量export OVERFLOW_DEBUG_MODE_ENABLE=1,并将register_hook位置调整amp.initialize之前解决。此功能需要cann包配套支持,不支持版本执行报错EZ3003。 + +## 场景化示例 + +本章节主要介绍通过ptdbg_ascend工具进行精度比对和分析,主要使用“**CPU或GPU及NPU精度数据dump**”和“**CPU或GPU与NPU精度数据比对**”章节中介绍的ptdbg_ascend工具接口。 + +### 多卡场景精度比对 + +精度工具支持多卡场景的精度比对,多卡场景的dump步骤与单卡场景完全一致,请参见“**单卡场景精度比对**”章节,不同的是多卡数据精度比对时需要使用“compare_distributed”函数进行比对。 + +**大模型场景下dump推荐使用debugger方式的手动模式。** + +如下示例: + +说明:多机多卡场景需要每个设备单独执行比对操作。 + +假设NPU dump npy数据目录为npu_dump/ptdbg_dump_v4.0,GPU dump npy数据目录为gpu_dump/ptdbg_dump_v4.0。 + +1. 创建比对脚本,例如compare_distributed.py,拷贝如下代码。 + + ```python + from ptdbg_ascend import * + compare_distributed('./npu_dump/ptdbg_dump_v4.0/step0', './gpu_dump/ptdbg_dump_v4.0/step0', './output') + ``` + + dump数据目录须指定到step级。 + +2. 执行比对: + + ```bash + python3 compare_distributed.py + ``` + +两次运行须用相同数量的卡,传入`compare_distributed`的两个文件夹下须有相同个数的rank文件夹,且不包含其他无关文件,否则将无法比对。 + +**多卡set_dump_path注意事项** + +多卡一般为多进程,须保证每个进程都正确调用PrecisionDebugger或set_dump_path,或把PrecisionDebugger或set_dump_path插入到import语句后,如: + +```python +from ptdbg_ascend import * +debugger = PrecisionDebugger(dump_path="./npu_dump", hook_name="dump", step=[0]) +``` + +或 + +```python +from ptdbg_ascend import * +seed_all() +set_dump_path('./dump_resnet') +``` + +如此可保证set_dump_path在每个进程都被调用。 + +**多卡register_hook注意事项** + +register_hook需要在set_dump_path之后调用,也需要在每个进程上被调用,建议在搬运模型数据到卡之后调用。识别方法如下: + +- 找到训练代码中遍历epoch的for循环或遍历数据集的for循环,把register_hook放到循环开始前即可。 +- 找到训练代码中调用DDP或者DistributedDataParallel的代码行,把register_hook放到该代码行所在的代码块之后。 +- 若代码中均无以上两种情况,需要保证register_hook在模型定义之后插入,并配置rank参数。rank参数获取rank_id请参见“**[rank_id获取方法](https://gitee.com/ascend/att/blob/master/debug/accuracy_tools/ptdbg_ascend/doc/rank_id获取方法.md)**”。 + +### NPU vs NPU精度比对 + +对于NPU vs NPU场景,是针对同一模型,进行迭代(模型、API版本升级或设备硬件升级)时存在的精度下降问题,对比相同模型在迭代前后版本的API计算数值,进行问题定位。 + +一般情况下迭代涉及NPU自定义算子,因此,可以仅dump NPU自定义算子进行比对。比对精度问题分析请参见“**单卡场景精度比对**”章节。 + +工具当前支持dump NPU自定义算子如下: + +| 序号 | NPU自定义算子 | +| :--- | ----------------------------------------------- | +| 1 | torch_npu.one_ | +| 2 | torch_npu.npu_sort_v2 | +| 3 | torch_npu.npu_transpose | +| 4 | torch_npu.npu_broadcast | +| 5 | torch_npu.npu_dtype_cast | +| 6 | torch_npu.empty_with_format | +| 7 | torch_npu.npu_one_hot | +| 8 | torch_npu.npu_stride_add | +| 9 | torch_npu.npu_ps_roi_pooling | +| 10 | torch_npu.npu_roi_align | +| 11 | torch_npu.npu_nms_v4 | +| 12 | torch_npu.npu_iou | +| 13 | torch_npu.npu_nms_with_mask | +| 14 | torch_npu.npu_pad | +| 15 | torch_npu.npu_bounding_box_encode | +| 16 | torch_npu.npu_bounding_box_decode | +| 17 | torch_npu.npu_batch_nms | +| 18 | torch_npu.npu_slice | +| 19 | torch_npu._npu_dropout | +| 20 | torch_npu.npu_indexing | +| 21 | torch_npu.npu_ifmr | +| 22 | torch_npu.npu_max | +| 23 | torch_npu.npu_scatter | +| 24 | torch_npu.npu_layer_norm_eval | +| 25 | torch_npu.npu_alloc_float_status | +| 26 | torch_npu.npu_confusion_transpose | +| 27 | torch_npu.npu_bmmV2 | +| 28 | torch_npu.fast_gelu | +| 29 | torch_npu.npu_sub_sample | +| 30 | torch_npu.npu_deformable_conv2d | +| 31 | torch_npu.npu_mish | +| 32 | torch_npu.npu_anchor_response_flags | +| 33 | torch_npu.npu_yolo_boxes_encode | +| 34 | torch_npu.npu_grid_assign_positive | +| 35 | torch_npu.npu_normalize_batch | +| 36 | torch_npu.npu_masked_fill_range | +| 37 | torch_npu.npu_linear | +| 38 | torch_npu.npu_bert_apply_adam | +| 39 | torch_npu.npu_giou | +| 40 | torch_npu.npu_ciou | +| 41 | torch_npu.npu_diou | +| 42 | torch_npu.npu_sign_bits_pack | +| 43 | torch_npu.npu_sign_bits_unpack | +| 44 | 
torch_npu.npu_flash_attention | +| 45 | torch_npu.npu_scaled_masked_softmax | +| 46 | torch_npu.npu_rotary_mul | +| 47 | torch_npu.npu_roi_align | +| 48 | torch_npu.npu_roi_alignbk | +| 49 | torch_npu.npu_ptiou | +| 50 | torch_npu.npu_fusion_attention | +| 51 | torch_npu.npu_dropout_with_add_softmax | +| 52 | torch_npu.npu_random_choice_with_mask | +| 53 | torch_npu.npu_rotated_iou | +| 54 | torch_npu.npu_conv2d | +| 55 | torch_npu.npu_conv3d | +| 56 | torch_npu.npu_softmax_cross_entropy_with_logits | +| 57 | torch_npu.npu_all_gather_base_mm | +| 58 | torch_npu.npu_swiglu | +| 59 | torch_npu.npu_rms_norm | +| 60 | torch_npu.npu_mm_reduce_scatter_base | +| 61 | torch_npu.npu_mm_all_reduce_base | +| 62 | torch_npu.npu_conv_transpose2d | +| 63 | torch_npu.npu_convolution | +| 64 | torch_npu.npu_convolution_transpose | +| 65 | torch_npu.npu_min | +| 66 | torch_npu.npu_nms_rotated | +| 67 | torch_npu.npu_reshape | +| 68 | torch_npu.npu_rotated_box_decode | +| 69 | torch_npu.npu_rotated_box_encode | +| 70 | torch_npu.npu_rotated_overlaps | +| 71 | torch_npu.npu_silu | +| 72 | torch_npu.npu_fused_attention_score | +| 73 | torch_npu.npu_multi_head_attention | +| 74 | torch_npu.npu_gru | +| 75 | torch_npu.npu_incre_flash_attention | +| 76 | torch_npu.npu_prompt_flash_attention | +| 77 | torch_npu.npu_lstm | +| 78 | torch_npu.npu_apply_adam | + +### 通信API的数据dump + +通信类API数据可以使用全量dump方式获取,若只dump通信类API数据,可以使用如下示例: + +```python +debugger.configure_hook(mode="api_list", api_list=["distributed"]) +``` + +或 + +```python +set_dump_switch("ON", mode="api_list", api_list=["distributed"]) +``` + +通信类API支持列表: + +| 序号 | Distributed | +| :--- | -------------------- | +| 1 | send | +| 2 | recv | +| 3 | broadcast | +| 4 | all_reduce | +| 5 | reduce | +| 6 | all_gather | +| 7 | gather | +| 8 | isend | +| 9 | irecv | +| 10 | scatter | +| 11 | reduce_scatter | +| 12 | _reduce_scatter_base | +| 13 | _all_gather_base | + +### 单卡场景精度比对(register_hook方式) + +**精度分析建议** + +PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: + +1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 +2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 +3. 范围比对:对不符合精度标准的API重新dump。 +4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 +5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 +6. 重复1~5步,直到不存在精度问题为止。 + +**精度分析示例** + +1. dump整网数据。 + + 分别dump CPU或GPU以及NPU数据,在PyTorch训练脚本插入dump接口,示例代码如下(下面以NPU为例,CPU或GPU dump基本相同): + + ```python + from ptdbg_ascend import * + + # 在main函数开始前固定随机数 + seed_all() + + # 配置dump数据目录路径和名称 + set_dump_path("./npu_dump", dump_tag='all') + + # 注册dump回调函数 + register_hook(model, acc_cmp_dump) + + ... + + # 在第一个迭代开始的位置开启dump和堆栈模式,同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="api_stack", filter_switch="OFF") + + ... + + # 在第一个迭代结束的位置关闭dump + set_dump_switch("OFF") + ``` + +2. 比对整网数据。 + + 第1步中的NPU dump数据文件为npu_dump.pkl,假设NPU dump npy数据目录为npu_dump,GPU dump数据文件为gpu_dump.pkl,GPU dump npy数据目录为gpu_dump。 + + 创建并配置精度比对脚本,以创建compare.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + dump_result_param={ + "npu_pkl_path": "./npu_dump/all_v4.0/step0/rank0/api_stack_dump.pkl", + "bench_pkl_path": "./gpu_dump/all_v4.0/step0/rank0/api_stack_dump.pkl", + "npu_dump_data_dir": "./npu_dump/all_v4.0/step0/rank0/api_stack_dump", + "bench_dump_data_dir": "./gpu_dump/all_v4.0/step0/rank0/api_stack_dump", + "is_print_compare_log": True + } + compare(dump_result_param, "./output", stack_mode=True) + ``` + + 执行比对: + + ```bash + python3 compare.py + ``` + + 在output目录下生成结果文件,包括:`compare_result_{timestamp}.xlsx`和`advisor_{timestamp}.txt` + +3. 找出存在问题的API。 + + 1. 
根据`advisor_{timestamp}.txt`或打屏信息的提示,可找到存在精度问题的算子(Suspect Nodes)和专家建议(Expert Advice) + + ![auto_analyze_log](img/auto_analyze_log.png) + + 2. 根据第2步结果文件`compare_result_{timestamp}.xlsx`中的Accuracy Reached or No字段显示为NO的API,针对该API执行后续比对操作,分析该API存在的精度问题。 + +4. (可选)提取指定API的堆栈信息和dump数据统计信息。 + + 通过parse接口可以清晰的显示特定API的堆栈信息和dump数据统计信息,结合堆栈信息分析代码中可能存在的精度问题。 + + 创建并配置提取脚本,以创建parse.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + + # 提取dump信息中第1次调用的API:Torch.batch.normal的堆栈信息及数据统计信息 + parse("./npu_dump/all_v4.0/step0/rank0/api_stack_dump.pkl", "Torch.batch.normal.1.forward") + ``` + + 执行提取: + + ```bash + python3 parse.py + ``` + +5. (可选)指定API对其底层ACL数据进行dump。 + + - dump指定前向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + + # 固定随机数,开启确定性计算 + seed_all(mode=True) + set_dump_path("./dump_path", dump_tag='forward') + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + + # dump指定前向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="acl", scope=["Tensor.permute.1.forward"], filter_switch="OFF") + + ... + + set_dump_switch("OFF") + ``` + + - dump指定反向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + + # 固定随机数,开启确定性计算 + seed_all(mode=True) + set_dump_path("./dump_path", dump_tag='backward') + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + + # dump指定反向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="acl", scope=["Functional.conv2d.1.backward"], filter_switch="OFF") + set_backward_input(["./npu_dump/all_v4.0/step0/rank0/api_stack_dump/Functional.conv2d.1.backward.input.0.npy"]) + + ... + + set_dump_switch("OFF") + ``` + +6. (可选)重新比对。 + + 根据第4或5步的dump数据重新配置compare.py并执行比对,可以对单API模型进行问题复现。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 + +### 溢出检测场景(register_hook方式) + +溢出检测是针对NPU的PyTorch API,检测是否存在溢出的情况。当前仅支持识别aicore浮点溢出。 + +溢出检测原理:针对溢出阶段,开启acl dump模式,重新对溢出阶段执行,落盘数据。 + +建议按照如下步骤操作: + +1. 在NPU环境下安装ptdbg_ascend工具。 + +2. 在NPU训练脚本内插入ptdbg_ascend工具溢出检测接口。 + + - 示例1:全量溢出检测 + + ```python + from ptdbg_ascend import * + seed_all() + # 配置溢出数据目录路径和名称 + set_dump_path("./overflow_dump") + ... + # 设置检测到3次溢出后退出训练 + register_hook(model, overflow_check, overflow_nums=3) + + ... + ``` + + 多卡使用时各卡单独计算溢出次数。 + + - 示例2:dump指定API的ACL级别溢出数据 + + ```python + from ptdbg_ascend import * + seed_all() + # 配置溢出数据目录路径和名称 + set_dump_path("./overflow_dump") + ... + # dump指定API的ACL级别溢出数据 + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + + # 在期望溢出检测的step位置开始前打开溢出检测开关 + set_overflow_check_switch("ON") + + ... + + # 在step结束的位置关闭溢出检测开关 + set_overflow_check_switch("OFF") + + ... + ``` + + - 示例3:dump指定反向API的ACL级别的溢出数据 + + 1. 进行全量溢出检测 + + ```python + from ptdbg_ascend import * + seed_all() + # 配置溢出数据目录路径和名称 + set_dump_path("./overflow_dump") + ... + # 设置检测到3次溢出后退出训练 + register_hook(model, overflow_check) + + ... + ``` + + 2. dump指定反向API的ACL级别的溢出数据 + + ```python + from ptdbg_ascend import * + seed_all() + # 配置溢出数据目录路径和名称 + set_dump_path("./overflow_dump") + ... 
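+ # 说明:下面set_backward_input中的.npy文件为此前训练dump得到的反向API输入文件,请根据实际环境修改路径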
+ # dump指定反向API的ACL级别溢出数据 + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Functional.conv2d.1.backward"]) + set_backward_input(["./npu_dump/ptdbg_dump_v4.0/step0/rank0/dump/Functional.conv2d.1.backward.input.0.npy"]) + ``` + + 针对前向溢出API,可以通过overflow_nums,配置允许的溢出次数,并将每次溢出API的全部ACL数据dump下来,到达指定溢出次数后停止,停止后会看到堆栈打印包含如下字段。 + + ```bash + ValueError: [overflow xxx times]: dump file is saved in '*.pkl'. + ``` + + 其中xxx times为用户设置的次数,*.pkl为文件生成路径。 + +3. NPU环境下执行训练dump溢出数据。 + + 针对输入正常但输出存在溢出的API,会训练执行目录下将溢出的API信息dump并保存为`forward_info_{pid}.json`和`backward_info_{pid}.json`,通过 [Ascend模型精度预检工具](https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/api_accuracy_checker)对json文件进行解析,输出溢出API为正常溢出还是非正常溢出,从而帮助用户快速判断。 + + 精度预检工具执行命令如下: + + ```bash + # 下载att代码仓后执行如下命令 + export PYTHONPATH=$PYTHONPATH:$ATT_HOME/debug/accuracy_tools/ + cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut + python run_overflow_check.py -forward ./forward_info_0.json + ``` + + 反向过程溢出的API暂不支持精度预检功能。 + + 当重复执行溢出检测dump操作时,需要删除上一次dump目录下的溢出检测dump数据,否则将因重名而报错。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 +* 混合精度动态loss scale场景下,正常训练会有"Gradient overflow. SKipping step"日志,添加溢出检测后日志消失,可以通过设置环境变量export OVERFLOW_DEBUG_MODE_ENABLE=1,并将register_hook位置调整amp.initialize之前解决。此功能需要cann包配套支持,不支持版本执行报错EZ3003。 + +## debugger方式dump和溢出检测(推荐) + +### PrecisionDebugger模块 + +**功能说明** + +PrecisionDebugger模块包含dump和溢出检测功能的总体配置项。可以指定dump目录,设置dump或溢出检测功能,指定dump的卡和迭代。 + +可以在from ptdbg_ascend import *和模型初始化之间的任意位置添加该模块。 + +**原型** + +```python +PrecisionDebugger(dump_path=None, hook_name=None, rank=None, step=[], enable_dataloader=False, model=None): +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| dump_path | 设置dump数据目录路径,参数示例:"./dump_path"。数据类型:str。
默认在dump_path目录下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当**configure_hook**函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。
未配置dump_path时,也可以通过环境变量ASCEND_WORK_PATH配置dump路径,此时dump数据将落盘在${ASCEND_WORK_PATH}/dump_data下;dump_path和环境变量至少需要配置一项,同时配置时自定义的dump_path优先级高于环境变量。 | 否 |
配置该参数时,PrecisionDebugger模块请在模型实例化之后调用。数据类型:torch.nn.Module。
该模式不支持“溢出检测”、”ACL级别数据dump“和“模块级精度数据dump”。此模式下dump文件名前缀为网络中定义的模块名或层名。 | 否 | + +#### init dump模式示例代码和数据落盘说明 + +**示例代码** + +```python +import os +import torch +import torch.nn as nn +import torch_npu +from ptdbg_ascend import * + +torch.npu.set_device("npu:0") + + +class Net(nn.Module): + + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=2) + self.relu1 = nn.ReLU() + self.bn1 = nn.BatchNorm2d(16) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + output = self.relu1(x) + return output + +if __name__ == "__main__": + net = Net().npu() + # model参数传入net, 开启init dump 功能 + debugger = PrecisionDebugger(dump_path="./dump", hook_name="dump", model=net) + debugger.configure_hook(mode="api_stack") + debugger.start() + x = torch.randn(1, 1, 28, 28).npu() + out = net(x) + loss = out.sum() + loss.backward() + debugger.stop() +``` + +**落盘数据说明** + +该模式下dump数据命名格式为:`{Layer_name}.{Module_name}.{call_num}.{forward/backward}.{input/output}.npy` + +``` +# 按照上述用例代码进行dump,落盘数据命名示例如下: +conv1.Conv2d.0.forward.input.0.npy +conv1.Conv2d.0.forward.output.npy +relu1.ReLU.0.forward.input.0.npy +....... +bn1.BatchNorm2d.0.backward.output.2.npy +``` + +### configure_hook函数(可选) + +**功能说明** + +设置dump范围。 + +建议在**PrecisionDebugger**模块与模型初始化之间的任意位置添加,不添加此函数时默认使用mode="api_stack" dump整网数据。 + +**原型** + +dump: + +```python +debugger.configure_hook(mode="api_stack", scope=[], api_list=[], filter_switch="OFF", acl_config=None, backward_input=[], input_output_mode=["all"], summary_only=False, summary_mode="all") +``` + +溢出检测: + +```python +debugger.configure_hook(mode=None, acl_config=None, overflow_nums=1, need_replicate=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为"api_stack"。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。数据类型:str。 | 否 | +| scope或api_list | dump范围。根据model配置的模式选择dump的API范围,mode="api_list"时,需要配置api_list=[],其他模式有需要时配置scope=[]。参数示例:scope=["Tensor.permute.1.forward", "Tensor.transpose.2.forward"]、api_list=["relu"]。默认为空。数据类型:List[str]。 | 否 | +| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"(表示开启过滤,即不dump)或"OFF"(表示关闭过滤)。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。数据类型:str。 | 否 | +| acl_config | acl dump的配置文件。mode="acl"时,该参数必选;mode为其他值时,该参数不选。参数示例:acl_config='./dump.json'。dump.json配置文件详细介绍请参见“**dump.json配置文件说明**”。数据类型:str。 | 否 | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional.conv2d.1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional.conv2d.1、backward和input字段的.npy文件。数据类型:str。 | 否 | +| input_output_mode | dump数据过滤。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的.npy文件。参数示例input_output_mode=["backward"]或input_output_mode=["forward", "backward"]。默认为["all"],即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。数据类型:list。 | 否 | +| summary_only | dump npy文件过滤,可取值True或False,配置为True后仅dump保存API统计信息的pkl文件,参数示例:summary_only=False,默认为False。数据类型:bool。 | 否 | +| summary_mode | 控制dump文件输出的模式,可取值md5(dump仅输出包含md5值的pkl文件,用于验证数据的完整性)、summary(dump仅输出包含API统计信息的pkl文件)、all(dump输出包含API统计信息的pkl文件以及具体的npy文件),参数示例:summary_mode="md5",默认为"all"。summary_only=True时,不允许配置该参数。数据类型:str。 | 否 | +| overflow_nums | 
+| need_replicate | 过程dump数据生成开关,执行溢出检测时,dump目录下会生成forward_real_data和backward_real_data的过程dump数据目录,可取值True(生成)或False(不生成),默认不生成。数据类型:bool。 | 否 |
+
+**函数示例**
+
+configure_hook可配置多种dump模式,示例如下:
+
+说明:
+
+以下均以dump部分API数据为例,API名可以从首次dump整网数据的结果文件中的NPU Name或Bench Name列获取。
+
+以下仅为该函数配置示例,完整代码请参见“**示例代码**”章节。
+
+- 示例1:dump指定API列表
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  debugger.configure_hook(mode="list", scope=["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"])
+  ```
+
+- 示例2:dump指定范围
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  debugger.configure_hook(mode="range", scope=["Tensor.abs.1.forward", "Tensor.transpose.3.forward"])
+  ```
+
+- 示例3:STACK模式,只dump堆栈信息
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  debugger.configure_hook(mode="stack", scope=["Tensor.abs.1.forward", "Tensor.transpose.3.forward"])
+  ```
+
+- 示例4:dump指定前向API的ACL级别数据
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  debugger.configure_hook(mode="acl", scope=["Tensor.permute.1.forward"], acl_config="./dump.json")
+  ```
+
+- 示例5:dump指定反向API的ACL级别数据
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  debugger.configure_hook(mode="acl", scope=["Functional.conv2d.1.backward"], acl_config="./dump.json", backward_input=["./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump/Functional.conv2d.1.backward.input.0.npy"])
+  ```
+
+- 示例6:dump指定某一类API的API级别输入输出数据
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  debugger.configure_hook(mode="api_list", api_list=["relu"])
+  ```
+
+  mode="api_list"时不配置scope。
+
+- 示例7:dump全部API级别输入输出数据以及相应堆栈信息
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  debugger.configure_hook(mode="api_stack")
+  ```
+
+  mode="api_stack"时不配置scope。
+
+- 示例8:dump全部API级别输入输出数据,并包含bool和整型的tensor以及浮点、bool和整型的标量(filter_switch配置为"OFF",即不过滤上述数据)
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  debugger.configure_hook(filter_switch="OFF")
+  ```
+
+  配置filter_switch="OFF"时也可以同时配置mode、scope和api_list,但dump ACL级别数据(mode="acl")场景除外。
+
+- 示例9:仅保存dump的数据文件名包含“backward”的反向.npy文件
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  debugger.configure_hook(input_output_mode=["backward"])
+  ```
+
+- 示例10:仅dump pkl文件
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  debugger.configure_hook(summary_only=True)
+  ```
+
+- 示例11:溢出检测dump
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="overflow_check", step=[0])
+  debugger.configure_hook(overflow_nums=1)
+  ```
+
+  dump执行时会在**PrecisionDebugger**模块的dump_path参数指定的目录下生成ptdbg_dump_{version}目录,保存溢出数据。
+
+  多卡场景时,只要检测到至少一张卡的溢出次数达到overflow_nums,训练即结束。
+
+  仅支持NPU环境。
+
+- 示例12:dump溢出API的ACL级别数据
+
+  ```python
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="overflow_check", step=[0])
+  debugger.configure_hook(mode="acl", acl_config="./dump.json")
+  ```
+
+  该场景会在原有数据基础上,额外在dump.json文件配置的dump_path目录下生成一份ACL算子数据,该数据可通过“**ptdbg_ascend.parse**”工具进行解析。
+
+  仅支持NPU环境。
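+
+作为补充,下面给出一个将configure_hook与后文的start、stop、step函数组合起来的最小示意脚本。该脚本仅为示意,网络结构、路径与step取值均为假设值,实际用法请以本文各章节说明为准:
+
+```python
+from ptdbg_ascend import *
+import torch
+import torch.nn as nn
+
+# 初始化debugger并设置dump范围(路径与step为假设值)
+debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+debugger.configure_hook(mode="api_stack")
+
+net = nn.Linear(8, 4)  # 假设的简单网络,实际替换为真实模型
+for data in [torch.randn(2, 8)]:  # 假设的数据加载循环
+    debugger.start()   # dump开始
+    out = net(data)
+    loss = out.sum()
+    loss.backward()
+    debugger.stop()    # dump结束
+    debugger.step()    # 标识一个step结束
+```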
+### start函数(可选)
+
+**功能说明**
+
+dump或溢出检测启动函数。
+
+在模型初始化之后的任意位置添加。
+
+**原型**
+
+```python
+debugger.start()
+```
+
+该函数为类函数,可以使用debugger.start()也可以使用PrecisionDebugger.start()。
+
+### stop函数(可选)
+
+**功能说明**
+
+dump或溢出检测停止函数。
+
+在**start**函数之后的任意位置添加。
+
+**原型**
+
+```python
+debugger.stop()
+```
+
+该函数为类函数,可以使用debugger.stop()也可以使用PrecisionDebugger.stop()。
+
+### 示例代码(自动模式)
+
+**需要保证用户训练代码是通过torch.utils.data.dataloader方式加载数据。**
+
+- 示例1:开启dump
+
+  ```python
+  from ptdbg_ascend import *
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0,2], enable_dataloader=True)
+  # 请勿将以上初始化流程插入到循环代码中
+  ```
+
+- 示例2:开启溢出检测dump
+
+  ```python
+  from ptdbg_ascend import *
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="overflow_check", step=[0,2], enable_dataloader=True)
+  # 请勿将以上初始化流程插入到循环代码中
+  ```
+
+### 示例代码(手动模式)
+
+一般情况下使用自动模式可以快速方便地进行dump操作,但个别大模型可能在部分卡的训练操作中没有调用dataloader,这会导致自动模式无法dump指定迭代的数据,此时需要关闭自动模式,手动在迭代前后插入start()和stop()函数,并在最后一个stop函数后或一个step结束的位置添加debugger.step()以标识dump结束。
+
+- 示例1:开启dump
+
+  ```python
+  from ptdbg_ascend import *
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", step=[0])
+  # 请勿将以上初始化流程插入到循环代码中
+
+  # 模型初始化
+  # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop()
+  debugger.start()
+
+  # 需要dump的代码片段1
+
+  debugger.stop()
+  debugger.start()
+
+  # 需要dump的代码片段2
+
+  debugger.stop()
+  debugger.step()
+  ```
+
+- 示例2:开启溢出检测dump
+
+  ```python
+  from ptdbg_ascend import *
+  debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="overflow_check", step=[0])
+  # 请勿将以上初始化流程插入到循环代码中
+
+  # 模型初始化
+  # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop()
+  debugger.start()
+
+  # 需要dump的代码片段1
+
+  debugger.stop()
+  debugger.start()
+
+  # 需要dump的代码片段2
+
+  debugger.stop()
+  debugger.step()
+  ```
+
+## register_hook方式dump和溢出检测
+
+### 总体说明
+
+- 本节主要介绍CPU或GPU及NPU精度数据dump和溢出检测所需要的函数以及示例。
+
+- ptdbg_ascend工具默认情况下仅dump PyTorch模型的API输入输出数据进行精度比对,若在比对结果中发现某个API下可能存在ACL的精度问题,那么可以选择dump该API的ACL级别数据进行精度分析。
+
+- 某些torch api的输出不是Tensor类型的数据。对于此类API的反向过程进行ACL dump,工具会在运行日志中给出对应的Warning(is not of tensor type and cannot be automatically derived)提示。如若想要进行该类API反向ACL dump,可以通过手动构建单API用例的方式进行ACL dump,具体用例可参见“**[反向ACL dump用例说明](https://gitee.com/ascend/att/blob/master/debug/accuracy_tools/ptdbg_ascend/doc/%E5%8F%8D%E5%90%91ACL%20dump%E7%94%A8%E4%BE%8B%E8%AF%B4%E6%98%8E.md)**”。
+
+- 工具性能:dump数据量较小时(小于5G),参考dump速度0.1GB/s;dump数据量较大时,参考dump速度0.2GB/s。
+  推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。
+
+  用户环境性能弱于标准约束或非独占使用时,dump速度酌情向下浮动。Dump速度的计算方式:Dump数据量/(单个step添加Dump耗时-原始单个step耗时)。
+
+### 约束
+
+- 进行CPU或GPU数据dump时,请安装torch包而非torch_npu包,避免工具无法识别使用场景,导致失败。
+
+- TASK_QUEUE_ENABLE环境变量会导致API下发和执行异步进行,因此在ACL dump前需要将TASK_QUEUE_ENABLE关闭,即export TASK_QUEUE_ENABLE=0。
+
+- 不建议在PyTorch训练脚本中同时添加dump接口和性能数据采集(如Ascend PyTorch Profiler)接口,二者可能相互影响导致数据不准确。
+
+### seed_all
+
+**功能说明**
+
+固定随机数。通过固定随机数保证模型的输入或输出一致。在训练主函数开始前调用,避免随机数固定不全。
+
+使用from ptdbg_ascend import *后自动导入该函数,代码无需再次添加,若需要修改随机数种子和确定性计算模式,则需要通过添加该函数修改。
+
+**函数原型**
+
+```python
+seed_all(seed=1234, mode=False)
+```
+
+**参数说明**
+
+| 参数名 | 说明 | 是否必选 |
+| ------ | ------------------------------------------------------------ | -------- |
+| seed | 随机数种子。参数示例:seed=1000。默认值为:1234。数据类型:int。 | 否 |
+| mode | 确定性计算模式。可配置True或False。参数示例:mode=True。默认为False。数据类型:bool。
即使在相同的硬件和输入下,API多次执行的结果也可能不同,开启确定性计算是为了保证在相同的硬件和输入下,API多次执行的结果相同。
确定性计算会导致API执行性能降低,建议在发现模型多次执行结果不同的情况下开启。
rnn类算子、ReduceSum、ReduceMean等算子可能与确定性计算存在冲突,若开启确定性计算后多次执行的结果不相同,则考虑存在这些算子。 | 否 |
+
+**函数示例**
+
+seed_all函数的随机数种子,取默认值即可,无须配置;第二个参数默认关闭,不开启确定性计算时也无须配置。
+
+- 示例1:仅固定随机数,不开启确定性计算
+
+  ```python
+  seed_all()
+  ```
+
+- 示例2:固定随机数,开启确定性计算
+
+  ```python
+  seed_all(mode=True)
+  ```
+
+**固定随机数范围**
+
+seed_all函数可固定随机数的范围如下表。
+
+| API | 固定随机数 |
+| ---------------------------------------- | --------------------------- |
+| os.environ['PYTHONHASHSEED'] = str(seed) | 禁止Python中的hash随机化 |
+| random.seed(seed) | 设置random随机生成器的种子 |
+| np.random.seed(seed) | 设置numpy中随机生成器的种子 |
+| torch.manual_seed(seed) | 设置当前CPU的随机种子 |
+| torch.cuda.manual_seed(seed) | 设置当前GPU的随机种子 |
+| torch.cuda.manual_seed_all(seed) | 设置所有GPU的随机种子 |
+| torch_npu.npu.manual_seed(seed) | 设置当前NPU的随机种子 |
+| torch_npu.npu.manual_seed_all(seed) | 设置所有NPU的随机种子 |
+| torch.backends.cudnn.enabled=False | 关闭cuDNN |
+| torch.backends.cudnn.benchmark=False | cuDNN确定性地选择算法 |
+| torch.backends.cudnn.deterministic=True | cuDNN仅使用确定性的卷积算法 |
+
+需要保证CPU或GPU以及NPU的模型输入完全一致,dump数据的比对才有意义,seed_all并不能保证模型输入完全一致,如下表所示场景需要保证输入的一致性。
+
+| 场景 | 固定方法 |
+| --------------- | ------------- |
+| 数据集的shuffle | 关闭shuffle。 |
+| dropout | 关闭dropout。 |
+
+关闭shuffle示例:
+
+```python
+train_loader = torch.utils.data.DataLoader(
+    train_dataset,
+    batch_size = batch_size,
+    shuffle = False,
+    num_workers = num_workers
+)
+```
+
+关闭dropout:
+
+在使用from ptdbg_ascend import *后,工具会自动将torch.nn.functional.dropout、torch.nn.functional.dropout2d、torch.nn.functional.dropout3d、torch.nn.Dropout、torch.nn.Dropout2d、torch.nn.Dropout3d的接口参数p置为0。
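+
+作为理解参考,上文“固定随机数范围”表格所列的操作大致等价于如下示意实现(仅为示意,mode参数的具体行为请以工具内seed_all的实际实现为准):
+
+```python
+import os
+import random
+import numpy as np
+import torch
+
+def seed_all_sketch(seed=1234, mode=False):
+    os.environ['PYTHONHASHSEED'] = str(seed)  # 禁止Python中的hash随机化
+    random.seed(seed)                         # 固定random/numpy/torch的随机种子
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.enabled = False      # 关闭cuDNN的非确定性行为
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = True
+    try:
+        import torch_npu                      # NPU环境下同时固定NPU随机种子
+        torch_npu.npu.manual_seed(seed)
+        torch_npu.npu.manual_seed_all(seed)
+    except ImportError:
+        pass
+    # mode=True时开启确定性计算(此处以PyTorch公开接口示意)
+    torch.use_deterministic_algorithms(mode)
+```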
+### set_dump_path
+
+**功能说明**
+
+设置数据保存目录。建议在seed_all函数之后调用且需要保证训练进程能够调用该函数;多卡时须保证每个进程都能调用该函数。
+
+**函数原型**
+
+```python
+set_dump_path(fpath=None, dump_tag='ptdbg_dump')
+```
+
+**参数说明**
+
+| 参数名 | 说明 | 是否必选 |
+| -------- | ------------------------------------------------------------ | -------- |
+| fpath | 设置数据目录路径。参数示例:'./dump_path'。数据类型:str。
默认在fpath指定的目录下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当set_dump_switch函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。
未配置fpath时,也可以通过环境变量ASCEND_WORK_PATH配置dump路径,此时数据将落盘在${ASCEND_WORK_PATH}/dump_data下,自定义配置fpath优先级高于环境变量,fpath和环境变量需要二选一。 | 否 |
+| dump_tag | 设置数据目录名称。参数示例:dump_tag='dump_conv2d'。默认数据目录命名为ptdbg_dump_{version}。数据类型:str。
{version}为当前安装ptdbg_ascend工具版本。目录结构参见“**dump数据存盘说明**”。
配置该参数会将生成的`ptdbg_dump_{version}`目录名称变更为dump_tag配置的值,如`dump_conv2d_{version}`。 | 否 |
+
+**函数示例**
+
+- 示例1:设置数据目录路径
+
+  ```python
+  set_dump_path('./dump_path')
+  ```
+
+- 示例2:设置数据目录名称
+
+  ```python
+  set_dump_path('./dump_path', dump_tag='dump_conv2d')
+  ```
+
+若以相同的数据目录多次dump,则会因同名导致覆盖;多次dump建议配置不同的dump_tag。
+
+### register_hook
+
+**功能说明**
+
+注册工具钩子函数。在set_dump_path之后调用。
+
+dump操作必选。
+
+**函数原型**
+
+```python
+register_hook(model, hook, overflow_nums=overflow_nums, dump_mode=dump_mode, dump_config=dump_config_file)
+```
+
+**参数说明**
+
+| 参数名 | 说明 | 是否必选 |
+| ------------- | ------------------------------------------------------------ | -------- |
+| model | 传入网络模型实例化的对象。参数示例: model=net,net为网络模型实例化的对象名称。数据类型:torch.nn.Module。 | 是 |
+| hook | 注册工具的dump和溢出检测钩子。可取值overflow_check(表示溢出检测)和acc_cmp_dump(表示dump数据),二选一。数据类型:Callable。 | 是 |
+| overflow_nums | 控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止,配置为-1时,表示持续检测溢出直到训练结束。数据类型:int。 | 否 |
+| dump_mode | 控制针对溢出API的dump模式,可取值"acl"或"api"。配置acl时,表示dump ACL级别的溢出数据,此时set_dump_path参数不生效,dump数据目录由dump_config的.json文件配置。参数示例:dump_mode="acl"。默认不配置,即dump API级别的溢出数据。数据类型:str。 | 否 |
+| dump_config | acl dump的配置文件。dump_mode="acl"时,该参数必选;dump_mode="api"时,该参数不选。参数示例:dump_config='./dump.json'。数据类型:str。 | 否 |
+
+**函数示例**
+
+- 示例1:注册工具钩子函数
+
+  ```python
+  register_hook(model, acc_cmp_dump)
+  ```
+
+- 示例2:dump指定API的ACL级别数据
+
+  ```python
+  register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json')
+  ```
+
+  需要配置set_dump_switch的mode="acl"以及scope指定为前向或反向API,请参见“**set_dump_switch**”的示例。
+
+  该场景set_dump_path不生效,由dump_config中的dump.json文件配置dump数据目录。
+
+- 示例3:溢出检测dump
+
+  ```python
+  register_hook(model, overflow_check, overflow_nums=3)
+  ```
+
+  dump执行时会在set_dump_path的fpath参数指定的目录下生成ptdbg_dump_{version}目录,保存溢出数据。
+
+  多卡场景时,只要检测到至少一张卡的溢出次数达到overflow_nums,训练即结束。
+
+  仅支持NPU环境。
+
+- 示例4:dump指定API的ACL级别溢出数据
+
+  ```python
+  register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json')
+  ```
+
+  该场景会在原有数据基础上,额外在dump.json文件配置的dump_path目录下生成一份ACL算子数据,该数据可通过“**ptdbg_ascend.parse**”工具进行解析。
+
+  仅支持NPU环境。
+
+### set_dump_switch
+
+**功能说明**
+
+设置dump范围。建议在register_hook函数之后的脚本内任意位置插入,但进行精度问题排查建议参照“场景化示例 > 单卡场景精度比对”章节的顺序,先从第一个迭代开始的位置调用并dump整网数据。
+
+dump操作必选。
+
+**函数原型**
+
+```python
+set_dump_switch(switch, mode="all", scope=[], api_list=[], filter_switch="OFF", dump_mode=["all"], summary_only=False)
+```
+
+**参数说明**
+
+| 参数名 | 说明 | 是否必选 |
+| --------------- | ------------------------------------------------------------ | -------- |
+| switch | dump开关。可取值"ON"或"OFF"。须在选定dump开始的位置配置set_dump_switch("ON");dump结束的位置设置set_dump_switch("OFF")。数据类型:str。 | 是 |
+| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为"all"。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。数据类型:str。 | 否 |
+| scope或api_list | dump范围。根据mode配置的模式选择dump的API范围。参数示例:scope=["Tensor.permute.1.forward", "Tensor.transpose.2.forward"]、api_list=["relu"]。默认为空。数据类型:List[str]。 | 否 |
+| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"或"OFF"。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。数据类型:str。 | 否 |
+| dump_mode | dump数据过滤。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的.npy文件。参数示例dump_mode=["backward"]或dump_mode=["forward", "backward"]。默认为all,即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。数据类型:List[str]。 | 否 |
+| summary_only | dump npy文件过滤,可取值True或False,配置为True后仅dump保存API统计信息的pkl文件,参数示例:summary_only=False,默认为False。数据类型:bool。 | 否 |
+
+**推荐配置**
+
+```python
+set_dump_switch("ON", mode="api_stack", filter_switch="OFF")
+```
+
+开启dump数据和堆栈模式,同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量。
+
+**函数示例**
+
+set_dump_switch可配置多种dump模式,示例如下:
+
+说明:以下均以dump部分API数据为例,API名可以从首次dump整网数据的结果文件中的NPU Name或Bench Name列获取。
+
+- 示例1:dump指定API列表
+
+  ```python
+  set_dump_switch("ON", mode="list", scope=["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"])
+  ```
+
+- 示例2:dump指定范围
+
+  ```python
+  set_dump_switch("ON", mode="range", scope=["Tensor.abs.1.forward", "Tensor.transpose.3.forward"])
+  ```
+
+- 示例3:STACK模式,只dump堆栈信息
+
+  ```python
+  set_dump_switch("ON", mode="stack", scope=["Tensor.abs.1.forward", "Tensor.transpose.3.forward"])
+  ```
+
+- 示例4:dump指定前向API的ACL级别数据
+
+  ```python
+  register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json')
+  set_dump_switch("ON", mode="acl", scope=["Tensor.permute.1.forward"])
+  ```
+
+  需要配置register_hook的dump_mode='acl'和dump_config配置文件。
+
+- 示例5:dump指定反向API的ACL级别数据
+
+  ```python
+  register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json')
+  set_dump_switch("ON", mode="acl", scope=["Functional.conv2d.1.backward"])
+  set_backward_input(["./npu_dump/dump_conv2d_v4.0/step0/rank0/dump/Functional.conv2d.1.backward.input.0.npy"])
+  ```
+
+  需要配置register_hook的dump_mode='acl'和dump_config配置文件,并通过set_backward_input设置反向API输入的.npy文件。
+
+- 示例6:dump指定某一类API的API级别输入输出数据
+
+  ```python
+  set_dump_switch("ON", mode="api_list", api_list=["relu"])
+  ```
+
+  mode="api_list"时不配置scope。
+
+- 示例7:dump全部API级别输入输出数据以及相应堆栈信息
+
+  ```python
+  set_dump_switch("ON", mode="api_stack")
+  ```
+
+  mode="api_stack"时不配置scope。
+
+- 示例8:dump全部API级别输入输出数据,并包含bool和整型的tensor以及浮点、bool和整型的标量(filter_switch配置为"OFF",即不过滤上述数据)
+
+  ```python
+  set_dump_switch("ON", filter_switch="OFF")
+  ```
+
+  配置filter_switch="OFF"时也可以同时配置mode、scope和api_list,但dump ACL级别数据(mode="acl")场景除外。
+
+- 示例9:仅保存dump的数据文件名包含“backward”的反向.npy文件
+
+  ```python
+  set_dump_switch("ON", dump_mode=["backward"])
+  ```
+
+- 示例10:仅dump pkl文件
+
+  ```python
+  set_dump_switch("ON", summary_only=True)
+  ```
+
+以上示例均需要在结束dump的位置插入set_dump_switch("OFF")。
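+
+为便于理解register_hook方式下各函数的配合关系,下面给出一个端到端的最小示意脚本(仅为示意,网络与路径均为假设值,CPU或GPU环境请按“约束”章节安装torch而非torch_npu):
+
+```python
+from ptdbg_ascend import *
+import torch
+import torch.nn as nn
+
+seed_all()                        # 固定随机数
+set_dump_path("./dump_path")      # 设置数据保存目录
+
+net = nn.Linear(8, 4)             # 假设的简单网络,实际替换为真实模型
+register_hook(net, acc_cmp_dump)  # 注册dump钩子
+
+set_dump_switch("ON", mode="api_stack", filter_switch="OFF")  # 开始dump(推荐配置)
+out = net(torch.randn(2, 8))
+out.sum().backward()
+set_dump_switch("OFF")            # 结束dump
+```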
+
+set_dump_switch配置mode为all或api_stack时,结束dump后,在dump目录下会自动生成compare_data.py比对脚本模板,示例如下:
+
+```python
+from ptdbg_ascend import compare
+from ptdbg_ascend.common.file_check_util import FileChecker
+import argparse
+import os.path
+
+pkl_path = "%s"
+dump_data_dir = "%s"
+
+parser = argparse.ArgumentParser(description="compare data")
+parser.add_argument("--npu_pkl_path", type=str, default=pkl_path, help="npu保存数据的pkl路径")
+parser.add_argument("--bench_pkl_path", type=str, default=pkl_path, help="对比数据的pkl路径")
+parser.add_argument("--output_path", type=str, default="./", help="导出对比数据的路径")
+
+args = parser.parse_args()
+npu_pkl_path = args.npu_pkl_path
+bench_pkl_path = args.bench_pkl_path
+output_path = args.output_path
+
+suffix = ".pkl"
+npu_path_checker = FileChecker(npu_pkl_path, "file", "read", suffix)
+npu_path_checker.common_check()
+bench_path_checker = FileChecker(bench_pkl_path, "file", "read", suffix)
+bench_path_checker.common_check()
+
+npu_dump_data_dir = npu_pkl_path[:-len(suffix)]
+bench_dump_data_dir = bench_pkl_path[:-len(suffix)]
+if not os.path.exists(npu_dump_data_dir) or not os.path.exists(bench_dump_data_dir):
+    npu_dump_data_dir = ""
+    bench_dump_data_dir = ""
+
+dump_path_param = {
+    "npu_pkl_path": npu_pkl_path,
+    "bench_pkl_path": bench_pkl_path,
+    "npu_dump_data_dir": npu_dump_data_dir,
+    "bench_dump_data_dir": bench_dump_data_dir,
+    "is_print_compare_log": True
+}
+
+compare(dump_path_param, output_path=output_path, stack_mode=%s)
+```
+
+compare_data.py比对脚本模板可以直接使用命令行配置比对参数,不需要通过编辑compare_data.py文件来修改,示例如下:
+
+```bash
+python3 compare_data.py --npu_pkl_path "./npu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl" --bench_pkl_path "./gpu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl" --output_path "./output_path"
+```
+
+该命令行支持--npu_pkl_path、--bench_pkl_path和--output_path三个比对参数,其中pkl_path两个参数配置后,脚本可以自动识别同级目录下的dump_data目录,若同级目录下不存在dump_data目录,则直接执行“**pkl文件比对**”。仅ptdbg_ascend 6.0或更高版本支持比对命令行配置比对参数。更多介绍请参见“**执行比对操作**”。
+
+### set_overflow_check_switch
+
+**功能说明**
+
+设置溢出检测范围。默认不配置该函数,全量进行溢出检测。
+
+仅支持NPU环境。
+
+**函数原型**
+
+```python
+set_overflow_check_switch(switch, filter_switch='OFF')
+```
+
+**参数说明**
+
+| 参数名 | 说明 | 是否必选 |
+| ------------- | ------------------------------------------------------------ | -------- |
+| switch | 检测开关。可取值"ON"或"OFF"。如果只在特定的step溢出检测,则在期望溢出检测的step位置开始前插入set_overflow_check_switch("ON"),在step结束的位置插入set_overflow_check_switch("OFF")。数据类型:str。 | 是 |
+| filter_switch | dump bool和整型的tensor以及浮点、bool和整型的标量的过滤开关。可取值"ON"或"OFF"。参数示例:filter_switch="ON"。默认不配置,即filter_switch="OFF",表示dump上述数据。数据类型:str。 | 否 |
+
+**函数示例**
+
+- 示例1:指定范围溢出检测
+
+  ```python
+  register_hook(model, overflow_check)
+  set_overflow_check_switch("ON")
+
+  ...
+
+  set_overflow_check_switch("OFF")
+  ```
+
+  该场景set_dump_path不生效,dump执行时会在当前目录自动生成ptdbg_dump_{version}目录,保存溢出数据。
+
+- 示例2:前向API的ACL级别范围溢出检测
+
+  ```python
+  register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json')
+  set_overflow_check_switch("ON")
+
+  ...
+
+  set_overflow_check_switch("OFF")
+  ```
+
+  该场景set_dump_path不生效,由dump_config中的dump.json文件配置溢出数据目录。
+
+### set_backward_input
+
+**功能说明**
+
+设置反向ACL级别dump时需要的反向输入的.npy文件。
+
+**函数原型**
+
+```python
+set_backward_input(backward_input)
+```
+
+**参数说明**
+
+| 参数名 | 说明 | 是否必选 |
+| -------------- | ------------------------------------------------------------ | -------- |
+| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional.conv2d.1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional.conv2d.1、backward和input字段的.npy文件。数据类型:str。 | 是 |
+
+**函数示例**
+
+```python
+register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json')
+set_dump_switch("ON", mode="acl", scope=["Functional.conv2d.1.backward"])
+set_backward_input(["./npu_dump/dump_conv2d_v4.0/step0/rank0/dump/Functional.conv2d.1.backward.input.0.npy"])
+```
+
+## dump.json配置文件说明
+
+**dump.json配置示例**
+
+```json
+{
+    "dump":
+    {
+        "dump_list":[],
+        "dump_path":"./dump/output",
+        "dump_mode":"all",
+        "dump_op_switch":"on"
+    }
+}
+```
+
+**dump.json参数说明**
+
+| 字段名 | 说明 |
+| -------------- | ------------------------------------------------------------ |
+| dump_list | 待dump数据的API模型。保持为空即可,无需配置。 |
+| dump_path | dump数据文件存储到运行环境的目录,主要用于指定ACL dump数据路径。支持配置绝对路径或相对路径。dump_path须为已存在目录。 |
+| dump_mode | dump数据模式,配置如下:
output:dump API的输出数据。默认值。
input:dump API的输入数据。
all:dump API的输入、输出数据。 |
+| dump_op_switch | 单API模型dump数据开关,配置如下: * off:关闭单API模型dump,默认值。 * on:开启单API模型dump。 |
+
+**dump目录说明**
+
+配置register_hook的dump_config后,采集的dump数据会在{dump_path}/{time}/{deviceid}/{model_id}目录下生成,例如“/home/HwHiAiUser/output/20200808163566/0/0”。
+
+```bash
├── 20230131172437
│   └── 1
│   ├── 0
│   │   ├── Add.Add.45.0.1675157077183551
│   │   ├── Cast.trans_Cast_0.31.0.1675157077159449
│   │   ├── Cast.trans_Cast_5.43.0.1675157077180129
│   │   ├── MatMul.MatMul.39.0.1675157077172961
│   │   ├── Mul.Mul.29.0.1675157077155731
│   │   ├── NPUAllocFloatStatus.NPUAllocFloatStatus.24.0.1675157077145262
│   │   ├── TransData.trans_TransData_1.33.0.1675157077162791
│   │   └── TransData.trans_TransData_4.41.0.1675157077176648
│   ├── 1701737061
│   │   └── Cast.trans_Cast_2.35.0.1675157077166214
│   ├── 25
│   │   └── NPUClearFloatStatus.NPUClearFloatStatus.26.0.1675157077150342
│   └── 68
│   └── TransData.trans_TransData_3.37.0.1675157077169473
```
+
+## 模块级精度数据dump
+
+### 总体说明
+
+大模型场景下,通常不是简单地利用自动迁移能力实现GPU到NPU的训练脚本迁移,而是会对NPU网络进行一系列针对性的适配,因此,常常会造成迁移后的NPU模型存在部分子结构不能与GPU原始模型完全对应。模型结构不一致导致API调用类型及数量不一致,若直接按照API粒度进行精度数据dump和比对,则无法完全比对所有的API。
+
+本节介绍的功能是对模型中的大粒度模块进行数据dump,使得比对时,对于无法以API粒度比对的模块,可以直接以模块粒度进行比对。
+
+模块指的是继承自nn.Module类的模块,通常情况下这类模块就是一个小模型,可以被视为一个整体,dump数据时以模块为粒度进行dump。
+
+### module_dump
+
+**功能说明**
+
+开启模块级精度数据dump。
+
+模块级精度数据dump时必选。
+
+**函数原型**
+
+```python
+module_dump(module, module_name)
+```
+
+**参数说明**
+
+| 参数名 | 说明 | 是否必选 |
+| ----------- | ------------------------------------------------------------ | -------- |
+| module | 网络中实例化好的nn.Module类对象。数据类型:torch.nn.Module。 | 是 |
+| module_name | 用户自定义的该模块名称。主要用于dump数据文件的命名,便于在比对时识别模块级数据。数据类型:str。 | 是 |
+
+### module_dump_end
+
+**功能说明**
+
+结束模块级精度数据dump。
+
+模块级精度数据dump时必选。
+
+**函数原型**
+
+```python
+module_dump_end()
+```
+
+### 示例代码
+
+```python
+# 根据需要import包
+import os
+import torch
+import torch.nn as nn
+import torch_npu
+import torch.nn.functional as F
+from ptdbg_ascend import *
+
+torch.npu.set_device("npu:0")
+# 定义一个简单的网络
+class ModuleOP(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.linear_1 = nn.Linear(in_features=8, out_features=4)
+        self.linear_2 = nn.Linear(in_features=4, out_features=2)
+    def forward(self, x):
+        x1 = self.linear_1(x)
+        x2 = self.linear_2(x1)
+        r1 = F.relu(x2)
+        return r1
+
+if __name__ == "__main__":
+    module = ModuleOP()
+
+    # 注册工具
+    pdbg = PrecisionDebugger("./dump_data/npu", hook_name="dump")
+    pdbg.start()
+
+    x = torch.randn(10, 8)
+    module_dump(module, "MyModuleOP")  # 开启模块级精度数据dump
+    out = module(x)
+    module_dump_end()  # 结束模块级精度数据dump
+    loss = out.sum()
+    loss.backward()
+    pdbg.stop()
+```
+
+## dump数据存盘说明
+
+dump结果目录结构示例如下:
+
+```bash
├── dump_path
│ └── ptdbg_dump_{version}
│ ├── step0
│ | ├── rank0
│ | │ ├── dump
| | | | ├── Tensor.permute.1.forward.npy
| | | | ├── MyModule.0.forward.input.npy # 开启模块级精度数据dump时存在模块级的dump数据文件
| | | | ...
| | | | └── Functional.linear.5.backward.output.npy
│ | │ └── dump.pkl
│ | ├── rank1
| | | ├── dump
| | | | └── ...
| | | └── dump.pkl
│ | ├── ...
│ | |
| | └── rank7
│ ├── step1
│ | ├── ... 
│ ├── step2
```
+
+dump过程中,npy文件在对应算子或者模块被执行后就会落盘,而pkl文件则需要在正常执行PrecisionDebugger.stop()或set_dump_switch("OFF")后才会被落盘保存,异常的程序终止会保存终止前被执行算子的相关npy文件,但是不会生成pkl文件。
+
+其中`ptdbg_dump_{version}`为默认命名,debugger方式dump不支持修改该文件夹名称,使用set_dump_path函数则支持通过dump_tag参数修改文件夹名称;rank为设备上各卡的ID,每张卡上dump的数据会生成对应dump目录。
+
+**精度比对dump场景**
+
+精度比对dump场景的结果如下:
+
+* dump.pkl文件:包含dump数据的API名称(命名格式为:`{api_type}.{api_name}.{API调用次数}.{前向反向}.{input/output}.{参数序号}`)、dtype、shape、各数据的max、min、mean、L2norm统计信息以及当配置summary_mode="md5"时的md5数据。
+
+  其中,“参数序号”表示该API下的第n个参数,例如1,则为第一个参数,若该参数为list格式,则根据list继续排序,例如1.1,表示该API的第1个参数的第1个子参数;L2norm表示2范数(平方根)。
+
+* dump目录:目录下为npy格式的dump数据。
+
+  npy文件保存的前缀和PyTorch对应关系如下
+
+  | 前缀 | Torch模块 |
+  | ----------- | ------------------- |
+  | Tensor | torch.Tensor |
+  | Torch | torch |
+  | Functional | torch.nn.functional |
+  | NPU | NPU亲和算子 |
+  | VF | torch._VF |
+  | Aten | torch.ops.aten |
+  | Distributed | torch.distributed |
+
+当configure_hook或set_dump_switch配置mode参数(例如:mode="api_stack")时,dump结果的文件名会添加api_stack前缀,dump结果如下:
+
+* api_stack_dump.pkl
+* api_stack_dump目录
+
+**溢出检测dump场景**
+
+PrecisionDebugger模块的hook_name参数或register_hook函数设置了overflow_check时,检测API溢出,dump结果的文件名格式为:`{api_type}.{api_name}.{API调用次数}.{前向反向}.{当前溢出次数}`,dump结果示例如下:
+
+* `Tensor_add_1_forward_1.pkl`
+* `Tensor_add_1_forward_1`目录
+
+## 工具支持的API列表
+
+ptdbg_ascend工具维护固定的API支持列表,若需要删除或增加dump的API,可以在[support_wrap_ops.yaml](../src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml)文件内手动修改,如下示例:
+
+```bash
+functional:  # functional为算子类别,找到对应的类别,在该类别下按照下列格式删除或添加API
+  - conv1d
+  - conv2d
+  - conv3d
+```
+
+## CPU或GPU与NPU精度数据比对
+
+### 总体说明
+
+- 本节主要介绍CPU或GPU与NPU精度数据比对的函数以及示例。
+
+- 比对函数均通过单独创建精度比对脚本执行,可支持单卡和多卡场景的精度数据比对。
+
+- 工具性能:比对数据量较小时(参考值单份文件小于10GB),参考比对速度0.1GB/s;比对数据量较大时,参考比对速度0.3GB/s。
+  推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。
+
+  用户环境性能弱于标准约束或非独占使用时,比对速度酌情向下浮动。比对速度的计算方式:两份比对文件大小/比对耗时。
+
+### 约束
+
+- NPU自研API,在CPU或GPU若没有对应的API,该API的dump数据不比对。
+
+- NPU与CPU或GPU的计算结果误差可能会随着模型的执行不断累积,最终会出现同一个API因为输入的数据差异较大而无法比对的情况。
+
+- CPU或GPU与NPU中两个相同的API会因为调用次数不同导致无法比对或比对到错误的API,不影响整体运行,该API忽略。
+
+### compare_distributed
+
+**功能说明**
+
+将CPU或GPU与NPU的dump文件进行比对,支持单卡和多卡,可同时比对多卡的dump数据。多机场景需要每个设备单独执行比对操作。可自动检索和匹配对应卡和进程所dump的数据文件,再调用compare进行比对。单机单卡时与compare函数二选一。
+
+**函数原型**
+
+```python
+compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs)
+```
+
+**参数说明**
+
+| 参数名 | 说明 | 是否必选 |
+| -------------- | ------------------------------------------------------------ | -------- |
+| npu_dump_dir | 配置NPU环境下的dump目录。dump数据目录须指定到step级。参数示例:'./npu_dump/ptdbg_dump_v4.0/step0'。register_hook方式可通过set_dump_path函数的dump_tag参数修改该目录名称。数据类型:str。 | 是 |
+| bench_dump_dir | 配置CPU、GPU或NPU环境下的dump目录。参数示例:'./gpu_dump/ptdbg_dump_v4.0/step0'。register_hook方式可通过set_dump_path函数的dump_tag参数修改该目录名称。数据类型:str。 | 是 |
+| output_path | 配置比对结果文件存盘目录。需要预先创建output_path目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_rank{npu_ID}-rank{cpu/gpu/npu_ID}_{timestamp}.xlsx`。数据类型:str。 | 是 |
+| **kwargs | 支持compare的所有可选参数。 | 否 |
+
+**函数示例**
+
+创建比对脚本,例如compare_distributed.py,拷贝如下代码,具体参数请根据实际环境修改。
+
+```python
+from ptdbg_ascend import *
+compare_distributed('./npu_dump/ptdbg_dump_v4.0/step0', './gpu_dump/ptdbg_dump_v4.0/step0', './output')
+```
+
+dump数据目录须指定到step级。
+
+### compare
+
+**功能说明**
+
+将CPU或GPU与NPU的dump文件进行比对,仅支持单机单卡。
+
+**函数原型**
+
+```python
+compare(input_param, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False)
+```
+
+**参数说明**
+
+| 参数名 | 说明 | 是否必选 |
+| ------------ | 
------------------------------------------------------------ | -------- | +| input_param | 配置dump数据文件及目录。数据类型:dict。配置参数包括:
"npu_pkl_path":指定NPU dump目录下的.pkl文件。参数示例:"npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl"。必选。
"bench_pkl_path":指定CPU、GPU或NPU dump目录下的.pkl文件。参数示例:"bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl"。必选。
"npu_dump_data_dir":"指定NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump"。可选,仅比对pkl文件时不选。
"bench_dump_data_dir":"指定CPU、GPU或NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump"。可选,仅比对pkl文件时不选。
"is_print_compare_log":配置是否开启日志打屏。可取值True或False。可选。 | 是 | +| output_path | 配置比对结果文件存盘目录。参数示例:"./output_path",默认为"./"。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.xlsx`。数据类型:str。 | 否 | +| stack_mode | 配置stack_mode的开关。仅当dump数据时配置debugger.configure_hook或set_dump_switch的mode="api_stack"时需要开启。可取值True或False,参数示例:stack_mode=True,默认为False。数据类型:bool。 | 否 | +| auto_analyze | 自动精度分析,开启后工具自动针对比对结果进行分析,识别到第一个精度不达标节点(在比对结果文件中的“Accuracy Reached or Not”列显示为No),并给出问题可能产生的原因(打屏展示并生成advisor_{timestamp}.txt文件)。可取值True或False,参数示例:auto_analyze=False,默认为True。数据类型:bool。 | 否 | +| fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。可取值True或False,参数示例:fuzzy_match=True,默认为False。数据类型:bool。 | 否 | + +**函数示例** + +单机单卡场景下创建比对脚本,例如compare.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import compare +dump_result_param={ +"npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", +"bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", +"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", +"bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", +"is_print_compare_log": True +} +compare(dump_result_param, output_path="./output_path", stack_mode=True) +``` + +### pkl文件比对 + +若使用**compare**或**compare_distributed**函数创建的比对脚本中,input_param参数只配置了npu_pkl_path和bench_pkl_path或使用summary_only、summary_mode(取值为md5或summary)方式dump时,可以进行pkl文件的比对,此时比对dump.pkl文件中的统计信息,开启后的比对结果文件生成Max diff、Min diff、Mean diff和L2norm diff,表示NPU dump数据中API的输入或输出与标杆数据输入或输出的最大值、最小值、平均值以及L2范数的差。可以通过该值判断API是否存在精度问题:当某个API的输入和输出的Max diff、Min diff、Mean diff和L2norm diff均为0或无限趋于0,那么可以判断该API无精度问题,反之则可能存在精度问题。 + +**比对脚本示例** + +以compare.py为例。 + +```python +from ptdbg_ascend import compare +dump_result_param={ +"npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", +"bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", +"is_print_compare_log": True +} +compare(dump_result_param, output_path="./output_path", stack_mode=True) +``` + +**比对结果** + +pkl文件比对同样生成`compare_result_{timestamp}.xlsx`和`advisor_{timestamp}.txt`文件。其中`advisor_{timestamp}.txt`主要对`compare_result_{timestamp}.xlsx`中可能存在精度问题(Result为Waring)的API提出定位建议;`compare_result_{timestamp}.xlsx`主要有如下两种情况: + +- configure_hook配置summary_only=True、summary_mode=summary或不配置前面两个参数直接比对pkl文件: + + ![compare_result_pkl](./img/compare_result_pkl.png) + + 上图是对pkl文件中NPU及标杆API的统计信息进行比对,判断可能存在精度问题的API,文件中记录NPU及标杆API的基本信息和统计信息,其中需要关注Result列,包含结果:Waring(NPU与标杆统计信息的比对中存在相对误差大于0.5,则需要重点检查该API);为空(相对误差小于等于0.5,可以不需要重点关注,但不代表不存在精度问题);Nan(表示统计信息数据没有匹配上)。 + 同时,需要关注高亮的API,存在红色或黄色高亮的API被认为可能存在问题,具体高亮准则参见“**比对结果分析 > 异常信息识别**”章节。 + +- configure_hook配置summary_mode=md5: + + ![compare_result_pkl_md5.png](./img/compare_result_pkl_md5.png.png) + + 上图是对pkl文件中NPU及标杆API的MD5信息进行比对,判断API数据的完整性,文件中记录NPU及标杆API的基本信息和MD5信息,其中需要关注Result列,包含结果:Pass(表示NPU与标杆的MD5值一致,即API数据完整);Different(表示NPU与标杆的MD5值不一致,即API数据不完全一致,可以通过NPU_Stack_Info列API调用栈查询该API的详细信息);Nan(表示MD5信息数据没有匹配上)。 + +### parse + +**功能说明** + +解析并提取dump信息中的堆栈信息及数据统计信息。 + +**函数原型** + +```python +parse(pkl_file, module_name_prefix) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------------ | ------------------------------------------------------------ | -------- | +| pkl_file | 指定dump数据文件中的pkl文件名。参数示例:"./npu_dump/ptdbg_dump_v4.0/step0/rank0/dump.pkl"。数据类型:str。 | 是 | +| module_name_prefix | 指定待提取的API接口前缀。参数示例:"Torch.norm.1.forward"。数据类型:str。 | 是 | + +**函数示例** + +创建堆栈信息及数据统计信息提取脚本,例如parse.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * 
+parse("./npu_dump/ptdbg_dump_v4.0/step0/rank0/dump.pkl", "Torch.batch.normal.1.forward") +``` + +### 执行比对操作 + +比对操作通过执行比对脚本启动,根据不同的比对脚本分为如下场景: + +- dump数据时自动生成比对脚本模板,脚本名为compare_data.py,该脚本模板也可以直接手动创建: + + ```python + from ptdbg_ascend import compare + from ptdbg_ascend.common.file_check_util import FileChecker + import argparse + import os.path + + pkl_path = "%s" + dump_data_dir = "%s" + + parser = argparse.ArgumentParser(description="compare data") + parser.add_argument("--npu_pkl_path", type=str, default=pkl_path, help="npu保存数据的pkl路径") + parser.add_argument("--bench_pkl_path", type=str, default=pkl_path, help="对比数据的pkl路径") + parser.add_argument("--output_path", type=str, default="./", help="导出对比数据的路径") + + args = parser.parse_args() + npu_pkl_path = args.npu_pkl_path + bench_pkl_path = args.bench_pkl_path + output_path = args.output_path + + suffix = ".pkl" + npu_path_checker = FileChecker(npu_pkl_path, "file", "read", suffix) + npu_path_checker.common_check() + bench_path_checker = FileChecker(bench_pkl_path, "file", "read", suffix) + bench_path_checker.common_check() + + npu_dump_data_dir = npu_pkl_path[:-len(suffix)] + bench_dump_data_dir = bench_pkl_path[:-len(suffix)] + if not os.path.exists(npu_dump_data_dir) or not os.path.exists(bench_dump_data_dir): + npu_dump_data_dir = "" + bench_dump_data_dir = "" + + dump_path_param = { + "npu_pkl_path": npu_pkl_path, + "bench_pkl_path": bench_pkl_path, + "npu_dump_data_dir": npu_dump_data_dir, + "bench_dump_data_dir": bench_dump_data_dir, + "is_print_compare_log": True + } + + compare(dump_path_param, output_path=output_path, stack_mode=%s) + ``` + + 执行如下命令启动比对操作: + + ```bash + python3 compare_data.py --npu_pkl_path "npu_pkl_path" --bench_pkl_path "bench_pkl_path" --output_path "output_path" + ``` + + 命令行示例:python3 compare_data.py --npu_pkl_path "./npu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl" --bench_pkl_path "./gpu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl" --output_path "./output" + + - 该命令行支持--npu_pkl_path、--bench_pkl_path和--output三个**命令行比对参数**,其中pkl_path两个参数配置后,脚本可以自动识别同级目录下的dump_data目录,若同级目录下不存在dump_data目录,则直接执行“**pkl文件比对**”。 + - **命令行比对参数**的优先级高于compare.py比对脚本内的参数,配置命令行比对参数后,不需要通过编辑compare_data.py文件来修改比对参数。 + - **命令行比对参数**均为可选,但若未配置pkl_path两个参数,则需要在比对脚本中配置。 + - 仅ptdbg_ascend 6.0或更高版本支持**命令行比对参数**。 + + | 参数 | 说明 | 是否必选 | + | ---------------- | ------------------------------------------------------------ | -------- | + | --npu_pkl_path | 指定NPU dump目录下的.pkl文件。参数示例:--npu_pkl_path "./npu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl"。 | 否 | + | --bench_pkl_path | 指定CPU、GPU或NPU dump目录下的.pkl文件。参数示例:--bench_pkl_path "./gpu_dump/ptdbg_dump_v6.0/step0/rank0/api_stack_dump.pkl" | 否 | + | --output_path | 配置比对结果文件存盘目录。参数示例:--output_path "./output",默认为"./"。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.xlsx`。 | 否 | + +- 手动创建比对脚本,自定义脚本名为compare.py: + + ```python + from ptdbg_ascend import compare + dump_result_param={ + "npu_pkl_path": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", + "bench_pkl_path": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump.pkl", + "npu_dump_data_dir": "./npu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", + "bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v4.0/step0/rank0/api_stack_dump", + "is_print_compare_log": True + } + compare(dump_result_param, output_path="./output_path", stack_mode=True) + ``` + + 执行如下命令启动比对操作: + + ```bash + python3 compare.py + ``` + + 比对完成后在output_path输出比对结果文件,包括:`compare_result_{timestamp}.xlsx`和`advisor_{timestamp}.txt` + + 
  比对结果详细分析请参见“**比对结果分析**”。
+
+### 比对结果分析
+
+PyTorch精度比对是以CPU或GPU的计算结果为标杆,通过计算精度评价指标判断API在运行时是否存在精度问题。
+
+- `advisor_{timestamp}.txt`文件中给出了可能存在精度问题的API的专家建议,可直接打开查看。
+
+- `compare_result_{timestamp}.xlsx`文件列出了所有执行精度比对的API详细信息和比对结果,如下示例:
+
+  ![compare_result](./img/compare_result.png)
+
+  可以从该结果文件中进行“**判断计算精度达标情况**”、“**计算精度评价指标分析**”以及“**异常信息识别**”等分析动作。
+
+**判断计算精度达标情况**
+
+精度比对结果`compare_result_{timestamp}.xlsx`文件中只需要通过Accuracy Reached or Not来判断计算精度是否达标,判断标准如下:
+
+1. Cosine < 0.99 且 MaxAbsError > 0.001时,精度不达标,标记为“No”。
+2. Cosine < 0.9,精度不达标,标记为“No”。
+3. MaxAbsError > 1,精度不达标,标记为“No”。
+4. 其余情况下记为精度达标,标记为“Yes”。
+
+**计算精度评价指标分析**
+
+1. Cosine:通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0。
+
+2. MaxAbsErr:当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001。
+
+3. MaxRelativeErr:当最大相对误差越接近0表示其计算的误差越小。
+
+   当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象。
+
+4. One Thousandth Err Ratio(双千分之一)、Five Thousandths Err Ratio(双千分之五)精度指标:是指NPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差大于千分之一、千分之五的元素个数占总元素个数的比例小于千分之一、千分之五。该数据仅作为精度下降趋势的参考,并不参与计算精度是否通过的判定。
+
+**异常信息识别**
+
+精度比对结果`compare_result_{timestamp}.xlsx`文件中对于存在异常信息的API会进行高亮处理:
+
+- 红色可能出现的情况有:
+  - NPU max或NPU min信息中存在nan/inf
+  - Max diff存在大于1e+10的值
+  - 统计数据中Max diff除以Bench max > 0.5
+  - 真实数据中One Thousandth Err Ratio的input > 0.9同时output < 0.6
+
+- 黄色可能出现的情况有:
+  - Max diff的输出与输入存在数量级差
+  - 统计数据Max diff除以Bench max的input > 0.1同时input < 0.01
+  - 真实数据One Thousandth Err Ratio的output - input > 0.1
+  - 真实数据Cosine的output - input > 0.1
+
+## ptdbg_ascend.parse数据解析功能
+
+ptdbg_ascend.parse为命令行交互式界面解析工具,提供更多的数据解析功能并且展示结果。
+
+使用场景:本工具主要用于比对前后两次NPU ACL层级dump数据的一致性。
+
+### 进入parse交互式界面
+
+安装ptdbg_ascend工具后,可以通过使用命令 **python -m ptdbg_ascend.parse** 进入交互式界面,如下所示:
+
+```bash
+python -m ptdbg_ascend.parse
+Parse >>>
+```
+
+可在parse的界面中执行Shell命令,以及如下场景的相关解析命令:
+
+- 支持指定ACL层级算子数据比对。
+- 支持指定ACL层级算子数据转换及展示。
+- 支持交互式指定pkl文件中API对应dump数据查看。
+- 支持API进行可选层级比对和打印(统计级和像素级)。
+
+Ctrl+C可以退出parse交互式界面。若不退出parse交互式界面,需要执行与parse交互式界面命令冲突的Shell命令时,则需要使用run命令,即在相关命令前加上run前缀,如下示例:
+
+```bash
+python -m ptdbg_ascend.parse
+Parse >>> run vim cli.py
+Parse >>> vim cli.py
+```
+
+以上各场景详细介绍请参见下文章节。
+
+### ACL层级算子数据批量转换
+
+本功能会将原有待比对dump数据目录下的dump数据按照算子名和时间戳进行梳理并分类,之后再将dump数据转换为npy文件。
+
+依赖:CANN包中的msaccucmp工具,需要安装Ascend-CANN-toolkit,详见《[CANN 软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)》。
+
+输入以下转换命令进行数据转换。
+
+```bash
+cad -m my_dump_path [-out output_path] [-asc msaccucmp_path]
+```
+
+| 参数名称 | 说明 | 是否必选 |
+| -------- | ------------------------------------------------------------ | -------- |
+| -m | 待转换ACL dump数据目录。需要指定到ACL dump数据的deviceid级目录。 | 是 |
+| -out | 结果输出目录,须指定已存在的目录,默认为./parse_data/acl_batch_convert。未指定时保存在默认路径下,转换结束后会打印log提示输出结果存放路径。 | 否 |
+| -asc | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py。 | 否 |
+
+**示例**
+
+```bash
+# 传入待比对数据目录
+Parse >>> cad -m /home/xxx/my_dump_path/20000124003856/0
+# 转换结果打印
+......
+╭──────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +# 转换前的dump文件 +│ SrcFile: /home/xxx/my_dump_path/20000124003856/0/272/TransData.trans_TransData_22.112.21.948645536672764 │ +# 转换后的npy文件 +│ - TransData.trans_TransData_22.112.21.948645536672764.output.0.npy │ +│ - TransData.trans_TransData_22.112.21.948645536672764.input.0.npy │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +...... +[INFO] The comparison result have been written to "./parse_data/acl_batch_convert". +``` + +输出结果: + +原dump数据目录: + +```bash +├── /home/xxx/my_dump_path/20000124003856/0/ +│ ├── 272 +│ │ ├── {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp} +│ │ ... +│ ├── 512 +│ ... +``` + +转换后: + +```bash +├── ./parse_data/acl_batch_convert/{timestamp} +│ ├── {op_name1} +│ │ ├── {timestamp1} +│ │ | ├── {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input/output}.{参数序号}.npy +│ │ | │ ... +│ │ ├── {timestamp2} +│ │ | ... +│ ├── {op_name2} +│ ├── ... +``` + +### ACL层级算子数据比对 + +本功能主要用于比对前后两次NPU ACL层级dump数据的一致性。 + +本功能支持批量比对,若需要进行批量比对,需要先将两份待比对的NPU ACL层级dump数据进行“**ACL层级算子数据批量转换**”,可以使两份数据更好的匹配;若直接进行dump数据的比对,建议只比对单个dump数据文件。 + +输入以下比对命令进行数据比对。 + +```bash +vc -m my_dump_path -g golden_dump_path [-out output_path] [-cmp_path msaccucmp_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| --------- | ------------------------------------------------------------ | -------- | +| -m | 待比对ACL dump数据目录。如果比对单个算子,需要指定到ACL dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | +| -g | 标杆ACL dump数据目录。如果比对单个算子,需要指定到ACL dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | +| -out | 结果输出目录,须指定已存在的目录,默认为./parse_data/acl_batch_comapre。未指定时保存在默认路径下,比对结束后会打印log提示输出结果存放路径。 | 否 | +| -cmp_path | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py | 否 | + +输出结果:batch_compare_{timestamp}.csv文件。 + +**示例** + +```bash +# 传入待比对数据目录以及标杆数据目录 +Parse >>> vc -m ./my_dump_path -g ./golden_data_path +[INFO]Compare result is saved in : parse_data/acl_batch_comapre/batch_compare_1707271118.csv +``` + +### ACL算子数据的npy转换 + +依赖:CANN包中的msaccucmp工具,需要安装Ascend-CANN-toolkit,详见《[CANN 软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)》。 + +输入以下转换命令进行数据转换, 将ACL级别dump数据转为npy文件。 + +```bash +dc -n file_name/file_path [-f format] [-out output_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| --------- | ------------------------------------------------------------ | -------- | +| -n | 需转换的dump数据文件或dump数据文件目录。 | 是 | +| -f | 开启format转换,指定该参数时需要配置format格式。当前内置的Format转换支持如下类型:
FRACTAL_NZ转换成NCHW
FRACTAL_NZ转换成NHWC
FRACTAL_NZ转换成ND
HWCN转换成FRACTAL_Z
HWCN转换成NCHW
HWCN转换成NHWC
NC1HWC0转换成HWCN
NC1HWC0转换成NCHW
NC1HWC0转换成NHWC
NCHW转换成FRACTAL_Z
NCHW转换成NHWC
NHWC转换成FRACTAL_Z
NHWC转换成HWCN
NHWC转换成NCHW
NDC1HWC0转换成NCDHW | 否 |
+| -out | 结果输出目录。 | 否 |
+| -cmp_path | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py | 否 |
+
+说明:若传入单个dump文件,则转换单个文件;若传入dump文件目录,则转换目录下所有dump文件。
+
+- 输出结果:npy文件。
+- 若指定-out参数需要用户传入输出路径,并且路径需要已存在。
+- 若未指定输出目录,则转换结束后将结果保存在默认目录“./parse_data/convert_result”中,并打印log提示输出结果存放路径及转换结果。
+
+- 输入以下命令,展示npy数据统计信息。
+
+  ```bash
+  pt -n file_path
+  ```
+
+  | 参数名称 | 说明 | 是否必选 |
+  | -------- | ------------- | -------- |
+  | -n | npy文件路径。 | 是 |
+
+  打印统计信息:shape, dtype, max, min和mean。默认在npy文件路径下将该数据保存为txt文件。
+
+**示例1**
+
+```bash
+# 传入需转换的dump文件目录
+Parse >>> dc -n ./dump_data/
+......
+# 转换结果
+╭──────────────────────────────────────────────────────────────────────────────────────────────────────╮
+│ SrcFile: ./dump_data/
+│ - Add.fp32.vars.add.2fp32.vars.Relu.9.31.5.1636595794731103.input.0.npy │
+│ - Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.output.0.npy │
+│ - Add.fp32.vars.add.2fp32.vars.Relu.9.31.5.1636595794731103.input.1.npy │
+│ - Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.input.1.npy │
+│ - Add.fp32.vars.add.3fp32.vars.Relu.12.40.5.1636595794846124.input.1.npy │
+│ - Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.input.0.npy │
+│ - Add.fp32.vars.add.3fp32.vars.Relu.12.40.5.1636595794846124.input.0.npy │
+│ - Add.fp32.vars.add.2fp32.vars.Relu.9.31.5.1636595794731103.output.0.npy │
+│ - Add.fp32.vars.add.3fp32.vars.Relu.12.40.5.1636595794846124.output.0.npy │
+╰──────────────────────────────────────────────────────────────────────────────────────────────────────╯
+```
+
+**示例2**
+
+```bash
+# 查看某个dump数据块的数据信息
+# 默认会将数据中的tensor保存成 txt
+Parse >>> pt -n ./parse_data/dump_convert/Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.output.0.npy
+......
+# 打印统计信息
+[Shape: (1, 16, 56, 56, 16)] [Dtype: float16] [Max: 452.0] [Min: -408.5] [Mean: -3.809]
+Path: ./parse_data/dump_convert/Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.input.0.npy
+TextFile:./parse_data/dump_convert/Add.fp32.vars.add.1fp32.vars.Relu.6.24.5.1636595794631347.input.0.npy.txt
+```
+
+### pkl文件中指定API的dump数据信息查看
+
+输入以下命令,解析并输出pkl文件中指定api的统计信息。
+
+```bash
+pk -f pkl_path -n api_name
+```
+
+| 参数名称 | 说明 | 是否必选 |
+| -------- | ----------------- | -------- |
+| -f | 指定pkl文件路径。 | 是 |
+| -n | 指定API名称。 | 是 |
+
+- 输出结果:打印统计信息(shape, dtype, max, min和mean)。
+- 若pkl文件中存在相应的堆栈信息,则会打印堆栈信息。
+
+**示例**
+
+```bash
+# 传入pkl文件及api名称
+Parse >>> pk -f ./torch_dump/ptdbg_v3.2/rank0/api_stack_dump.pkl -n Functional.conv2d.0.forward
+......
+# 打印统计信息及堆栈(pkl文件不包含堆栈则不会打印堆栈) + +Statistic Info: + [Functional.conv2d.0.forward.input.0][dtype: torch.float32][shape: [2, 1, 2, 2]][max: 1.576936960220337][min: -0.9757485389709473][mean: 0.4961632490158081] + [Functional.conv2d.0.forward.input.1][dtype: torch.float32][shape: [2, 1, 2, 2]][max: 0.20064473152160645][min: -0.47102075815200806][mean: -0.20796933770179749] + [Functional.conv2d.0.forward.input.2][dtype: torch.float32][shape: [2]][max: 0.17380613088607788][min: -0.16853803396224976][mean: 0.0026340484619140625] + [Functional.conv2d.0.forward.output][dtype: torch.float32][shape: [2, 2, 1, 1]][max: 0.02364911139011383][min: -1.762906551361084][mean: -0.6710853576660156] +``` + +### API可选层级比对 + +输入以下命令, 进行统计级和像素级比对。 + +```bash +cn -m my_data*.npy -g gloden*.npy [-p num] [-al atol] [-rl rtol] +``` + +- 统计级比对:对tensor整体进行余弦值及相对误差的计算。 +- 像素级比对:对输入的两个npy文件进行逐元素比对。若两个tensor对应元素的相对误差或绝对误差大于**误差阈值**(-al和-rl配置)则被标记为错误数据。 + +| 参数名称 | 说明 | 是否必选 | +| -------- | ----------------------------------------------- | -------- | +| -m | 待比对数据。 | 是 | +| -g | 标杆数据。 | 是 | +| -p | 设置比对结束后打印错误元素的个数,默认值20。 | 否 | +| -al | 判定数据存在精度问题的绝对误差阈值,默认0.001。 | 否 | +| -rl | 判定数据存在精度问题的相对误差阈值,默认0.001。 | 否 | +| -s | 将npy文件保存成txt文件,用于查看,默认开启。 | 否 | + +输出结果: + +- 统计级比对结果。 +- 两个文件的统计信息(shape, dtype, max, min和mean)。 +- 错误数据打印表格。 + +**示例** + +```bash +# 对比两个tensor的数据 +Parse >>> cn -m Add.InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.323.1619494134703053.output.0.npy -g InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.0.1619492699305998.npy -p 10 -s -al 0.002 -rl 0.005 + Error Item Table Top Item Table +┏━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┏━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ +┃ Index ┃ Left ┃ Right ┃ Diff ┃ ┃ Index ┃ Left ┃ Right ┃ Diff ┃ +┡━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ ┡━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ +│ 155 │ 0.024600908 │ 0.022271132 │ 0.002329776 │ │ 0 │ -0.9206961 │ -0.9222216 │ 0.0015255213 │ +│ 247 │ 0.015752593 │ 0.017937578 │ 0.0021849852 │ │ 1 │ -0.6416973 │ -0.64051837 │ 0.0011789203 │ +│ 282 │ -0.0101207765 │ -0.007852031 │ 0.0022687456 │ │ 2 │ -0.35383835 │ -0.35433492 │ 0.0004965663 │ +│ 292 │ 0.019581757 │ 0.02240482 │ 0.0028230622 │ │ 3 │ -0.18851271 │ -0.18883198 │ 0.00031927228 │ +│ 640 │ -0.06593232 │ -0.06874806 │ 0.0028157383 │ │ 4 │ -0.43508735 │ -0.43534422 │ 0.00025686622 │ +│ 1420 │ 0.09293677 │ 0.09586689 │ 0.0029301196 │ │ 5 │ 1.4447614 │ 1.4466647 │ 0.0019032955 │ +│ 1462 │ -0.085207745 │ -0.088047795 │ 0.0028400496 │ │ 6 │ -0.3455438 │ -0.3444429 │ 0.0011008978 │ +│ 1891 │ -0.03433288 │ -0.036525503 │ 0.002192624 │ │ 7 │ -0.6560242 │ -0.6564579 │ 0.0004336834 │ +│ 2033 │ 0.06828873 │ 0.07139922 │ 0.0031104907 │ │ 8 │ -2.6964858 │ -2.6975214 │ 0.0010356903 │ +│ 2246 │ -0.06376442 │ -0.06121233 │ 0.002552092 │ │ 9 │ -0.73746175 │ -0.73650354 │ 0.00095820427 │ +└───────┴───────────────┴──────────────┴──────────────┘ └───────┴─────────────┴─────────────┴───────────────┘ +╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ Left: | +│ |- NpyFile: ./dump/temp/decode/Add.InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.323.1619494134703053.output.0.npy | +│ |- TxtFile: ./dump/temp/decode/Add.InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.323.1619494134703053.output.0.npy.txt | +│ |- NpySpec: [Shape: (32, 8, 8, 320)] [Dtype: float32] [Max: 5.846897] [Min: -8.368301] [Mean: -0.72565556] | +│ DstFile: │ +│ |- NpyFile: 
./dump/cpu/InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.0.1619492699305998.npy | +│ |- TxtFile: ./dump/cpu/InceptionV3.InceptionV3.Mixed.7a.Branch.0.add.3.0.1619492699305998.npy.txt | +│ |- NpySpec: [Shape: (32, 8, 8, 320)] [Dtype: float32] [Max: 5.8425903] [Min: -8.374472] [Mean: -0.7256237] │ +│ NumCnt: 655360 │ +│ AllClose: False │ +│ CosSim: 0.99999493 │ +│ ErrorPer: 0.023504638671875 (rl= 0.005, al= 0.002) │ +╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + +## FAQ + +[FAQ](https://gitee.com/ascend/att/blob/master/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py index 6f34129aba0ab15e2817ba4edf8352e9b9b73c2f..03a320e53d6bbb0249ff3cf97683f7ae7ea191b2 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py @@ -30,18 +30,12 @@ class Advisor: Class for generate advisor """ - def __init__(self, input_file, out_path=""): - self.input_file = os.path.realpath(input_file) + def __init__(self, input_data, out_path=""): + self.input_data = input_data self.out_path = os.path.realpath(out_path) - def _parse_input_file(self): - try: - df = pd.read_csv(self.input_file, on_bad_lines='skip') - except OSError as os_err: - print_error_log('Failed to parse the input file %s. %s' - % (self.input_file, str(os_err))) - raise CompareException(CompareException.PARSE_FILE_ERROR) from os_err - data_columns = df.columns.values + def _parse_input_data(self): + data_columns = self.input_data.columns.values if {CompareConst.ACCURACY, CompareConst.NPU_NAME}.issubset(data_columns): self.file_type = Const.ALL elif {CompareConst.RESULT, CompareConst.NPU_MD5}.issubset(data_columns): @@ -49,17 +43,12 @@ class Advisor: elif {CompareConst.MAX_DIFF, CompareConst.RESULT}.issubset(data_columns): self.file_type = Const.SUMMARY else: - print_error_log('Compare result file does not meet the required conditions.') - raise CompareException(CompareException.INVALID_FILE_ERROR) - df.reset_index(inplace=True) - # The value of index is consistent with the line number of csv, csv file first line is 2 - df.iloc[:, 0] += 2 + print_error_log('Compare result does not meet the required conditions.') + raise CompareException(CompareException.INVALID_DATA_ERROR) + df = self.input_data.reset_index() return df def _check_path_vaild(self): - input_file_checker = FileChecker(self.input_file, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.CSV_SUFFIX) - input_file_checker.common_check() out_path_checker = FileChecker(self.out_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) out_path_checker.common_check() @@ -116,8 +105,8 @@ class Advisor: def analysis(self): self._check_path_vaild() - analyze_data = self._parse_input_file() - print_info_log("Start analyzing the comparison result: %s" % self.input_file) + analyze_data = self._parse_input_data() + print_info_log("Start analyzing the comparison result of %s" % self.file_type) self.analyze_unmatched(analyze_data) if self.file_type == Const.ALL: failing_data = analyze_data[analyze_data[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_NO] diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py 
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py index ec7ddf8d01ff0a0bc463d5781bf1cfdb04482e7d..93ace4d2527aa52ab6718463b65a2950c3bf8e08 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/file_check_util.py @@ -112,7 +112,7 @@ class FileChecker: self.check_path_ability() if self.is_script: check_path_owner_consistent(self.file_path) - check_path_pattern_vaild(self.file_path) + check_path_pattern_valid(self.file_path) check_common_file_size(self.file_path) check_file_suffix(self.file_path, self.file_type) return self.file_path @@ -167,7 +167,7 @@ class FileOpen: self.file_path = os.path.realpath(self.file_path) check_path_length(self.file_path) self.check_ability_and_owner() - check_path_pattern_vaild(self.file_path) + check_path_pattern_valid(self.file_path) if os.path.exists(self.file_path): check_common_file_size(self.file_path) @@ -251,7 +251,7 @@ def check_path_owner_consistent(path): raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) -def check_path_pattern_vaild(path): +def check_path_pattern_valid(path): if not re.match(FileCheckConst.FILE_VALID_PATTERN, path): print_error_log('The file path %s contains special characters.' % path) raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py index a98c3c27fbdb79abe0ed24dbe6df81e41b45a5c8..c800f31dbca6a7f857854e0c0733fec5ca27888c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py @@ -31,7 +31,7 @@ from functools import wraps from pathlib import Path import numpy as np import torch - +import warnings from .file_check_util import FileOpen, FileChecker, FileCheckConst try: @@ -63,6 +63,8 @@ class Const: """ MODEL_TYPE = ['.onnx', '.pb', '.om'] DIM_PATTERN = r"^(-?[0-9]+)(,-?[0-9]+)*" + REGEX_PREFIX_MAX_LENGTH = 20 + REGEX_PREFIX_PATTERN = r"^[a-zA-Z0-9_-]+$" SEMICOLON = ";" COLON = ":" EQUAL = "=" @@ -114,13 +116,17 @@ class Const: ENV_ENABLE = "1" ENV_DISABLE = "0" - MAX_SEED_VALUE = 2**32 - 1 + MAX_SEED_VALUE = 2 ** 32 - 1 INPLACE_LIST = [ "broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter", "_reduce_scatter_base", "_all_gather_base", "send", "recv", "irecv", "isend", "all_to_all_single" - ] + ] + # version message tips + VERSION_MESSAGE = """The current version of ptdbg will be deprecated on September 30, 2024. + The att/debug/accuracy_tools/ptdbg_ascend directory will be deleted on September 30, 2024. 
+ Please use the ptdbg in the att/debug/accuracy_tools/atat directory.""" class CompareConst: """ @@ -195,6 +201,23 @@ class CompareConst: # compare const FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble] + # highlight xlsx color const + RED = "FFFF0000" + YELLOW = "FFFF00" + BLUE = "0000FF" + + # highlight rules const + OVERFLOW_LIST = ['nan\t', 'inf\t', '-inf\t', 'nan', 'inf', '-inf'] + MAX_DIFF_RED = 1e+10 + ORDER_MAGNITUDE_DIFF_YELLOW = 1 + ONE_THOUSAND_ERROR_IN_RED = 0.9 + ONE_THOUSAND_ERROR_OUT_RED = 0.6 + ONE_THOUSAND_ERROR_DIFF_YELLOW = 0.1 + COSINE_DIFF_YELLOW = 0.1 + MAX_RELATIVE_OUT_RED = 0.5 + MAX_RELATIVE_OUT_YELLOW = 0.1 + MAX_RELATIVE_IN_YELLOW = 0.01 + class VersionCheck: """ @@ -205,7 +228,6 @@ class VersionCheck: V2_0 = "2.0" V2_1 = "2.1" V2_2 = "2.2" - @staticmethod def check_torch_version(version): @@ -324,11 +346,18 @@ def check_mode_valid(mode, scope=None, api_list=None): raise ValueError("api_list param set invalid, it's must be a list.") mode_check = { Const.ALL: lambda: None, - Const.RANGE: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end].") if len(scope) != 2 else None, - Const.LIST: lambda: ValueError("set_dump_switch, scope param set invalid, it's should not be an empty list.") if len(scope) == 0 else None, - Const.STACK: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end] or [].") if len(scope) > 2 else None, - Const.ACL: lambda: ValueError("set_dump_switch, scope param set invalid, only one api name is supported in acl mode.") if len(scope) != 1 else None, - Const.API_LIST: lambda: ValueError("Current dump mode is 'api_list', but the content of api_list parameter is empty or valid.") if len(api_list) < 1 else None, + Const.RANGE: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end].") if len( + scope) != 2 else None, + Const.LIST: lambda: ValueError( + "set_dump_switch, scope param set invalid, it's should not be an empty list.") if len(scope) == 0 else None, + Const.STACK: lambda: ValueError( + "set_dump_switch, scope param set invalid, it's must be [start, end] or [].") if len(scope) > 2 else None, + Const.ACL: lambda: ValueError( + "set_dump_switch, scope param set invalid, only one api name is supported in acl mode.") if len( + scope) != 1 else None, + Const.API_LIST: lambda: ValueError( + "Current dump mode is 'api_list', but the content of api_list parameter is empty or valid.") if len( + api_list) < 1 else None, Const.API_STACK: lambda: None, } if mode not in Const.DUMP_MODE: @@ -351,7 +380,8 @@ def check_dump_mode_valid(dump_mode): print_warn_log("Please set dump_mode as a list.") dump_mode = [dump_mode] if not all(mode in ["all", "forward", "backward", "input", "output"] for mode in dump_mode): - raise ValueError("Please set dump_mode as a list containing one or more of the following: 'all', 'forward', 'backward', 'input', 'output'.") + raise ValueError( + "Please set dump_mode as a list containing one or more of the following: 'all', 'forward', 'backward', 'input', 'output'.") if 'input' not in dump_mode and 'output' not in dump_mode: dump_mode.extend(['input', 'output']) if 'forward' not in dump_mode and 'backward' not in dump_mode: @@ -385,7 +415,7 @@ def check_compare_param(input_parma, output_path, stack_mode=False, summary_comp check_file_or_directory_path(input_parma.get("bench_dump_data_dir"), True) check_file_or_directory_path(output_path, True) with FileOpen(input_parma.get("npu_pkl_path"), "r") as 
npu_pkl, \ - FileOpen(input_parma.get("bench_pkl_path"), "r") as bench_pkl: + FileOpen(input_parma.get("bench_pkl_path"), "r") as bench_pkl: check_pkl_file(input_parma, npu_pkl, bench_pkl, stack_mode) @@ -425,6 +455,27 @@ def check_configuration_param(stack_mode=False, auto_analyze=True, fuzzy_match=F raise CompareException(CompareException.INVALID_PARAM_ERROR) +def check_regex_prefix_format_valid(prefix): + """ + validate the format of the regex prefix + + Args: + prefix (str): The prefix string to validate. + + Returns: + no returns + + Raises: + ValueError: if the prefix length exceeds Const.REGEX_PREFIX_MAX_LENGTH characters or the prefix do not match + the given pattern Const.REGEX_PREFIX_PATTERN + """ + if len(prefix) > Const.REGEX_PREFIX_MAX_LENGTH: + raise ValueError(f"Maximum length of prefix is {Const.REGEX_PREFIX_MAX_LENGTH}, while current length " + f"is {len(prefix)}") + if not re.match(Const.REGEX_PREFIX_PATTERN, prefix): + raise ValueError(f"prefix contains invalid characters, prefix pattern {Const.REGEX_PREFIX_PATTERN}") + + def check_file_or_directory_path(path, isdir=False): """ Function Description: @@ -456,12 +507,14 @@ def is_starts_with(string, prefix_list): def check_stack_mode(pkl_fp): api_prefix = "" + api_match = "" api_pattern = r'\[\"([0-9a-zA-Z_.]+.(for|back)ward).(in|out)put(\.[0-9]+)?' is_stack_mode = False for index, line in enumerate(pkl_fp): - if index == 0: - api_match = re.search(api_pattern, line) - api_prefix = api_match.group(1) + if not api_match: + if re.search(api_pattern, line): + api_match = re.search(api_pattern, line) + api_prefix = api_match.group(1) elif api_prefix and line.startswith(f'["{api_prefix}'): if line.startswith(f'["{api_prefix}.stack_info'): is_stack_mode = True @@ -596,7 +649,7 @@ def save_numpy_data(file_path, data): save_numpy_data """ if not os.path.exists(os.path.dirname(file_path)): - os.makedirs(os.path.dirname(file_path)) + create_directory(os.path.dirname(file_path)) np.save(file_path, data) @@ -630,22 +683,28 @@ def add_time_as_suffix(name): return '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) +def add_time_with_xlsx(name): + return '{}_{}.xlsx'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + + def get_time(): return datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") def format_value(value): - return '{:.12f}'.format(value) + return float('{:.12f}'.format(value)) def torch_device_guard(func): if is_gpu or torch_without_guard_version: return func + # Parse args/kwargs matched torch.device objects @torch_npu_device_guard def wrapper(*args, **kwargs): return func(*args, **kwargs) + return wrapper @@ -689,16 +748,15 @@ def get_process_rank(model): return 0, False if local_device.type == 'cpu': print_warn_log("Warning: the debugger is unable to get the rank id. " - "This may cause the dumpped data to be corrupted in the " - "case of distributed training. (You may ignore this if you are using only one card.) " - "Transfer the model to npu or gpu before register_hook() to avoid this warning.") + "This may cause the dumpped data to be corrupted in the " + "case of distributed training. (You may ignore this if you are using only one card.) 
" + "Transfer the model to npu or gpu before register_hook() to avoid this warning.") return 0, False else: return local_device.index, True def parameter_adapter(func): - @wraps(func) def inner(self, *args, **kwargs): if self.op_name_ == "__getitem__" and len(args) > 1 and isinstance(args[1], torch.Tensor): @@ -724,6 +782,7 @@ def parameter_adapter(func): if self.op_name_ == "__eq__" and args[1] is None: return False return func(self, *args, **kwargs) + return inner @@ -735,7 +794,7 @@ def generate_compare_script(dump_path, pkl_file_path, dump_switch_mode): try: with FileOpen(template_path, 'r') as ftemp, \ - os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout: + os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout: code_temp = ftemp.read() fout.write(code_temp % (pkl_file_path, dump_path, is_api_stack)) except OSError: @@ -781,9 +840,13 @@ def get_md5_for_tensor(x): def check_path_before_create(path): + if os.path.islink(path): + print_error_log('The file path {} is a symbolic link, which is not allowed.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + if len(os.path.realpath(path)) > Const.DIRECTORY_LENGTH or len(os.path.basename(path)) > \ Const.FILE_NAME_LENGTH: - print_error_log('The file path length exceeds limit.') + print_error_log('The file path {} length exceeds limit.'.format(path)) raise CompareException(CompareException.INVALID_PATH_ERROR) if not re.match(Const.FILE_PATTERN, os.path.realpath(path)): @@ -797,3 +860,9 @@ def check_inplace_op(prefix): match_op = re.findall(r"Distributed_(.+?)_\d", prefix) op_name = match_op[0] if match_op else None return op_name in Const.INPLACE_LIST + + +class WarningManager: + def warn(self, message=None, enable_warnings=True): + if enable_warnings: + warnings.warn(message) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py index be1e0dae76e27bb63dab6016101cd84584cafd62..78ee5bf42575a9ff1b7662990a3e7f0ea772d6d8 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py @@ -16,17 +16,21 @@ """ import json +import math import multiprocessing import os.path import stat import sys - +import warnings import numpy as np import pandas as pd +import openpyxl +from openpyxl.styles import PatternFill +from collections import namedtuple from .match import graph_mapping from ..advisor.advisor import Advisor -from ..common.utils import check_compare_param, add_time_as_suffix, \ +from ..common.utils import check_compare_param, add_time_with_xlsx, \ print_info_log, print_warn_log, print_error_log, CompareException, Const, \ CompareConst, format_value, check_file_not_exists, check_configuration_param, \ is_summary_compare, is_md5_compare @@ -37,7 +41,7 @@ def correct_data(result): if result == CompareConst.NAN: return result if float(result) > 0.99999: - return '1.0' + return 1.0 return result @@ -50,7 +54,7 @@ def cosine_similarity(n_value, b_value): b_norm = np.linalg.norm(b_value) message = '' if a_norm <= Const.FLOAT_EPSILON and b_norm <= Const.FLOAT_EPSILON: - result = '1.0' + result = 1.0 elif a_norm <= Const.FLOAT_EPSILON: message = 'Cannot compare by Cosine Similarity, All the data is Zero in npu dump data.' 
result = CompareConst.NAN @@ -228,6 +232,7 @@ def rename_api(npu_name, process): def merge_tensor(tensor_list): + """Return a dictionary of one API with its name, input, output, summery and stack info""" op_dict = {} op_dict["op_name"] = [] op_dict["input_struct"] = [] @@ -245,7 +250,7 @@ op_dict["op_name"].append(tensor[0]) if len(tensor) != Const.SUMMARY_COLUMN_NUM: print_error_log(f"This summary data is not complete. {tensor}") - raise CompareException(CompareException.INVALID_DATA_ERROR) + raise CompareException(CompareException.INVALID_DATA_ERROR) if tensor[0].find("input") != -1: op_dict["input_struct"].append((tensor[3], tensor[4], tensor[2])) elif tensor[0].find("output") != -1: @@ -254,10 +259,11 @@ if tensor[1] <= Const.DUMP_RATIO_MAX: op_dict["summery"].append(tensor[5]) - return op_dict + return op_dict if op_dict["op_name"] else {} def read_op(ops_queue, pkl_file_handle, stack_mode): + """Read the input pkl file and get one API's info as a dictionary at a time""" tensor_list = [] read_err = False read_output_flag = {"last_line": False, "curr_line": False} @@ -275,13 +281,15 @@ tensor_data = json.loads(tensor_line) if not isinstance(tensor_data, list): print_error_log(f"This data is not a list, please check the dump data pkl file. {tensor_data}") - raise CompareException(CompareException.INVALID_DATA_ERROR) + raise CompareException(CompareException.INVALID_DATA_ERROR) read_output_flag["last_line"] = read_output_flag.get("curr_line") read_output_flag["curr_line"] = True if tensor_data[0].find(end_flag) != -1 else False if (read_output_flag.get("last_line") and not read_output_flag.get("curr_line")) \ or (len(tensor_line) == 0 and read_output_flag.get("curr_line")): # end of file scenario - ops_queue.append(merge_tensor(tensor_list)) + merge_list = merge_tensor(tensor_list) + if merge_list: + ops_queue.append(merge_list) # the pos of the handle needs to restore to the start of the next api. 
pkl_file_handle.seek(curr_pos, 0) break @@ -291,6 +299,7 @@ def match_op(npu_queue, bench_queue, fuzzy_match): + """Match NPU with bench API""" for b_index, b_op in enumerate(bench_queue[0: -1]): if check_op(npu_queue[-1], b_op, fuzzy_match): return len(npu_queue) - 1, b_index @@ -302,7 +311,8 @@ return -1, -1 -def get_accuracy(result, n_dict, b_dict, summary_compare=False, md5_compare=False): +def get_accuracy(result, n_dict, b_dict, highlight_dict, summary_compare=False, md5_compare=False): + """Compare and add one API's input and output results to the dataframe""" def get_accuracy_core(n_start, n_len, b_start, b_len, key): min_len = min(n_len, b_len) npu_stack_info = n_dict.get("stack_info", None) @@ -388,72 +398,283 @@ b_num_input = len([name for name in b_dict['op_name'] if 'input' in name]) n_num_output = n_num - n_num_input b_num_output = b_num - b_num_input + last_len = len(result) get_accuracy_core(0, n_num_input, 0, b_num_input, 'input_struct') get_accuracy_core(n_num_input, n_num_output, b_num_input, b_num_output, 'output_struct') + # check whether to highlight API rows if using summary compare + if summary_compare: + find_error_rows(result[last_len:], last_len, n_num_input, highlight_dict, summary_compare) -def _do_multi_process(input_parma, result_path): - try: - _handle_multi_process(compare_ops, input_parma, result_path, multiprocessing.Manager().RLock()) - except FileNotFoundError as error: - print("File not Found. compare failed!") +def check_order_magnitude(info, color_columns, summary_compare=True): + """Check if the order-of-magnitude difference of max_diff is at least 1""" + api_in, api_out, num = info + max_diff_index = get_header_index('Max diff' if summary_compare else 'MaxAbsErr', summary_compare) + if api_in[max_diff_index] > api_out[max_diff_index]: return - except IOError as error: - print("IOEError. 
compare failed!") + in_order = 0 if api_in[max_diff_index] == 0 else math.log10(abs(api_in[max_diff_index])) + out_order = 0 if api_out[max_diff_index] == 0 else math.log10(abs(api_out[max_diff_index])) + if abs(in_order - out_order) >= CompareConst.ORDER_MAGNITUDE_DIFF_YELLOW: + color_columns.yellow.append(num) + + +def check_one_thousand_error_ratio(info, color_columns, summary_compare=True): + """Compare output's one thousand error ratio with input's """ + api_in, api_out, num = info + one_thousand_index = get_header_index('One Thousandth Err Ratio', summary_compare) + if not isinstance(api_in[one_thousand_index], (float, int)) or not isinstance(api_out[one_thousand_index], (float, int)): return + if api_in[one_thousand_index] > CompareConst.ONE_THOUSAND_ERROR_IN_RED and api_out[one_thousand_index] < CompareConst.ONE_THOUSAND_ERROR_OUT_RED: + color_columns.red.append(num) + elif api_in[one_thousand_index] - api_out[one_thousand_index] > CompareConst.ONE_THOUSAND_ERROR_DIFF_YELLOW: + color_columns.yellow.append(num) + + +def check_cosine_similarity(info, color_columns, summary_compare=True): + """Check if output's cosine similarity more than 0.1 smaller than input's""" + api_in, api_out, num = info + cosine_index = get_header_index('Cosine', summary_compare) + if not isinstance(api_in[cosine_index], (float, int)) or not isinstance(api_out[cosine_index], (float, int)): + return + if api_in[cosine_index] - api_out[cosine_index] > CompareConst.COSINE_DIFF_YELLOW: + color_columns.yellow.append(num) + + +def check_max_relative_diff(info, color_columns, summary_compare=True): + """Compare the output value of max_diff / bench_max with input""" + api_in, api_out, num = info + max_diff_index = get_header_index('Max diff', summary_compare) + bench_max_index = get_header_index('Bench max', summary_compare) + input_max_relative_diff = np.abs(np.divide(api_in[max_diff_index], max(0.01, api_in[bench_max_index]))) + output_max_relative_diff = np.abs(np.divide(api_out[max_diff_index], max(0.01, api_out[bench_max_index]))) + if not isinstance(input_max_relative_diff, (float, int)) or not isinstance(output_max_relative_diff, (float, int)): + return + if output_max_relative_diff > CompareConst.MAX_RELATIVE_OUT_RED: + color_columns.red.append(num) + elif output_max_relative_diff > CompareConst.MAX_RELATIVE_OUT_YELLOW and input_max_relative_diff < CompareConst.MAX_RELATIVE_IN_YELLOW: + color_columns.yellow.append(num) + + +def check_overflow(info, color_columns, summary_compare=False): + """Check if Inf or Nan exists in NPU max/min, or large number in Max diff""" + line, num = info + npu_max_index = get_header_index('NPU max', summary_compare) + npu_min_index = get_header_index('NPU min', summary_compare) + max_diff_index = get_header_index('Max diff' if summary_compare else 'MaxAbsErr', summary_compare) + if str(line[npu_max_index]) in CompareConst.OVERFLOW_LIST or str(line[npu_min_index]) in CompareConst.OVERFLOW_LIST: + color_columns.red.append(num) + return + # check if Max_Diff > 1e+10 + if isinstance(line[max_diff_index], (float, int)) and line[max_diff_index] > CompareConst.MAX_DIFF_RED: + color_columns.red.append(num) + + +def get_header_index(header_name, summary_compare=False): + if summary_compare: + header = CompareConst.SUMMARY_COMPARE_RESULT_HEADER[:] + else: + header = CompareConst.COMPARE_RESULT_HEADER[:] + if header_name not in header: + print_error_log(f"{header_name} not in data name") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return header.index(header_name) + + 
+class HighlightRules: + """Highlight rules to check whether an API has errors""" + # rules applied to every line: pass in each line of an API to check whether an error exists + basic_rules = { + "check_overflow": check_overflow + } + + # rules comparing output with input: pass in input and output to check whether the output has errors relative to the input + compare_rules = { + "check_order_magnitude": check_order_magnitude, + "check_one_thousand_error": check_one_thousand_error_ratio, + "check_cosine_similarity": check_cosine_similarity + } + summary_compare_rules = { + "check_order_magnitude": check_order_magnitude, + "check_max_relative_diff": check_max_relative_diff, + } + + +def find_error_rows(result, last_len, n_num_input, highlight_dict, summary_compare=False): + """Find error APIs and fill the dict with red or yellow highlight information""" + npu_max_index = get_header_index('NPU max', summary_compare) + bench_max_index = get_header_index('Bench max', summary_compare) + max_diff_index = get_header_index('Max diff' if summary_compare else 'MaxAbsErr', summary_compare) + + red_lines, yellow_lines = [], []  # lines to highlight red or yellow + LineInfo = namedtuple('LineInfo', ['line_data', 'num_pointer']) + ApiInfo = namedtuple('ApiInfo', ['api_input', 'api_output', 'num_pointer']) + ColorColumns = namedtuple('ColorColumns', ['red', 'yellow']) + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + + for i, line in enumerate(result): + num = last_len + i + line_info = LineInfo(line_data=line, num_pointer=num) + for rule in HighlightRules.basic_rules.values(): + rule(line_info, color_columns, summary_compare) + + for n, api_out in enumerate(result[n_num_input:len(result)]): + num = last_len + n_num_input + n + if num in red_lines: + continue + if not isinstance(api_out[npu_max_index], (float, int)) \ + or not isinstance(api_out[bench_max_index], (float, int)) \ + or not isinstance(api_out[max_diff_index], (float, int)): + continue + for m, api_in in enumerate(result[0:n_num_input]): + if not isinstance(api_in[npu_max_index], (float, int)) \ + or not isinstance(api_in[bench_max_index], (float, int)) \ + or not isinstance(api_in[max_diff_index], (float, int)): + continue + + api_info = ApiInfo(api_input=api_in, api_output=api_out, num_pointer=num) + if summary_compare: + for rule in HighlightRules.summary_compare_rules.values(): + rule(api_info, color_columns, summary_compare) + else: + for rule in HighlightRules.compare_rules.values(): + rule(api_info, color_columns, summary_compare) + highlight_dict.get('red_rows', []).extend(list(set(red_lines))) + highlight_dict.get('yellow_rows', []).extend(list(set(yellow_lines) - set(red_lines))) -def read_dump_path(result_path): + +def get_name_and_state(name): + """Get api/module name and state""" + if "input" in name: + api_name = name.split("input")[0] + state = "input" + else: + api_name = name.split("output")[0] + state = "output" + return api_name, state + + +def find_compare_result_error_rows(result_df, highlight_dict): + """Group the API with its input and output, then find error APIs with find_error_rows""" + result = result_df.values + start, input_num, output_num, end = 0, 0, 0, len(result_df) + last_api_name, last_state = None, None + num, last_len = 0, 0 + for i in range(len(result)): + api_name, state = get_name_and_state(result[i][0]) + if last_api_name: + if api_name == last_api_name: + if state == last_state: + num += 1 + else: + input_num = num + num, last_state = 1, state + else: + output_num = num + find_error_rows(result[start:start + input_num + 
output_num], start, input_num, highlight_dict) + num, last_api_name, last_state = 1, api_name, state + start += input_num + output_num + input_num, output_num = 1, 0 + else: + num, last_api_name, last_state = 1, api_name, state + if state: + if state == "input": + input_num = num + else: + output_num = num + find_error_rows(result[start:start + input_num + output_num], start, input_num, highlight_dict) + + +def highlight_rows_xlsx(result_df, highlight_dict, file_path): + """Write and highlight results in Excel""" + print_info_log('Compare result is %s' % file_path) + + wb = openpyxl.Workbook() + ws = wb.active + + # write header + for j, col_name in enumerate(result_df.columns, start=1): + ws.cell(row=1, column=j, value=col_name) + + for i, row in enumerate(result_df.iterrows(), start=2): + for j, value in enumerate(row[1], start=1): + if not isinstance(value, (float, int)): + value = f'{str(value)}\t' if str(value) in ('inf', '-inf', 'nan') else str(value) + ws.cell(row=i, column=j, value=f'{str(value)}\t' if str(value) in ('inf', '-inf', 'nan') else value) + + if (i - 2) in highlight_dict['red_rows']: + ws.cell(row=i, column=j).fill = PatternFill(start_color=CompareConst.RED, + end_color=CompareConst.RED, fill_type="solid") + elif (i - 2) in highlight_dict['yellow_rows']: + ws.cell(row=i, column=j).fill = PatternFill(start_color=CompareConst.YELLOW, + end_color=CompareConst.YELLOW, fill_type="solid") + wb.save(file_path) + + +def _do_multi_process(input_parma, result_df): + """Use multiprocessing to handle real data""" + try: + result_df = _handle_multi_process(compare_ops, input_parma, result_df, multiprocessing.Manager().RLock()) + return result_df + except ValueError as e: + print_error_log('result dataframe is not found.') + raise CompareException(CompareException.INVALID_DATA_ERROR) from e + + +def read_dump_data(result_df): + """Return a dict to pair npu name with bench name""" try: - csv_pd = pd.read_csv(result_path) - npu_dump_name_list = csv_pd.iloc[0:, 0].tolist() - bench_dump_name_list = csv_pd.iloc[0:, 1].tolist() + npu_dump_name_list = result_df.iloc[0:, 0].tolist() + bench_dump_name_list = result_df.iloc[0:, 1].tolist() op_name_mapping_dict = {} for index, _ in enumerate(npu_dump_name_list): npu_dump_name = npu_dump_name_list[index] bench_dump_name = bench_dump_name_list[index] op_name_mapping_dict[npu_dump_name] = [npu_dump_name, bench_dump_name] return op_name_mapping_dict - except FileNotFoundError as e: - print_error_log('{} file is not found.'.format(result_path)) - raise CompareException(CompareException.OPEN_FILE_ERROR) from e - except IOError as e: - print_error_log('{} read csv failed.'.format(result_path)) - raise CompareException(CompareException.READ_FILE_ERROR) from e + except ValueError as e: + print_error_log('result dataframe is not found.') + raise CompareException(CompareException.INVALID_DATA_ERROR) from e + except IndexError as e: + print_error_log('result dataframe elements cannot be accessed.') + raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e -def _handle_multi_process(func, input_parma, result_path, lock): +def _handle_multi_process(func, input_parma, result_df, lock): + """Split real data into chunks and compare them with multiple processes""" process_num = int((multiprocessing.cpu_count() + 1) / 2) - op_name_mapping_dict = read_dump_path(result_path) - op_names = [] - for _ in range(process_num): - op_names.append([]) - all_op_names = list(op_name_mapping_dict.keys()) - for i, op_name in enumerate(all_op_names): - op_names[i % process_num].append(op_name) - 
all_tasks = [] + op_name_mapping_dict = read_dump_data(result_df) + + df_chunk_size = len(result_df) // process_num + if df_chunk_size > 0: + df_chunks = [result_df.iloc[i:i + df_chunk_size] for i in range(0, len(result_df), df_chunk_size)] + else: + df_chunks = [result_df] + + results = [] pool = multiprocessing.Pool(process_num) def err_call(args): print_error_log('multiprocess compare failed! Reason: {}'.format(args)) try: pool.terminate() - if os.path.exists(result_path): - os.remove(result_path) except OSError as e: print_error_log("pool terminate failed") - for process_idx, fusion_op_names in enumerate(op_names): - idx = [process_num, process_idx] - task = pool.apply_async(func, - args=(idx, fusion_op_names, op_name_mapping_dict, result_path, lock, input_parma), - error_callback=err_call) - all_tasks.append(task) + for process_idx, df_chunk in enumerate(df_chunks): + idx = df_chunk_size * process_idx + result = pool.apply_async(func, + args=(idx, op_name_mapping_dict, df_chunk, lock, input_parma), + error_callback=err_call) + results.append(result) + final_results = [r.get() for r in results] pool.close() pool.join() + return pd.concat(final_results, ignore_index=True) -def compare_ops(idx, fusion_op_names, dump_path_dict, result_path, lock, input_parma): +def compare_ops(idx, dump_path_dict, result_df, lock, input_parma): + """Return a dataframe with compare results of real data""" cos_result = [] max_err_result = [] max_relative_err_result = [] @@ -461,7 +682,8 @@ one_thousand_err_ratio_result = [] five_thousand_err_ratio_result = [] is_print_compare_log = input_parma.get("is_print_compare_log") - for i, op_name in enumerate(fusion_op_names): + for i in range(len(result_df)): + op_name = result_df.iloc[i, 0] if is_print_compare_log: print("start compare: {}".format(op_name)) cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio = compare_by_op(op_name, dump_path_dict, input_parma) @@ -473,32 +695,29 @@ err_mess.append(err_msg) one_thousand_err_ratio_result.append(one_thousand_err_ratio) five_thousand_err_ratio_result.append(five_thousand_err_ratio) - _save_cmp_result(idx, cos_result, max_err_result, max_relative_err_result, err_mess, one_thousand_err_ratio_result, - five_thousand_err_ratio_result, result_path, lock) - + result_df = _save_cmp_result(idx, cos_result, max_err_result, max_relative_err_result, err_mess, one_thousand_err_ratio_result, + five_thousand_err_ratio_result, result_df, lock) + return result_df -def _save_cmp_result(idx, cos_result, max_err_result, max_relative_err_result, err_msg, one_thousand_err_ratio_result, five_thousand_err_ratio_result, result_path, lock): +def _save_cmp_result(idx, cos_result, max_err_result, max_relative_err_result, err_msg, one_thousand_err_ratio_result, five_thousand_err_ratio_result, result_df, lock): lock.acquire() try: - csv_pd = pd.read_csv(result_path, dtype=str) - process_num = idx[0] - process_idx = idx[1] for i, _ in enumerate(cos_result): - process_index = i * process_num + process_idx - csv_pd.loc[process_index, CompareConst.COSINE] = cos_result[i] - csv_pd.loc[process_index, CompareConst.MAX_ABS_ERR] = max_err_result[i] - csv_pd.loc[process_index, CompareConst.MAX_RELATIVE_ERR] = max_relative_err_result[i] - csv_pd.loc[process_index, CompareConst.ERROR_MESSAGE] = err_msg[i] - csv_pd.loc[process_index, 
CompareConst.ACCURACY] = check_accuracy(cos_result[i], max_err_result[i]) - csv_pd.loc[process_index, CompareConst.ONE_THOUSANDTH_ERR_RATIO] = one_thousand_err_ratio_result[i] - csv_pd.loc[process_index, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] = five_thousand_err_ratio_result[i] - csv_pd.to_csv(result_path, index=False) - except FileNotFoundError as e: - print_error_log('{} file is not found.'.format(result_path)) - raise CompareException(CompareException.OPEN_FILE_ERROR) from e - except IOError as e: - print_error_log('{} read csv failed.'.format(result_path)) - raise CompareException(CompareException.READ_FILE_ERROR) from e + process_index = i + idx + result_df.loc[process_index, CompareConst.COSINE] = cos_result[i] + result_df.loc[process_index, CompareConst.MAX_ABS_ERR] = max_err_result[i] + result_df.loc[process_index, CompareConst.MAX_RELATIVE_ERR] = max_relative_err_result[i] + result_df.loc[process_index, CompareConst.ERROR_MESSAGE] = err_msg[i] + result_df.loc[process_index, CompareConst.ACCURACY] = check_accuracy(cos_result[i], max_err_result[i]) + result_df.loc[process_index, CompareConst.ONE_THOUSANDTH_ERR_RATIO] = one_thousand_err_ratio_result[i] + result_df.loc[process_index, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] = five_thousand_err_ratio_result[i] + return result_df + except ValueError as e: + print_error_log('result dataframe is not found.') + raise CompareException(CompareException.INVALID_DATA_ERROR) from e + except IndexError as e: + print_error_log('result dataframe elements cannot be accessed.') + raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e finally: lock.release() @@ -523,6 +742,7 @@ def check_accuracy(cos, max_abs_err): def compare_by_op(op_name, op_name_mapping_dict, input_parma): + """Compare single NPU data with bench data""" npu_bench_name_list = op_name_mapping_dict[op_name] if npu_bench_name_list[1] == CompareConst.NAN: return CompareConst.NAN, CompareConst.NAN, CompareConst.NAN, CompareConst.NO_BENCH, CompareConst.NAN, CompareConst.NAN @@ -601,6 +821,8 @@ def handle_inf_nan(n_value, b_value): def compare(input_parma, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False): try: + message = Const.VERSION_MESSAGE + warnings.warn(message) summary_compare = is_summary_compare(input_parma) md5_compare = is_md5_compare(input_parma) check_configuration_param(stack_mode, auto_analyze, fuzzy_match) @@ -615,24 +837,25 @@ def compare(input_parma, output_path, stack_mode=False, auto_analyze=True, def compare_core(input_parma, output_path, stack_mode=False, auto_analyze=True, suffix='', fuzzy_match=False, summary_compare=False, md5_compare=False): + """Compare NPU with bench data, then write the compare results and give advice""" print_info_log("Please check whether the input data belongs to you. 
If not, there may be security risks.") - file_name = add_time_as_suffix("compare_result" + suffix) + file_name = add_time_with_xlsx("compare_result" + suffix) file_path = os.path.join(os.path.realpath(output_path), file_name) check_file_not_exists(file_path) + highlight_dict = {'red_rows': [], 'yellow_rows': []} with FileOpen(input_parma.get("npu_pkl_path"), "r") as npu_pkl, \ - FileOpen(input_parma.get("bench_pkl_path"), "r") as bench_pkl, \ - os.fdopen(os.open(file_path, os.O_RDWR | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP), 'w+') \ - as fout: - compare_process([npu_pkl, bench_pkl, fout], stack_mode, fuzzy_match, summary_compare, md5_compare) - if summary_compare: - print_info_log(f"Summary compare result is {file_path}") + FileOpen(input_parma.get("bench_pkl_path"), "r") as bench_pkl: + result_df = compare_process([npu_pkl, bench_pkl], stack_mode, fuzzy_match, highlight_dict, summary_compare, + md5_compare) if not md5_compare and not summary_compare: - _do_multi_process(input_parma, file_path) + result_df = _do_multi_process(input_parma, result_df) + find_compare_result_error_rows(result_df, highlight_dict) + highlight_rows_xlsx(result_df, highlight_dict, file_path) change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) if auto_analyze: - advisor = Advisor(file_path, output_path) + advisor = Advisor(result_df, output_path) advisor.analysis() @@ -671,19 +894,22 @@ def parse(pkl_file, module_name_prefix): print(summery_info) -def compare_process(file_handles, stack_mode, fuzzy_match, summary_compare=False, md5_compare=False): - npu_pkl_handle, bench_pkl_handle, output_csv_handle = file_handles +def compare_process(file_handles, stack_mode, fuzzy_match, highlight_dict, summary_compare=False, md5_compare=False): + """Read input pkl files, compare summary/md5 data between NPU and bench and return dataframe""" + npu_pkl_handle, bench_pkl_handle = file_handles if fuzzy_match: print_warn_log("This task uses fuzzy matching, which may affect the accuracy of the comparison.") npu_ops_queue = [] bench_ops_queue = [] result = [] while True: + last_npu_ops_len = len(npu_ops_queue) npu_file_flag = read_op(npu_ops_queue, npu_pkl_handle, stack_mode) bench_file_flag = read_op(bench_ops_queue, bench_pkl_handle, stack_mode) - if (not npu_file_flag and not bench_file_flag) \ - or (len(npu_ops_queue) == 0 or len(bench_ops_queue) == 0): + if not npu_file_flag and not bench_file_flag: break + if len(npu_ops_queue) == 0 or len(bench_ops_queue) == 0 or len(npu_ops_queue) == last_npu_ops_len: + continue n_match_point, b_match_point = match_op(npu_ops_queue, bench_ops_queue, fuzzy_match) if n_match_point == -1 and b_match_point == -1: continue @@ -692,7 +918,7 @@ def compare_process(file_handles, stack_mode, fuzzy_match, summary_compare=False un_match_data = npu_ops_queue[0: n_match_point] for npu_data in un_match_data: get_un_match_accuracy(result, npu_data, md5_compare, summary_compare) - get_accuracy(result, n_match_data, b_match_data, summary_compare, md5_compare) + get_accuracy(result, n_match_data, b_match_data, highlight_dict, summary_compare, md5_compare) del npu_ops_queue[0: n_match_point + 1] del bench_ops_queue[0: b_match_point + 1] if npu_ops_queue: @@ -709,7 +935,7 @@ def compare_process(file_handles, stack_mode, fuzzy_match, summary_compare=False if stack_mode: header.append(CompareConst.STACK) result_df = pd.DataFrame(result, columns=header) - result_df.to_csv(output_csv_handle, index=False) + return result_df def get_un_match_accuracy(result, n_dict, md5_compare, summary_compare): 
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py index 85f9bb95ffa954f64390f9f418d37cf55c00d41b..143678711ec0487aa292e13a8f077df015374784 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py @@ -18,21 +18,37 @@ import os import sys import re from ..common.utils import print_error_log, CompareException, check_compare_param, check_file_or_directory_path, \ - check_configuration_param, is_summary_compare, is_md5_compare + check_configuration_param, is_summary_compare, is_md5_compare, Const, WarningManager, check_regex_prefix_format_valid from .acc_compare import compare_core def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): def check_and_return_dir_contents(dump_dir, prefix): + """ + check the given dump dir and validate the files in it against a pattern built from the given prefix: + ^{prefix}[0-9]+$ + + Args: + dump_dir (str): dump dir + prefix (str): prefix for the patterns, prefix should be less than 20 characters and alphanumeric/-/_ only + + Returns: + contents (list): dir contents + Raises: + CompareException: invalid path + ValueError: prefix does not match the pattern + + """ + check_regex_prefix_format_valid(prefix) check_file_or_directory_path(dump_dir, True) contents = os.listdir(dump_dir) pattern = re.compile(f'^{prefix}[0-9]+$') for name in contents: - match = pattern.match(name) - if match is None: - msg = (f"dump_dir contains '{name}'. Expected '{prefix}'. This name is not in the format of dump output. " - f"Please check and delete irrelevant files in {dump_dir} and try again.") - print_error_log(msg) + if not pattern.match(name): + print_error_log( + f"dump_dir contains '{name}'. Expected '{prefix}'. This name is not in the format of dump " + f"output. Please check and delete irrelevant files in {dump_dir} and try again." + ) raise CompareException(CompareException.INVALID_PATH_ERROR) return contents @@ -46,20 +62,25 @@ def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): elif full_path.endswith('.pkl'): pkl_path = full_path pkl_name = fname + + name_body, ext = os.path.splitext(pkl_name) # Provide robustness on invalid directory inputs if not pkl_path: print_error_log(f'No file is found in dump dir {dirname}. ') raise CompareException(CompareException.NO_DUMP_FILE_ERROR) - name_body, ext = os.path.splitext(pkl_name) + # Check if the name of the pkl file meets the criteria, or raise ValueError if not + check_regex_prefix_format_valid(name_body) + pattern = re.compile(f'{name_body}$') match = pattern.match(dump_data_dirname) if dump_data_dir and match is None: print_error_log('The names of pkl and directory do not match! ' - f'Please check the names and remove irrelevant files in {dirname}. ') + f'Please check the names and remove irrelevant files in {dirname}. 
') raise CompareException(CompareException.INVALID_FILE_ERROR) return pkl_path, dump_data_dir - + wm = WarningManager() + wm.warn(message=Const.VERSION_MESSAGE, enable_warnings=True) if kwargs.get('suffix'): print_error_log("Argument 'suffix' is not supported for compare_distributed.") raise CompareException(CompareException.INVALID_PARAM_ERROR) @@ -70,9 +91,8 @@ def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank')) bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank')) if len(npu_ranks) != len(bench_ranks): - print_error_log('The number of ranks in the two runs are different. ' - 'Unable to match the ranks. Please use another folder to compare ' - 'or use compare() api and manually match the ranks.') + print_error_log('The number of ranks in the two runs is different. Unable to match the ranks. Please use ' + 'another folder to compare or use compare() api and manually match the ranks.') raise CompareException(CompareException.INVALID_PATH_ERROR) for nr, br in zip(npu_ranks, bench_ranks): n_dir = os.path.join(npu_dump_dir, nr) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py index d9bbf6f4060e29fab0164c66ccbd121f22944c31..46e4803fa605ef9a1f6421bbe7b725a36c21d491 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py @@ -2,7 +2,7 @@ import os from concurrent.futures import ThreadPoolExecutor import torch from ..common.utils import Const, check_switch_valid, generate_compare_script, check_is_npu, print_error_log, \ - CompareException, print_warn_log + CompareException, print_warn_log, WarningManager from ..dump.dump import DumpUtil, acc_cmp_dump, write_to_disk, get_pkl_file_path, reset_module_count from ..dump.utils import set_dump_path, set_dump_switch_print_info, generate_dump_path_str, \ set_dump_switch_config, set_backward_input @@ -97,6 +97,8 @@ class PrecisionDebugger: @classmethod def start(cls): + wm = WarningManager() + wm.warn(message=Const.VERSION_MESSAGE, enable_warnings=True) instance = cls._instance if not instance: raise Exception("No instance of PrecisionDebugger found.") diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py index b24eb4e876b634ce0ce81a92313cfb996e2f15db..8d24917a2c6b7b086180217af23565df3a4699a7 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py @@ -37,7 +37,7 @@ from .utils import (DumpUtil, check_if_in_api_list, make_dump_data_dir, get_tens from ..common.utils import (print_warn_log, Const, print_info_log, modify_dump_path, check_inplace_op, CompareConst, get_md5_for_tensor, print_error_log) from ..dump.utils import check_writable -from ..common.file_check_util import FileOpen, change_mode, FileCheckConst, check_path_pattern_vaild, check_path_length +from ..common.file_check_util import FileOpen, change_mode, FileCheckConst, check_path_pattern_valid, check_path_length forward_init_status = False backward_init_status = False diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py 
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py index bea18501aefaba4916cba7ad3b7e455d0c29033c..4ed099da2dc8890f6e7f894756fdc2a3185aa53c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -10,7 +10,7 @@ from ..dump import dump from ..common.utils import print_error_log, CompareException, DumpException, Const, get_time, print_info_log, \ check_mode_valid, check_switch_valid, check_dump_mode_valid, check_summary_only_valid, generate_compare_script, \ check_is_npu, check_file_valid, make_dump_path_if_not_exists, check_path_before_create, print_warn_log, check_summary_mode_valid -from ..common.file_check_util import FileChecker, FileCheckConst, check_path_length, check_path_pattern_vaild +from ..common.file_check_util import FileChecker, FileCheckConst, check_path_length, check_path_pattern_valid from ..common.version import __version__ @@ -203,7 +203,7 @@ def create_dirs_if_not_exist(rank, dump_file): rank_dir = os.path.join(dump_path, f"rank{rank}") dump_file = os.path.join(rank_dir, file_name) if not os.path.isdir(rank_dir): - check_path_pattern_vaild(dump_file) + check_path_pattern_valid(dump_file) check_path_length(dump_file, name_length=200) Path(rank_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) return dump_file @@ -313,7 +313,7 @@ def make_dump_dirs(): dump_root_dir = load_env_dump_path(DumpUtil.dump_path) tag_dir = os.path.join(dump_root_dir, DumpUtil.dump_dir_tag + f'_v{__version__}') check_path_length(tag_dir) - check_path_pattern_vaild(tag_dir) + check_path_pattern_valid(tag_dir) Path(tag_dir).mkdir(mode=0o750, parents=True, exist_ok=True) DumpUtil.dump_dir = tag_dir dump_file_path = os.path.join(tag_dir, dump_file_name) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py index 17fadc8ea94e8da02795a20a4fb5da1a16193ab2..ab3d30ef5107d1351ed822a0a43ff5150a613e5d 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py @@ -17,7 +17,6 @@ import functools import os - from inspect import isfunction import torch import torch.distributed as dist @@ -27,7 +26,7 @@ from .hook_module import HOOKModule from .api_registry import api_register from .wrap_functional import remove_dropout from ..common.utils import check_file_or_directory_path, print_error_log, CompareException, Const, \ - print_info_log, print_warn_log, get_process_rank, torch_without_guard_version + print_info_log, print_warn_log, get_process_rank, torch_without_guard_version, WarningManager from ..dump.utils import make_dump_dirs, DumpUtil from ..overflow_check.utils import OverFlowUtil, clear_overflow_npu @@ -62,6 +61,8 @@ def add_clear_overflow(func, pid): def register_hook(model, hook, **kwargs): + wm = WarningManager() + wm.warn(message=Const.VERSION_MESSAGE, enable_warnings=True) check_register_hook(hook, **kwargs) print_info_log("Please disable dataloader shuffle before running the program.") overflow_nums = kwargs.get('overflow_nums', 1) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/run_ut.py b/debug/accuracy_tools/ptdbg_ascend/test/run_ut.py index 5972e73d2762a2ac981fdcf9f870aa761727c1c4..31d350df097e794512dd6bfd1faf0387d3fa8ba8 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/run_ut.py 
+++ b/debug/accuracy_tools/ptdbg_ascend/test/run_ut.py @@ -3,6 +3,7 @@ import shutil import subprocess import sys + def run_ut(): cur_dir = os.path.realpath(os.path.dirname(__file__)) top_dir = os.path.realpath(os.path.dirname(cur_dir)) @@ -13,11 +14,11 @@ def run_ut(): if os.path.exists(report_dir): shutil.rmtree(report_dir) - os.makedirs(report_dir) + os.makedirs(report_dir, mode=0o700) cmd = ["python3", "-m", "pytest", ut_path, "--junitxml=" + report_dir + "/final.xml", "--cov=" + src_dir, "--cov-branch", "--cov-report=xml:" + report_dir + "/coverage.xml"] - + result_ut = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) while result_ut.poll() is None: diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/compare/test_acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/compare/test_acc_compare.py index d51dbfb93d04378866f887f9873e7f99279cb495..1d6bc5b01bbf94e13a6159378fdc5357f6525e19 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/compare/test_acc_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/compare/test_acc_compare.py @@ -4,93 +4,175 @@ import numpy as np import os from ptdbg_ascend.compare import acc_compare as compare from ptdbg_ascend.common.utils import CompareConst +from collections import namedtuple +import pandas as pd +npu_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', + 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'], \ + 'input_struct': [('torch.float32', [1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), + ('torch.float32', [16])], \ + 'output_struct': [('torch.float32', [1, 16, 28, 28])], + 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], \ + [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], + [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], + [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} +bench_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', + 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'], \ + 'input_struct': [('torch.float32', [1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), + ('torch.float32', [16])], \ + 'output_struct': [('torch.float32', [1, 16, 28, 28])], + 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], \ + [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], + [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], + [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} +tensor_list = [['Functional_conv2d_0_forward_input.0', 1, [], 'torch.float32', [1, 1, 28, 28], + [3.029174327850342, -2.926689624786377, -0.06619918346405029]], \ + ['Functional_conv2d_0_forward_input.1', 1, [], 'torch.float32', [16, 1, 5, 5], + [0.19919930398464203, -0.19974489510059357, 0.006269412115216255]], \ + ['Functional_conv2d_0_forward_input.2', 1, [], 'torch.float32', [16], + [0.19734230637550354, -0.18177609145641327, 0.007903944700956345]], \ + ['Functional_conv2d_0_forward_output', 1, [], 'torch.float32', [1, 16, 28, 28], + [2.1166646480560303, -2.190781354904175, -0.003579073818400502]]] +result_op_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', + 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'], \ + 'input_struct': [('torch.float32', [1, 1, 28, 28], []), ('torch.float32', [16, 1, 5, 5], []), + 
('torch.float32', [16], [])], \ + 'output_struct': [('torch.float32', [1, 16, 28, 28], [])], + 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], + [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], \ + [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], + [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} -npu_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'],\ - 'input_struct': [('torch.float32', [1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), ('torch.float32', [16])],\ - 'output_struct': [('torch.float32', [1, 16, 28, 28])], 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], \ - [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} -bench_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'],\ - 'input_struct': [('torch.float32', [1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), ('torch.float32', [16])],\ - 'output_struct': [('torch.float32', [1, 16, 28, 28])], 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], \ - [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} -tensor_list = [['Functional_conv2d_0_forward_input.0', 1, [], 'torch.float32', [1, 1, 28, 28], [3.029174327850342, -2.926689624786377, -0.06619918346405029]],\ - ['Functional_conv2d_0_forward_input.1', 1, [], 'torch.float32', [16, 1, 5, 5], [0.19919930398464203, -0.19974489510059357, 0.006269412115216255]], \ - ['Functional_conv2d_0_forward_input.2', 1, [], 'torch.float32', [16], [0.19734230637550354, -0.18177609145641327, 0.007903944700956345]],\ - ['Functional_conv2d_0_forward_output', 1, [], 'torch.float32', [1, 16, 28, 28], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]]] -result_op_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'], \ -'input_struct': [('torch.float32', [1, 1, 28, 28], []), ('torch.float32', [16, 1, 5, 5], []), ('torch.float32', [16], [])], \ -'output_struct': [('torch.float32', [1, 16, 28, 28], [])], 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], \ -[0.19734230637550354, -0.18177609145641327, 0.007903944700956345], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} - -o_result = [['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.0', 'torch.float32', 'torch.float32', [1, 1, 28, 28], [1, 1, 28, 28], ' ', ' ', ' ', ' ', ' ', 3.029174327850342, -2.926689624786377, -0.06619918346405029, 3.029174327850342, -2.926689624786377, -0.06619918346405029, 'Yes', ''], ['Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.1', 'torch.float32', 'torch.float32', [16, 1, 5, 5], [16, 1, 5, 5], ' ', ' ', ' ', ' ', ' ', 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 0.19919930398464203, 
-0.19974489510059357, 0.006269412115216255, 'Yes', ''], ['Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_input.2', 'torch.float32', 'torch.float32', [16], [16], ' ', ' ', ' ', ' ', ' ', 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 'Yes', ''], ['Functional_conv2d_0_forward_output', 'Functional_conv2d_0_forward_output', 'torch.float32', 'torch.float32', [1, 16, 28, 28], [1, 16, 28, 28], ' ', ' ', ' ', ' ', ' ', 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 'Yes', '']] +o_result = [ + ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.0', 'torch.float32', 'torch.float32', + [1, 1, 28, 28], [1, 1, 28, 28], ' ', ' ', ' ', ' ', ' ', 3.029174327850342, -2.926689624786377, + -0.06619918346405029, 3.029174327850342, -2.926689624786377, -0.06619918346405029, 'Yes', ''], + ['Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.1', 'torch.float32', 'torch.float32', + [16, 1, 5, 5], [16, 1, 5, 5], ' ', ' ', ' ', ' ', ' ', 0.19919930398464203, -0.19974489510059357, + 0.006269412115216255, 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 'Yes', ''], + ['Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_input.2', 'torch.float32', 'torch.float32', + [16], [16], ' ', ' ', ' ', ' ', ' ', 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, + 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 'Yes', ''], + ['Functional_conv2d_0_forward_output', 'Functional_conv2d_0_forward_output', 'torch.float32', 'torch.float32', + [1, 16, 28, 28], [1, 16, 28, 28], ' ', ' ', ' ', ' ', ' ', 2.1166646480560303, -2.190781354904175, + -0.003579073818400502, 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 'Yes', '']] npu_dict_aten = {'op_name': ['Aten__native_batch_norm_legit_functional.default_0_forward_input.0', - 'Aten__native_batch_norm_legit_functional.default_0_forward_input.1', - 'Aten__native_batch_norm_legit_functional.default_0_forward_input.2', - 'Aten__native_batch_norm_legit_functional.default_0_forward_input.3', - 'Aten__native_batch_norm_legit_functional.default_0_forward_input.4', - 'Aten__native_batch_norm_legit_functional.default_0_forward_output.0', - 'Aten__native_batch_norm_legit_functional.default_0_forward_output.1', - 'Aten__native_batch_norm_legit_functional.default_0_forward_output.2', - 'Aten__native_batch_norm_legit_functional.default_0_forward_output.3', - 'Aten__native_batch_norm_legit_functional.default_0_forward_output.4'], - 'input_struct': [('torch.float16', [256, 256, 14, 14]), ('torch.float32', [256]), ('torch.float32', [256]), ('torch.float32', [256]), ('torch.float32', [256])], - 'output_struct': [('torch.float16', [256, 256, 14, 14]), ('torch.float32', [256]), ('torch.float32', [256]), ('torch.float32', [256]), ('torch.float32', [256])], - 'summery': [[139.625, -127.5625, -0.0103607177734375], - [2.5276029109954834, -2.1788690090179443, -0.0008259844034910202], - [2.472219944000244, -2.845968723297119, -0.008756577968597412], - [2.763145923614502, -3.398397922515869, -0.052132632583379745], - [2.673110008239746, -3.149275064468384, 0.01613386906683445], - [13.5546875, -10.640625, -0.008758544921875], - [0.30550330877304077, -0.24485322833061218, -0.010361209511756897], - [623.9192504882812, 432.96826171875, 520.2276611328125], - [2.4797861576080322, -3.055997371673584, -0.04795549064874649], - 
[61.7945556640625, 42.59713363647461, 52.03831481933594]]} - -bench_dict_functional = {'op_name': ['Functional_batch_norm_0_forward_input.0', 'Functional_batch_norm_0_forward_input.1', - 'Functional_batch_norm_0_forward_input.2', 'Functional_batch_norm_0_forward_input.3', - 'Functional_batch_norm_0_forward_input.4', 'Functional_batch_norm_0_forward_output'], - 'input_struct': [('torch.float32', [256, 256, 14, 14]), ('torch.float32', [256]), ('torch.float32', [256]), - ('torch.float32', [256]), ('torch.float32', [256])], + 'Aten__native_batch_norm_legit_functional.default_0_forward_input.1', + 'Aten__native_batch_norm_legit_functional.default_0_forward_input.2', + 'Aten__native_batch_norm_legit_functional.default_0_forward_input.3', + 'Aten__native_batch_norm_legit_functional.default_0_forward_input.4', + 'Aten__native_batch_norm_legit_functional.default_0_forward_output.0', + 'Aten__native_batch_norm_legit_functional.default_0_forward_output.1', + 'Aten__native_batch_norm_legit_functional.default_0_forward_output.2', + 'Aten__native_batch_norm_legit_functional.default_0_forward_output.3', + 'Aten__native_batch_norm_legit_functional.default_0_forward_output.4'], + 'input_struct': [('torch.float16', [256, 256, 14, 14]), ('torch.float32', [256]), + ('torch.float32', [256]), ('torch.float32', [256]), ('torch.float32', [256])], + 'output_struct': [('torch.float16', [256, 256, 14, 14]), ('torch.float32', [256]), + ('torch.float32', [256]), ('torch.float32', [256]), ('torch.float32', [256])], + 'summery': [[139.625, -127.5625, -0.0103607177734375], + [2.5276029109954834, -2.1788690090179443, -0.0008259844034910202], + [2.472219944000244, -2.845968723297119, -0.008756577968597412], + [2.763145923614502, -3.398397922515869, -0.052132632583379745], + [2.673110008239746, -3.149275064468384, 0.01613386906683445], + [13.5546875, -10.640625, -0.008758544921875], + [0.30550330877304077, -0.24485322833061218, -0.010361209511756897], + [623.9192504882812, 432.96826171875, 520.2276611328125], + [2.4797861576080322, -3.055997371673584, -0.04795549064874649], + [61.7945556640625, 42.59713363647461, 52.03831481933594]]} + +bench_dict_functional = { + 'op_name': ['Functional_batch_norm_0_forward_input.0', 'Functional_batch_norm_0_forward_input.1', + 'Functional_batch_norm_0_forward_input.2', 'Functional_batch_norm_0_forward_input.3', + 'Functional_batch_norm_0_forward_input.4', 'Functional_batch_norm_0_forward_output'], + 'input_struct': [('torch.float32', [256, 256, 14, 14]), ('torch.float32', [256]), ('torch.float32', [256]), + ('torch.float32', [256]), ('torch.float32', [256])], 'output_struct': [('torch.float32', [256, 256, 14, 14])], 'summery': [[3.061628818511963, -3.22507381439209, 3.634914173744619e-05], - [0.0005779837374575436, -0.0006301702815108001, 3.634906533989124e-06], - [0.9338104128837585, 0.9277191162109375, 0.930335283279419], - [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], - [5.397906303405762, -5.796811580657959, 2.5283952709287405e-10]] -} - -aten_result = [['Aten__native_batch_norm_legit_functional.default_0_forward_input.0', 'Functional_batch_norm_0_forward_input.0', 'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], ' ', ' ', ' ', ' ', ' ', 139.625, -127.5625, -0.0103607177734375, 3.061628818511963, -3.22507381439209, 3.634914173744619e-05, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_input.1', 'Functional_batch_norm_0_forward_input.1', 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.5276029109954834, 
-2.1788690090179443, -0.0008259844034910202, 0.0005779837374575436, -0.0006301702815108001, 3.634906533989124e-06, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_input.2', 'Functional_batch_norm_0_forward_input.2', 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.472219944000244, -2.845968723297119, -0.008756577968597412, 0.9338104128837585, 0.9277191162109375, 0.930335283279419, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_input.3', 'Functional_batch_norm_0_forward_input.3', 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.763145923614502, -3.398397922515869, -0.052132632583379745, 1.0, 1.0, 1.0, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_input.4', 'Functional_batch_norm_0_forward_input.4', 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.673110008239746, -3.149275064468384, 0.01613386906683445, 0.0, 0.0, 0.0, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_output.0', 'Functional_batch_norm_0_forward_output', 'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], ' ', ' ', ' ', ' ', ' ', 13.5546875, -10.640625, -0.008758544921875, 5.397906303405762, -5.796811580657959, 2.5283952709287405e-10, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_output.1', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 0.30550330877304077, -0.24485322833061218, -0.010361209511756897, 'Nan', 'Nan', 'Nan', 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_output.2', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 623.9192504882812, 432.96826171875, 520.2276611328125, 'Nan', 'Nan', 'Nan', 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_output.3', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 2.4797861576080322, -3.055997371673584, -0.04795549064874649, 'Nan', 'Nan', 'Nan', 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_output.4', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 61.7945556640625, 42.59713363647461, 52.03831481933594, 'Nan', 'Nan', 'Nan', 'Yes', ''] + [0.0005779837374575436, -0.0006301702815108001, 3.634906533989124e-06], + [0.9338104128837585, 0.9277191162109375, 0.930335283279419], + [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], + [5.397906303405762, -5.796811580657959, 2.5283952709287405e-10]] + } + +aten_result = [ + ['Aten__native_batch_norm_legit_functional.default_0_forward_input.0', 'Functional_batch_norm_0_forward_input.0', + 'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], ' ', ' ', ' ', ' ', ' ', 139.625, + -127.5625, -0.0103607177734375, 3.061628818511963, -3.22507381439209, 3.634914173744619e-05, 'Yes', ''], + ['Aten__native_batch_norm_legit_functional.default_0_forward_input.1', 'Functional_batch_norm_0_forward_input.1', + 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.5276029109954834, -2.1788690090179443, + -0.0008259844034910202, 0.0005779837374575436, -0.0006301702815108001, 3.634906533989124e-06, 'Yes', ''], + ['Aten__native_batch_norm_legit_functional.default_0_forward_input.2', 'Functional_batch_norm_0_forward_input.2', + 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.472219944000244, -2.845968723297119, + -0.008756577968597412, 0.9338104128837585, 0.9277191162109375, 0.930335283279419, 'Yes', ''], + 
['Aten__native_batch_norm_legit_functional.default_0_forward_input.3', 'Functional_batch_norm_0_forward_input.3', + 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.763145923614502, -3.398397922515869, + -0.052132632583379745, 1.0, 1.0, 1.0, 'Yes', ''], + ['Aten__native_batch_norm_legit_functional.default_0_forward_input.4', 'Functional_batch_norm_0_forward_input.4', + 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.673110008239746, -3.149275064468384, + 0.01613386906683445, 0.0, 0.0, 0.0, 'Yes', ''], + ['Aten__native_batch_norm_legit_functional.default_0_forward_output.0', 'Functional_batch_norm_0_forward_output', + 'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], ' ', ' ', ' ', ' ', ' ', 13.5546875, + -10.640625, -0.008758544921875, 5.397906303405762, -5.796811580657959, 2.5283952709287405e-10, 'Yes', ''], + ['Aten__native_batch_norm_legit_functional.default_0_forward_output.1', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', + ' ', ' ', ' ', ' ', ' ', 0.30550330877304077, -0.24485322833061218, -0.010361209511756897, 'Nan', 'Nan', 'Nan', + 'Yes', ''], + ['Aten__native_batch_norm_legit_functional.default_0_forward_output.2', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', + ' ', ' ', ' ', ' ', ' ', 623.9192504882812, 432.96826171875, 520.2276611328125, 'Nan', 'Nan', 'Nan', 'Yes', ''], + ['Aten__native_batch_norm_legit_functional.default_0_forward_output.3', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', + ' ', ' ', ' ', ' ', ' ', 2.4797861576080322, -3.055997371673584, -0.04795549064874649, 'Nan', 'Nan', 'Nan', 'Yes', + ''], + ['Aten__native_batch_norm_legit_functional.default_0_forward_output.4', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', + ' ', ' ', ' ', ' ', ' ', 61.7945556640625, 42.59713363647461, 52.03831481933594, 'Nan', 'Nan', 'Nan', 'Yes', ''] ] +highlight_dict = {'red_rows': [], 'yellow_rows': []} +LineInfo = namedtuple('LineInfo', ['line_data', 'num_pointer']) +ApiInfo = namedtuple('ApiInfo', ['api_input', 'api_output', 'num_pointer']) +ColorColumns = namedtuple('ColorColumns', ['red', 'yellow']) + +num_0, num_1, num_2, num_3 = 0, 1, 2, 3 +summary_line_input = ['Functional_batch_norm_0_forward_input.0', 'Functional_batch_norm_0_forward_input.0', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.01, 0, 0, 0, 1, 1, 1, 1, 1.01, 1, 1, 1, 'Yes', ''] +summary_line_1 = ['Functional_batch_norm_0_forward_output.0', 'Functional_batch_norm_0_forward_output.0', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 1, 0, 0, 0, 2, 0, 1, 1, 1, 1, 1, 1, 'Warning', ''] +summary_line_2 = ['Functional_batch_norm_0_forward_output.1', 'Functional_batch_norm_0_forward_output.1', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.02, 0, 0, 0, 0.12, 0, 1, 1, 0.1, 1, 1, 1, 'Warning', ''] +summary_line_3 = ['Functional_batch_norm_0_forward_output.2', 'Functional_batch_norm_0_forward_output.2', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0, 0, 0, 0, 2, 0, 1, 1, 1, 1, 1, 1, 'Warning', ''] +api_summary_info_1 = ApiInfo(api_input=summary_line_input, api_output=summary_line_1, num_pointer=num_1) +api_summary_info_2 = ApiInfo(api_input=summary_line_input, api_output=summary_line_2, num_pointer=num_2) +api_summary_info_3 = ApiInfo(api_input=summary_line_input, api_output=summary_line_3, num_pointer=num_3) +line_input = ['Functional_batch_norm_0_forward_input.0', 'Functional_batch_norm_0_forward_input.0', 'torch.float16', + 
'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 1, 1, 1, 0.95, 1, 1, 1, 1, 1, 1.01, 1, 1, 1, 'Yes', ''] +line_1 = ['Functional_batch_norm_0_forward_output.0', 'Functional_batch_norm_0_forward_output.0', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.8, 1, 1, 0.59, 1, 'nan', 0, 1, 1, 19, 1, 1, 1, 'Warning', ''] +line_2 = ['Functional_batch_norm_0_forward_output.1', 'Functional_batch_norm_0_forward_output.1', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.9, 1, 1, 0.8, 1, 0, 0.12, 0, 1, 1, 0.1, 1, 1, 1, 'Warning', ''] +line_3 = ['Functional_batch_norm_0_forward_output.2', 'Functional_batch_norm_0_forward_output.2', 'torch.float16', + 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 0.8, 1.1e+10, 1, 0.85, 1, 9, 0.12, 0, 1, 1, 0.1, 1, 1, 1, 'Warning', ''] +api_info_1 = ApiInfo(api_input=line_input, api_output=line_1, num_pointer=num_1) +api_info_2 = ApiInfo(api_input=line_input, api_output=line_2, num_pointer=num_2) +api_info_3 = ApiInfo(api_input=line_input, api_output=line_3, num_pointer=num_3) class TestUtilsMethods(unittest.TestCase): def test_correct_data(self): input_1 = 'NAN' result_1 = compare.correct_data(input_1) self.assertEqual(result_1, 'NAN') - input_2 = '0.99999' + input_2 = 0.99999 result_2 = compare.correct_data(input_2) - self.assertEqual(result_2, '0.99999') - input_3 = '0.999991' + self.assertEqual(result_2, 0.99999) + input_3 = 0.999991 result_3 = compare.correct_data(input_3) - self.assertEqual(result_3, '1.0') + self.assertEqual(result_3, 1.0) def test_cosine_similarity_when_all_result_less_than_epsilon(self): n_value = np.array([0, 0, 0]) b_value = np.array([0, 0, 0]) result, message = compare.cosine_similarity(n_value, b_value) - self.assertEqual(result, '1.0') + self.assertEqual(result, 1.0) self.assertEqual(message, '') def test_cosine_similarity_when_only_npu_result_less_than_epsilon(self): @@ -111,8 +193,7 @@ class TestUtilsMethods(unittest.TestCase): n_value = np.array([1, 2, 3]) b_value = np.array([1, 2, 3]) result, message = compare.cosine_similarity(n_value, b_value) - - self.assertEqual(result, '1.0') + self.assertEqual(result, 1.0) self.assertEqual(message, '') def test_cosine_similarity_when_all_result_greater_than_epsilon_with_nan(self): @@ -147,7 +228,7 @@ class TestUtilsMethods(unittest.TestCase): n_value = np.array([1, 2, 3]) b_value = np.array([1, 2, 3]) max_relative_err, message = compare.get_max_relative_err(n_value, b_value) - self.assertEqual(max_relative_err, "0.000000000000") + self.assertEqual(max_relative_err, 0.000000000000) self.assertEqual(message, "") def test_check_graph_mode(self): @@ -169,16 +250,15 @@ class TestUtilsMethods(unittest.TestCase): def test_read_op(self): base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - + pkl_dir = os.path.join(base_dir, "resources/compare/npu_test.pkl") - + npu_ops_queue = [] npu_pkl_handle = open(pkl_dir, "r") stack_mode = False result = compare.read_op(npu_ops_queue, npu_pkl_handle, stack_mode) self.assertEqual(result, True) - def test_match_op(self): fuzzy_match = False a, b = compare.match_op([npu_dict], [bench_dict], fuzzy_match) @@ -187,11 +267,92 @@ class TestUtilsMethods(unittest.TestCase): def test_get_accuracy(self): result = [] - compare.get_accuracy(result, npu_dict, bench_dict) - + compare.get_accuracy(result, npu_dict, bench_dict, highlight_dict) + self.assertEqual(result, o_result) def test_get_accuracy_graph_mode(self): result = [] - compare.get_accuracy(result, 
npu_dict_aten, bench_dict_functional) + compare.get_accuracy(result, npu_dict_aten, bench_dict_functional, highlight_dict) self.assertEqual(result, aten_result) + + def test_check_order_magnitude(self): + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_order_magnitude(api_summary_info_1, color_columns, summary_compare=True) + self.assertEqual(yellow_lines,[num_1]) + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_order_magnitude(api_summary_info_2, color_columns, summary_compare=True) + self.assertEqual(yellow_lines, []) + + def test_check_max_relative_diff(self): + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_max_relative_diff(api_summary_info_1, color_columns, summary_compare=True) + self.assertEqual(red_lines, [num_1]) + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_max_relative_diff(api_summary_info_2, color_columns, summary_compare=True) + self.assertEqual(yellow_lines, [num_2]) + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_max_relative_diff(api_summary_info_3, color_columns, summary_compare=True) + self.assertEqual(red_lines, []) + self.assertEqual(yellow_lines, []) + + def test_check_one_thousand_error_ratio(self): + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_one_thousand_error_ratio(api_info_1, color_columns, summary_compare=False) + self.assertEqual(red_lines, [num_1]) + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_one_thousand_error_ratio(api_info_2, color_columns, summary_compare=False) + self.assertEqual(yellow_lines, [num_2]) + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_one_thousand_error_ratio(api_info_3, color_columns, summary_compare=False) + self.assertEqual(red_lines, []) + self.assertEqual(yellow_lines, []) + + def test_check_cosine_similarity(self): + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_cosine_similarity(api_info_1, color_columns, summary_compare=False) + self.assertEqual(yellow_lines, [num_1]) + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_cosine_similarity(api_info_2, color_columns, summary_compare=False) + self.assertEqual(yellow_lines, []) + + def test_check_overflow(self): + line_info0 = LineInfo(line_data=line_input, num_pointer=num_0) + line_info1 = LineInfo(line_data=line_1, num_pointer=num_1) + line_info2 = LineInfo(line_data=line_2, num_pointer=num_2) + line_info3 = LineInfo(line_data=line_3, num_pointer=num_3) + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_overflow(line_info1, color_columns, summary_compare=False) + self.assertEqual(red_lines, [num_1]) + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + compare.check_overflow(line_info2, color_columns, summary_compare=False) + self.assertEqual(red_lines, []) + red_lines, yellow_lines = [], [] + color_columns = ColorColumns(red=red_lines, yellow=yellow_lines) + 
compare.check_overflow(line_info3, color_columns, summary_compare=False) + self.assertEqual(red_lines, [num_3]) + + def test_find_error_rows(self): + summary_result = [summary_line_input, summary_line_1, summary_line_2, summary_line_3] + highlight_dict = {'red_rows': [], 'yellow_rows': []} + compare.find_error_rows(summary_result, 0, 1, highlight_dict, summary_compare=True) + self.assertEqual(highlight_dict,{'red_rows': [num_1], 'yellow_rows':[num_2]}) + + def test_find_compare_result_error_rows(self): + result = [line_input, line_1, line_2, line_3] + result_df = pd.DataFrame(result) + highlight_dict = {'red_rows': [], 'yellow_rows': []} + compare.find_compare_result_error_rows(result_df, highlight_dict) + self.assertEqual(highlight_dict, {'red_rows': [num_1, num_3], 'yellow_rows': [num_2]}) \ No newline at end of file diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py index f9f31b74c0e058906056c18a8dab3709b8a68a7f..c33f830b90f8a20188d1863cc886f8bf39837ba5 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py @@ -6,26 +6,21 @@ import shutil import unittest from ptdbg_ascend.advisor.advisor import Advisor from ptdbg_ascend.common.file_check_util import FileCheckException +from ptdbg_ascend.common.utils import CompareException +import pandas class TestAdvisor(unittest.TestCase): def setUp(self) -> None: - os.makedirs("test_result/output", exist_ok=True) + os.makedirs("test_result/output", mode=0o700, exist_ok=True) self.output_path = os.path.abspath("test_result/output") def tearDown(self) -> None: shutil.rmtree("test_result/", ignore_errors=True) - def test_analysis_when_csv_path_is_not_exist(self): - advisor = Advisor("resources/compare/test.pkl", self.output_path) - self.assertRaises(FileCheckException, advisor.analysis) - - def test_analysis_when_csv_path_is_invalid(self): - advisor = Advisor("resources/compare/npu_test_1.pkl", self.output_path) - self.assertRaises(FileCheckException, advisor.analysis) - def test_analysis_when_csv_is_valid(self): - advisor = Advisor("resources/compare/compare_result_20230703104808.csv", self.output_path) + input_data = pandas.read_csv("resources/compare/compare_result_20230703104808.csv") + advisor = Advisor(input_data, self.output_path) advisor.analysis() filenames = os.listdir(self.output_path) self.assertEqual(len(filenames), 1) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor_result.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor_result.py index 1c840391591526541784a2456ee1d7b3249d3324..80d3c3c4508170591ed035082916b2a0f74dc52e 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor_result.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor_result.py @@ -6,11 +6,12 @@ import os import shutil import unittest from ptdbg_ascend.advisor.advisor import Advisor +import pandas class TestAdvisor(unittest.TestCase): def setUp(self) -> None: - os.makedirs("test_result/output", exist_ok=True) + os.makedirs("test_result/output", mode=0o700, exist_ok=True) self.output_path = os.path.abspath("test_result/output") self.has_error = False @@ -18,7 +19,8 @@ class TestAdvisor(unittest.TestCase): shutil.rmtree("test_result/", ignore_errors=True) def test_advisor_summary_file(self): - advisor = Advisor("resources/compare/compare_result_20230703104808.csv", self.output_path) + input_data = pandas.read_csv("resources/compare/compare_result_20230703104808.csv") + 
advisor = Advisor(input_data, self.output_path) advisor.analysis() filenames = os.listdir(self.output_path) for filename in filenames: diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_common_util.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_common_util.py index 5d9394a20ce3be99f7e4a1cda4940fc950b87149..4c91a6928c02c4a1e9eec5b21a4dc43d65ee631b 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_common_util.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_common_util.py @@ -47,13 +47,18 @@ class TestCommonUtilsMethods(unittest.TestCase): csv_name = '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) self.assertEqual(common.add_time_as_suffix(name), csv_name) + def test_add_time_with_xlsx(self): + name = "op_cmp" + xlsx_name = '{}_{}.xlsx'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + self.assertEqual(common.add_time_with_xlsx(name), xlsx_name) + def test_get_time(self): time = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") self.assertEqual(common.get_time(), time) def test_format_value(self): value = 12345.6789 - format_value = '{:.12f}'.format(value) + format_value = float('{:.12f}'.format(value)) self.assertEqual(common.format_value(value), format_value) def test_modify_dump_path(self): diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py index 9ae980102121314205446bcd4e4d80fadbd74dad..b550954236f3e6c494efd4d69593da085965b9c5 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py @@ -3,7 +3,7 @@ import torch import pytest import ptdbg_ascend.common.utils as utils -from ptdbg_ascend.common.utils import CompareException, is_md5_compare, get_md5_for_tensor +from ptdbg_ascend.common.utils import CompareException, get_md5_for_tensor from ptdbg_ascend.common.file_check_util import FileCheckException @@ -32,10 +32,6 @@ class TestUtilsMethods(unittest.TestCase): utils.check_file_size(file, 0) self.assertEqual(error.value.code, CompareException.INVALID_FILE_ERROR) - def test_is_md5_compare(self): - input_param = {"npu_pkl_path": "resources/compare/npu_test.pkl"} - result = is_md5_compare(input_param) - self.assertFalse(result) def test_get_md5_for_tensor(self): data = [[1, 2], [3, 4]] diff --git a/debug/accuracy_tools/setup.py b/debug/accuracy_tools/setup.py index 886d230906476909b7e88eade5424e8d20aa883a..f1579a7e416e946e7f76ae2f78cc05d112cfc22d 100644 --- a/debug/accuracy_tools/setup.py +++ b/debug/accuracy_tools/setup.py @@ -19,7 +19,7 @@ from setuptools import setup, find_packages setup( name='ascend_training_accuracy_tools', - version='0.0.1', + version='0.0.3', description='This is a pytorch precision comparison tools', long_description='This is a pytorch precision comparison tools, include ptdbg and api accuracy checker', packages=find_packages(), diff --git a/debug/accuracy_tools/test/pytorch/free_benchmark/test_perturbed_layser.py b/debug/accuracy_tools/test/pytorch/free_benchmark/test_perturbed_layser.py new file mode 100644 index 0000000000000000000000000000000000000000..7fea7fa8e095fece43c408e07bde970206fcccc0 --- /dev/null +++ b/debug/accuracy_tools/test/pytorch/free_benchmark/test_perturbed_layser.py @@ -0,0 +1,92 @@ +from unittest import TestCase + +import torch +from atat.pytorch.common.utils import Const +from atat.pytorch.free_benchmark.common.enums import DeviceType, PerturbationMode +from atat.pytorch.free_benchmark.common.params 
import data_pre_deal +from atat.pytorch.free_benchmark.perturbed_layers.layer_factory import LayerFactory + + +class TestPerturbedLayer(TestCase): + + # For operators whose output dtype matches the input dtype, applying the improve-precision perturbation promotes the output dtype as well + def test_improve_precision_layer_handle_with_out_dtype_changing(self): + api_name = "Torch.mul.0.forward" + x = torch.randn(2, 3, dtype=torch.float16) + y = torch.randn(2, 3, dtype=torch.float16) + out = torch.mul(x, y) + + data_params = data_pre_deal(api_name, torch.mul, (x, y), {}) + data_params.fuzz_stage = Const.FORWARD + data_params.original_result = out + + layer = LayerFactory.create( + api_name, DeviceType.NPU, PerturbationMode.IMPROVE_PRECISION + ) + layer.handle(data_params) + self.assertEqual(data_params.original_result.dtype, torch.float16) + self.assertEqual(layer.perturbed_value, torch.float32) + self.assertEqual(data_params.perturbed_result.dtype, torch.float32) +
+ # For iterable inputs, the improve-precision method traverses the elements and promotes every input of a supported dtype + def test_improve_precision_layer_with_iterable_inputs(self): + api_name = "iterable.0.forward" + tensor_a = torch.randn(2, 3, dtype=torch.bfloat16) + tensor_b = torch.randn(2, 3, dtype=torch.float16) + tensor_c = torch.randn(2, 3, dtype=torch.float32) + tensor_d = torch.randn(2, 3, dtype=torch.float64) + tensor_f = torch.randn(2, 3, dtype=torch.float64).to(torch.int32) + inputs = [tensor_a, tensor_b, {"c": tensor_c, "d": tensor_d}, tensor_f] + + layer = LayerFactory.create( + api_name, DeviceType.NPU, PerturbationMode.IMPROVE_PRECISION + ) + perturbed_value = layer.improve_tensor_precision(inputs) + self.assertEqual(perturbed_value[0].dtype, torch.float32) + self.assertEqual(perturbed_value[1].dtype, torch.float32) + self.assertEqual(perturbed_value[2]["c"].dtype, torch.float32) + self.assertEqual(perturbed_value[2]["d"].dtype, torch.float64) + self.assertEqual(perturbed_value[3].dtype, torch.int32) +
+ # The no_change perturbation leaves the input unchanged + def test_no_change_layer(self): + api_name = "nochange.0.forward" + inputs = torch.as_tensor([1e-9, 1e-2], dtype=torch.float32) + layer = LayerFactory.create( + api_name, DeviceType.NPU, PerturbationMode.NO_CHANGE + ) + perturbed_value = layer.no_change(inputs) + self.assertEqual(perturbed_value[0], 1e-9) + self.assertEqual(perturbed_value[1], 1e-2) +
+ # For 1-D and 2-D tensors, the change_value perturbation swaps the first and last values + def test_change_value_layer(self): + api_name = "change.0.forward" + inputs_1dim = torch.as_tensor([1e-9, 1e-7, 1e-2], dtype=torch.float32) + inputs_2dim = torch.as_tensor( + [[1e-9, 1e-7, 1e-2], [1e-9, 1e-2, 1e-7]], dtype=torch.float32 + ) + layer = LayerFactory.create( + api_name, DeviceType.NPU, PerturbationMode.CHANGE_VALUE + ) + perturbed_value_1dim = layer.change_value(inputs_1dim) + layer.is_added = False + perturbed_value_2dim = layer.change_value(inputs_2dim) + self.assertEqual(perturbed_value_1dim[0], 1e-2) + self.assertEqual(perturbed_value_1dim[2], 1e-9) + self.assertEqual(perturbed_value_2dim[0][0], 1e-7) + self.assertEqual(perturbed_value_2dim[-1][-1], 1e-9) +
+ # For input tensors, the bit_noise perturbation flips the least significant bit of every element above a minimal threshold + def test_bit_noise_layer(self): + api_name = "bitnoise.0.forward" + inputs = torch.as_tensor( + [4096.00048828125, 16777216, 1e-38], dtype=torch.float32 + ) + layer = LayerFactory.create( + api_name, DeviceType.NPU, PerturbationMode.BIT_NOISE + ) + perturbed_value = layer.add_bit_noise(inputs) + self.assertEqual(perturbed_value[0], 4096.0000000000) + self.assertEqual(perturbed_value[1], 16777218) + self.assertEqual(perturbed_value[2], 1e-38) diff --git a/debug/weight_convert/README.md b/debug/weight_convert/README.md deleted file mode 100644 index 
6cc4e2481fbee3460fa79d600bd680721db4e6b7..0000000000000000000000000000000000000000 --- a/debug/weight_convert/README.md +++ /dev/null @@ -1,90 +0,0 @@ -## Unified Training-Inference Weight Conversion Tool - -Inference scenarios run on Huggingface weights. Huggingface models are mostly trained on GPUs, whereas Ascend models are trained on NPUs, and the model weight formats differ across hardware platforms. Models trained on NPUs or GPUs therefore need to be converted into Huggingface-format safetensors files for use in inference. - - -#### Prerequisites -Prepare the following weights: -1. The huggingface weights from before retraining -2. The retrained weights - -##### Installing dependencies - -If you install as a non-root user, append --user to each install command, for example: **pip3 install xxx** **--user**. The install commands can be executed from any directory. -
-```shell -# python=3.8 -pip install torch-2.1.0-cp38-cp38m-linux_aarch64.whl -pip install torch_npu-2.1.0.post5_XXXXXX-cp38-cp38m-linux_aarch64.whl -source /path/to/Ascend/ascend-toolkit/set_env.sh - -git clone https://gitee.com/ascend/ModelLink.git -git clone https://gitee.com/ascend/AscendSpeed.git - -cd AscendSpeed -pip3 install -r requirements.txt -pip3 install -e . -cd .. -cd ModelLink -pip3 install -r requirements.txt -export PYTHONPATH=`pwd`:$PYTHONPATH -cd .. -``` - -##### Getting the code - -```shell -git clone https://gitee.com/Ascend/att.git -cd att -git checkout develop - -cd ../ModelLink -git reset --hard c566ce4fa99cf3ea179b163355fca2c2aedfc471 -cp ../att/debug/weight_convert/diff.patch . -git apply --check diff.patch -git apply diff.patch -cd ../att/debug/weight_convert/ -``` -
-#### Launching the tool - -1. Configure the options by referring to Table 1 Parameter description and run the following command to start the conversion task. The converted weights are saved under `<original huggingface weight directory>/mg2hf`. - -```shell -python3 convert_ckpt.py -i <path of the weights to convert> -o <directory of the original huggingface weights> -m <model type: llama or bloom>\ - [--target-tensor-parallel-size <tensor parallel size> \ - --target-pipeline-parallel-size <pipeline parallel size>\ - --embed-layernorm] -``` - - **Table 1 Parameter description** - - | Parameter | Description | Example | - | ---------------------------------- | -------------------------------------- | ------------------------------------------------------------ | - | -i<br>--input-model-dir | **Required.** Directory containing the weight files to convert | /home/*xxx*/*input_weight* | - | -o<br>--output-model-dir | **Required.** Directory where the exported weight files are saved (the original huggingface weights must already be in this directory) | /home/*xxx*/*output_weight* | - | -m<br>--model | **Required.** Type of model to convert | llama (default)<br>bloom | - | --target-tensor-parallel-size | Tensor parallel size after conversion | 1 | - | --target-pipeline-parallel-size | Pipeline parallel size after conversion | 1 | - | --embed-layernorm | Whether the model contains an embedding layernorm structure | False (default)<br>True | - | -h<br>--help | Show the help message. | - | - -
-2. Reference model conversion commands - - **Llama 7/13/65B**, **Llama2 7/13/70B** -```shell -python3 convert_ckpt.py -o "your huggingface checkpoint output path" \ - -i "your megatron checkpoint path" \ - --model llama -``` - - **Bloom 7B** -```shell -python3 convert_ckpt.py -o "your huggingface checkpoint output path" \ - -i "your megatron checkpoint path" \ - --model bloom -``` - - -3. After the conversion completes, go to the output directory and check the results. \ No newline at end of file diff --git a/debug/weight_convert/bloom.py b/debug/weight_convert/bloom.py deleted file mode 100644 index d884d451f3282be197df967ebdd979a20c6775c1..0000000000000000000000000000000000000000 --- a/debug/weight_convert/bloom.py +++ /dev/null @@ -1,526 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the License); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -from collections.abc import Mapping -import concurrent.futures -import os -import gc -import sys -import shutil -import torch -import torch_npu - -
-def add_arguments(parser): - group = parser.add_argument_group(title='Megatron saver') - group.add_argument('--target-tensor-parallel-size', type=int, - help='Target tensor model parallel size, defaults to the tensor parallel size ' - 'in the input checkpoint if provided by the loader, otherwise to 1') - group.add_argument('--target-pipeline-parallel-size', type=int, - help='Target tensor model parallel size, default to the pipeline parall size ' - 'in the input checkpoint if provided by the loader, otherwise to 1') - group.add_argument("--w-pack", type=bool, - help='True is w_pack weight for llm', - default=False) - -
-def save_huggingface_bloom(args, model, model_args): - hf2mg_map = {} - for name_param_m in model.named_parameters(): - print("name_param_m", name_param_m[0]) - layer_num = name_param_m[0].split(".")[3] if len(name_param_m[0].split(".")) > 3 else name_param_m[0].split(".")[1] - nh = model_args.num_attention_heads - ng = ( - model_args.checkpoint_args.num_query_groups - if model_args.checkpoint_args.group_query_attention - else model_args.num_attention_heads - ) - repeats = nh // ng - # word embedding - if name_param_m[0] == "language_model.embedding.word_embeddings.weight": - hf2mg_map["word_embeddings.weight"] = name_param_m[1] - continue - if name_param_m[0] == "language_model.embedding.word_embeddings.norm.weight": - hf2mg_map["word_embeddings_layernorm.weight"] = name_param_m[1] - continue - if name_param_m[0] == "language_model.embedding.word_embeddings.norm.bias": - hf2mg_map["word_embeddings_layernorm.bias"] = name_param_m[1] - continue - - # input layernorm - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.input_norm.weight": - hf2mg_map[f"h.{layer_num}.input_layernorm.weight"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.input_norm.bias": - hf2mg_map[f"h.{layer_num}.input_layernorm.bias"] = name_param_m[1] - continue - - # qkv - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.query_key_value.weight": -
hf2mg_map[f"h.{layer_num}.self_attention.query_key_value.weight"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.query_key_value.bias": - hf2mg_map[f"h.{layer_num}.self_attention.query_key_value.bias"] = name_param_m[1] - continue - - # post attention norm - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.post_attention_norm.weight": - hf2mg_map[f"h.{layer_num}.post_attention_layernorm.weight"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.post_attention_norm.bias": - hf2mg_map[f"h.{layer_num}.post_attention_layernorm.bias"] = name_param_m[1] - continue - - # dense - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.dense.weight": - hf2mg_map[f"h.{layer_num}.self_attention.dense.weight"] = name_param_m[ - 1 - ] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.dense.bias": - hf2mg_map[f"h.{layer_num}.self_attention.dense.bias"] = name_param_m[1] - continue - # mlp - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_h_to_4h.weight": - hf2mg_map[f"h.{layer_num}.mlp.dense_h_to_4h.weight"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_h_to_4h.bias": - hf2mg_map[f"h.{layer_num}.mlp.dense_h_to_4h.bias"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_4h_to_h.weight": - hf2mg_map[f"h.{layer_num}.mlp.dense_4h_to_h.weight"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_4h_to_h.bias": - hf2mg_map[f"h.{layer_num}.mlp.dense_4h_to_h.bias"] = name_param_m[1] - continue - # final norm - if name_param_m[0] == "language_model.encoder.final_norm.weight": - hf2mg_map[f"ln_f.weight"] = name_param_m[1] - continue - if name_param_m[0] == "language_model.encoder.final_norm.bias": - hf2mg_map[f"ln_f.bias"] = name_param_m[1] - continue - print('hf2mg_map.keys', hf2mg_map.keys()) - - gc.collect() - file_format = "pytorch_model-{:05d}-of-{:05d}.bin" - file_list = os.listdir(args.output_model_dir) - output_mg2hg_path = os.path.join(args.output_model_dir, 'mg2hg') - os.makedirs(output_mg2hg_path, exist_ok=True) - for filename in file_list: - if filename.startswith("pytorch_model-") and filename.endswith(".bin"): - try: - start_index = len("pytorch_model-") - end_index = filename.index("-of-") - index1 = int(filename[start_index:end_index]) - index2 = int(filename[end_index + len("-of-"): -len(".bin")]) - if 0 <= index1 <= index2 <= 99999: - formatted_filename = file_format.format(index1, index2) - file_path = os.path.join(args.output_model_dir, formatted_filename) - hf_model = torch.load(file_path, map_location=torch.device('cpu')) - for key in hf_model.keys(): - print(key) - if key in hf2mg_map.keys(): - hf_model[key].data.copy_(hf2mg_map[key]) - output_file_path = os.path.join(output_mg2hg_path, formatted_filename) - print(f'save weight to {output_file_path}') - # torch.save(hf_model, output_file_path) - else: - print("Invalid file format:", filename) - except ValueError: - print("Unable to save file:", filename) - elif filename.endswith('.json') or filename.endswith('.model'): - source_file = os.path.join(args.output_model_dir, filename) - destination_file = os.path.join(output_mg2hg_path, filename) - shutil.copyfile(source_file, destination_file) - output_model_dir = os.path.join(args.output_model_dir, 
'mg2hg') - print(f'save weight to {output_model_dir}') - - -def save_model_checkpoint(queue, args): - - # Search in directory above this - # sys.path.append(os.path.abspath( - # os.path.join(os.path.dirname(__file__), - # os.path.pardir, - # os.path.pardir))) - - from pretrain_gpt import model_provider - from ascendspeed import megatron_adaptor - from megatron.arguments import validate_args - from modellink.utils import parse_args - from megatron.checkpointing import save_checkpoint - from megatron.global_vars import set_global_variables, get_args - from megatron.core.enums import ModelType - from megatron.tokenizer.tokenizer import _vocab_size_with_padding - from megatron import fused_kernels - from megatron.core import mpu - - def queue_get(name=None): - val = queue.get() - if val == "exit": - raise RuntimeError('Loader exited, exiting saver') - if name is not None and args.checking and val["name"] != name: - val_name = val["name"] - raise RuntimeError(f'Unexpected message. Expecting "{name}" but got "{val_name}". Exiting saver.') - if name is not None: - print(f"received {name}") - return val - - def check_message(msg): - if not args.checking: - return - msg_name = msg.pop("name") - if len(msg.keys()) > 0: - print(f"Unexpected values in {msg_name}:") - for key in msg.keys(): - print(f" {key}") - raise RuntimeError(f"Exiting. If you want to ignore this, use the argument --no-checking.") - - - md = queue_get() - - if args.target_tensor_parallel_size is None: - if hasattr(md, 'previous_tensor_parallel_size'): - args.target_tensor_parallel_size = md.previous_tensor_parallel_size - else: - print("loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. " - "Default to 1.") - args.target_tensor_parallel_size = 1 - - if args.target_pipeline_parallel_size is None: - if hasattr(md, 'previous_pipeline_parallel_size'): - args.target_pipeline_parallel_size = md.previous_pipeline_parallel_size - else: - print("loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. 
" - "Default to 1.") - args.target_pipeline_parallel_size = 1 - - - # Arguments do sanity checks on the world size, but we don't care, - # so trick it into thinking we are plenty of processes - if args.target_tensor_parallel_size is not None and args.target_pipeline_parallel_size is not None: - os.environ["WORLD_SIZE"] = f'{args.target_tensor_parallel_size * args.target_pipeline_parallel_size}' - - # We want all arguments to come from us - sys.argv = ['script.py', - '--num-layers', str(md.num_layers), - '--hidden-size', str(md.hidden_size), - '--seq-length', str(md.seq_length), - '--num-attention-heads', str(md.num_attention_heads), - '--max-position-embeddings', str(md.max_position_embeddings), - '--position-embedding-type', str(md.position_embedding_type), - '--tokenizer-type', str(md.tokenizer_type), - '--tensor-model-parallel-size', str(args.target_tensor_parallel_size), - '--pipeline-model-parallel-size', str(args.target_pipeline_parallel_size), - '--no-masked-softmax-fusion', - '--no-bias-gelu-fusion', - '--no-bias-dropout-fusion', - '--no-async-tensor-model-parallel-allreduce', - '--use-cpu-initialization', - '--micro-batch-size', '1', - '--no-load-optim', - '--no-load-rng', - '--no-save-optim', - '--no-save-rng', - '--no-initialization', - '--save-interval', '1', - '--save', args.output_model_dir, - '--fp16' - ] - - if md.make_vocab_size_divisible_by is not None: - sys.argv.extend(['--make-vocab-size-divisible-by', str(md.make_vocab_size_divisible_by)]) - if md.output_layer: - sys.argv.append('--untie-embeddings-and-output-weights') - if not md.linear_bias: - sys.argv.append('--disable-bias-linear') - - margs = parse_args() - margs.w_pack = args.w_pack - - - if hasattr(md, 'checkpoint_args'): - # These are arguments that we are either changing, or cause problems for validation if they are set - # Note that some of these deal with T5 so will need to be changed if we support T5. 
- args_to_keep = ['tensor_model_parallel_size', 'pipeline_model_parallel_size', 'world_size', 'params_dtype', - 'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size', - 'masked_softmax_fusion', 'bias_gelu_fusion', 'bias_dropout_fusion', - 'sequence_parallel', 'async_tensor_model_parallel_allreduce', - 'no_load_optim', 'no_load_rng', 'no_save_optim', 'no_save_rng', - 'vocab_file', 'tokenizer_model', - 'save_interval', 'save', - 'perform_initialization', 'use_cpu_initialization', - 'recompute_granularity', 'recompute_num_layers', 'recompute_method', - 'encoder_num_layers', 'encoder_seq_length', - 'distribute_saved_activations', - 'train_iters', 'lr_decay_iters', 'lr_warmup_iters', 'lr_warmup_fraction', - 'start_weight_decay', 'end_weight_decay'] - - - for arg, value in vars(md.checkpoint_args).items(): - if arg in args_to_keep: - continue - if not hasattr(margs, arg): - print(f"Checkpoint had argument {arg} but new arguments does not have this.") - continue - if getattr(margs, arg) != value: - print(f"Overwriting default {arg} value {getattr(margs, arg)} with value from checkpoint {value}.") - setattr(margs, arg, value) - - validate_args(margs) - - set_global_variables(margs, build_tokenizer=False) - - # margs = megatron args - margs = get_args() - - margs.model_type = ModelType.encoder_or_decoder - - if hasattr(md, 'consumed_train_samples'): - margs.consumed_train_samples = md.consumed_train_samples - margs.consumed_valid_samples = md.consumed_valid_samples - print(f"Setting consumed_train_samples to {margs.consumed_train_samples}" - f" and consumed_valid_samples to {margs.consumed_valid_samples}") - else: - print("consumed_train_samples not provided.") - - def get_models(count, dtype, pre_process, post_process): - models = [model_provider(pre_process, post_process).to(dtype) for _ in range(count)] - return models - - # fake initializing distributed - mpu.set_tensor_model_parallel_world_size(args.target_tensor_parallel_size) - mpu.set_pipeline_model_parallel_world_size(args.target_pipeline_parallel_size) - mpu.set_tensor_model_parallel_rank(0) - mpu.set_pipeline_model_parallel_rank(0) - # Embeddings - #----------- - embeddings_msg = queue_get("embeddings") - - pos_embed = None - if md.position_embedding_type == 'learned_absolute': - pos_embed = embeddings_msg.pop("position embeddings") - orig_word_embed = embeddings_msg.pop("word embeddings") - orig_word_embed_n_w, orig_word_embed_n_b = None, None - if "word embeddings norm_w" in embeddings_msg and "word embeddings norm_b" in embeddings_msg: - orig_word_embed_n_w = embeddings_msg.pop("word embeddings norm_w") - orig_word_embed_n_b = embeddings_msg.pop("word embeddings norm_b") - check_message(embeddings_msg) - - # Deal with padding - if md.true_vocab_size is not None: - # figure out what our padded vocab size is - orig_vocab_size = orig_word_embed.shape[0] - margs.padded_vocab_size = _vocab_size_with_padding(md.true_vocab_size, margs) - - # Cut out extra padding we don't need - if orig_vocab_size > margs.padded_vocab_size: - full_word_embed = orig_word_embed[0:margs.padded_vocab_size, :] - - # Expanding embedding to larger size by replicating final entry - elif orig_vocab_size < margs.padded_vocab_size: - padding_size = margs.padded_vocab_size - orig_vocab_size - - full_word_embed = torch.cat(( - orig_word_embed, - orig_word_embed[-1].unsqueeze(0).expand(padding_size, -1))) - - # Same size! - else: - full_word_embed = orig_word_embed - else: - print("Original vocab size not specified, leaving embedding table as-is. 
" - "If you've changed the tensor parallel size this could cause problems.") - margs.padded_vocab_size = orig_word_embed.shape[0] - full_word_embed = orig_word_embed - - # Split into new tensor model parallel sizes - out_word_embed = torch.chunk(full_word_embed, args.target_tensor_parallel_size, dim=0) - - # Make models for first pipeline stage and fill in embeddings - mpu.set_pipeline_model_parallel_rank(0) - post_process = args.target_pipeline_parallel_size == 1 - models = get_models(args.target_tensor_parallel_size, md.params_dtype, True, post_process) - for tp_rank, model in enumerate(models): - model.language_model.embedding.word_embeddings.weight.data.copy_(out_word_embed[tp_rank]) - if orig_word_embed_n_w is not None: - model.language_model.embedding.word_embeddings.norm.weight.data.copy_(orig_word_embed_n_w) - model.language_model.embedding.word_embeddings.norm.bias.data.copy_(orig_word_embed_n_b) - if pos_embed is not None: - model.language_model.embedding.position_embeddings.weight.data.copy_(pos_embed) - else: - if hasattr(model.language_model.embedding, 'position_embeddings'): - raise ValueError("model should have position_embeddings") - - # Transformer layers - #------------------- - total_layer_num = 0 - for pp_rank in range(args.target_pipeline_parallel_size): - # For later pipeline parallel ranks, make the new models - if pp_rank > 0: - mpu.set_pipeline_model_parallel_rank(pp_rank) - post_process = pp_rank == args.target_pipeline_parallel_size - 1 - models = get_models(args.target_tensor_parallel_size, md.params_dtype, False, post_process) - - encoder_layer_num = len(models[0].language_model.encoder.layers) - for layer in range(encoder_layer_num): - msg = queue_get(f"transformer layer {total_layer_num}") - - # duplicated tensors - input_norm_weight = msg.pop("input norm weight") - if md.norm_has_bias: - input_norm_bias = msg.pop("input norm bias") - post_norm_weight = msg.pop("post norm weight") - if md.norm_has_bias: - post_norm_bias = msg.pop("post norm bias") - if md.linear_bias: - dense_bias = msg.pop("dense bias") - mlp_l1_bias = msg.pop("mlp l1 bias") - - if args.add_qkv_bias: - qkv_bias = torch.chunk(msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) - if args.add_dense_bias: - dense_bias = msg.pop("dense bias") - - qkv_org = msg.pop("qkv weight") - qkv_weight = torch.chunk(qkv_org, args.target_tensor_parallel_size, dim=0) - - # Split up the parallel tensors - dense_weight = torch.chunk(msg.pop("dense weight"), args.target_tensor_parallel_size, dim=1) - mlp_l1_weight = torch.chunk(msg.pop("mlp l1 weight"), args.target_tensor_parallel_size, dim=1) - - # Special handling for swiglu - if md.swiglu: - mlp_l0_weight_W = torch.chunk(msg.pop("mlp l0 weight W"), args.target_tensor_parallel_size, dim=0) - mlp_l0_weight_V = torch.chunk(msg.pop("mlp l0 weight V"), args.target_tensor_parallel_size, dim=0) - mlp_l0_weight = [torch.cat(weights, dim=0) for weights in zip(mlp_l0_weight_W, mlp_l0_weight_V)] - else: - mlp_l0_weight = torch.chunk(msg.pop("mlp l0 weight"), args.target_tensor_parallel_size, dim=0) - - if md.linear_bias: - qkv_bias = torch.chunk(msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) - if md.swiglu: - mlp_l0_bias_W = torch.chunk(msg.pop("mlp l0 bias W"), args.target_tensor_parallel_size, dim=0) - mlp_l0_bias_V = torch.chunk(msg.pop("mlp l0 bias V"), args.target_tensor_parallel_size, dim=0) - mlp_l0_weight = [] - for weights in zip(mlp_l0_weight_W, mlp_l0_weight_V): - mlp_l0_weight.append(torch.cat(weights, dim=0)) - else: - mlp_l0_bias = 
torch.chunk(msg.pop("mlp l0 bias"), args.target_tensor_parallel_size, dim=0) - - # Save them to the model - for tp_rank in range(args.target_tensor_parallel_size): - layer_encoder = models[tp_rank].language_model.encoder.layers[layer] - layer_encoder.input_norm.weight.data.copy_(input_norm_weight) - if md.norm_has_bias: - layer_encoder.input_norm.bias.data.copy_(input_norm_bias) - layer_encoder.self_attention.query_key_value.weight.data.copy_(qkv_weight[tp_rank]) - layer_encoder.self_attention.dense.weight.data.copy_(dense_weight[tp_rank]) - layer_encoder.post_attention_norm.weight.data.copy_(post_norm_weight) - if md.norm_has_bias: - layer_encoder.post_attention_norm.bias.data.copy_(post_norm_bias) - layer_encoder.mlp.dense_h_to_4h.weight.data.copy_(mlp_l0_weight[tp_rank]) - layer_encoder.mlp.dense_4h_to_h.weight.data.copy_(mlp_l1_weight[tp_rank]) - if md.linear_bias: - layer_encoder.self_attention.query_key_value.bias.data.copy_(qkv_bias[tp_rank]) - layer_encoder.self_attention.dense.bias.data.copy_(dense_bias) - layer_encoder.mlp.dense_h_to_4h.bias.data.copy_(mlp_l0_bias[tp_rank]) - layer_encoder.mlp.dense_4h_to_h.bias.data.copy_(mlp_l1_bias) - if args.add_qkv_bias: - layer_encoder.self_attention.query_key_value.bias.data.copy_(qkv_bias[tp_rank]) - if args.add_dense_bias: - layer_encoder.self_attention.dense.bias.data.copy_(dense_bias) - - total_layer_num = total_layer_num + 1 - check_message(msg) - - if post_process: - msg = queue_get("final norm") - final_norm_weight = msg.pop("weight") - if md.norm_has_bias: - final_norm_bias = msg.pop("bias") - for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].language_model.encoder.final_norm.weight.data.copy_(final_norm_weight) - if md.norm_has_bias: - models[tp_rank].language_model.encoder.final_norm.bias.data.copy_(final_norm_bias) - if pp_rank != 0 and not md.output_layer: - # Copy word embeddings to final pipeline rank - models[tp_rank].word_embeddings.weight.data.copy_(out_word_embed[tp_rank]) - del final_norm_weight - if md.norm_has_bias: - del final_norm_bias - check_message(msg) - - if md.output_layer: - msg = queue_get("output layer") - if not hasattr(models[0].language_model, 'output_layer'): - raise RuntimeError("ERROR: got an output layer, but model does not have one") - output_layer_weight = torch.chunk(msg.pop("weight"), args.target_tensor_parallel_size, dim=0) - for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].language_model.output_layer.weight.data.copy_(output_layer_weight[tp_rank]) - del output_layer_weight - check_message(msg) - - msg = queue_get() - if msg != "done" and msg["name"] == "pooler": - if not hasattr(models[0].language_model, 'pooler'): - raise RuntimeError("ERROR: got a pooler, but model does not have one") - print("received pooler") - pooler_weight = msg.pop("weight") - pooler_bias = msg.pop("bias") - for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].language_model.pooler.dense.weight.data.copy_(pooler_weight) - models[tp_rank].language_model.pooler.dense.bias.data.copy_(pooler_bias) - del pooler_weight - del pooler_bias - check_message(msg) - msg = queue_get() - - if msg != "done" and msg["name"] == "lm head": - if not hasattr(models[0], 'lm_head'): - raise RuntimeError("ERROR: got an lm head, but model does not have one") - print("received lm head") - lm_head_dense_weight = msg.pop("dense weight") - lm_head_dense_bias = msg.pop("dense bias") - lm_head_norm_weight = msg.pop("norm weight") - if md.norm_has_bias: - lm_head_norm_bias = 
msg.pop("norm bias") - for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].lm_head.dense.weight.data.copy_(lm_head_dense_weight) - models[tp_rank].lm_head.dense.bias.data.copy_(lm_head_dense_bias) - models[tp_rank].lm_head.norm.weight.data.copy_(lm_head_norm_weight) - if md.norm_has_bias: - models[tp_rank].lm_head.norm.bias.data.copy_(lm_head_norm_bias) - check_message(msg) - msg = queue_get() - - if msg != "done" and msg["name"] == "binary head": - if not hasattr(models[0], 'binary_head'): - raise RuntimeError("ERROR: got a binary head, but model does not have one") - print("received binary head") - binary_head_weight = msg.pop("weight") - binary_head_bias = msg.pop("bias") - for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].binary_head.weight.data.copy_(binary_head_weight) - models[tp_rank].binary_head.bias.data.copy_(binary_head_bias) - check_message(msg) - msg = queue_get() - - if msg != "done": - print("ERROR: got some more data but was expecting to be done") - - for tp_rank in range(args.target_tensor_parallel_size): - mpu.set_tensor_model_parallel_rank(tp_rank) - save_huggingface_bloom(args, models[tp_rank], md) diff --git a/debug/weight_convert/convert_ckpt.py b/debug/weight_convert/convert_ckpt.py deleted file mode 100644 index b88a73c361f5dbb96574fdbd509bfd57a887f27a..0000000000000000000000000000000000000000 --- a/debug/weight_convert/convert_ckpt.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the License); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import sys -import argparse -import importlib -import torch.multiprocessing as mp - - -def check_and_convert_weight(args): - import torch - from transformers import AutoModelForCausalLM - try: - output_mg2hg_path = os.path.join(args.output_model_dir, 'mg2hg') - hf_model = AutoModelForCausalLM.from_pretrained( - output_mg2hg_path, device_map="cpu", torch_dtype=torch.float16) - hf_model.save_pretrained(output_mg2hg_path, safe_serialization=True) - except ModuleNotFoundError as e: - print('failed to convert bin 2 safetensors') - raise exc from e - - -def load_model(model_name): - module_name = f"{model_name}" - try: - converter = importlib.import_module(module_name) - except ModuleNotFoundError as e: - raise exc from e - return converter - - -def main(): - parser = argparse.ArgumentParser( - description="convert weight to huggingface format") - - parser.add_argument('-m', '--model', type=str, required=True, - choices=['llama', 'bloom', 'gptneox'], - help='Type of the model') - parser.add_argument('-i', '--input-model-dir', type=str, required=True, - help='Directory to load model checkpoint from') - parser.add_argument('-o', '--output-model-dir', type=str, required=True, - help='Directory to save model checkpoint to') - parser.add_argument('--no-checking', action='store_false', - help='Do not perform checking on the name and ordering of weights', - dest='checking') - parser.add_argument('--convert-to-safetensors', action='store_false', - help='convert .bin to safetensors') - - known_args, _ = parser.parse_known_args() - loader = importlib.import_module('load_utils') - saver = load_model(known_args.model) - - loader.add_arguments(parser) - saver.add_arguments(parser) - - args = parser.parse_args() - - queue = mp.Queue(maxsize=50) - - print("Starting saver...") - saver_proc = mp.Process( - target=saver.save_model_checkpoint, args=(queue, args)) - saver_proc.start() - - print("Starting loader...") - loader.load_checkpoint(queue, args) - - print("Waiting for saver to complete...") - saver_proc.join() - - if args.convert_to_safetensors: - print("converting .bin to safetensors...") - check_and_convert_weight(args) - - print("Done!") - - -if __name__ == '__main__': - main() diff --git a/debug/weight_convert/diff.patch b/debug/weight_convert/diff.patch deleted file mode 100644 index 134890cee45c7a7373187b46c0888f2552c1861d..0000000000000000000000000000000000000000 --- a/debug/weight_convert/diff.patch +++ /dev/null @@ -1,76 +0,0 @@ -diff --git a/modellink/__init__.py b/modellink/__init__.py -index ceae8c4..837386e 100644 ---- a/modellink/__init__.py -+++ b/modellink/__init__.py -@@ -22,7 +22,7 @@ except Exception as e: - logging.warning("Warning: You did not install torch_npu") - - --from .model_adaptor import exe_adaptor -+# from .model_adaptor import exe_adaptor - - --exe_adaptor() -+# exe_adaptor() -diff --git a/modellink/model_adaptor.py b/modellink/model_adaptor.py -index 7c1bda1..7376b98 100644 ---- a/modellink/model_adaptor.py -+++ b/modellink/model_adaptor.py -@@ -26,32 +26,32 @@ from .utils import ALL_MODULE_WRAPPER_CLASSNAMES - from .checkpointing import _load_base_checkpoint_wrapper, load_checkpoint_wrapper - - --def exe_adaptor(): -- import megatron -- megatron.utils.ALL_MODULE_WRAPPER_CLASSNAMES = ALL_MODULE_WRAPPER_CLASSNAMES -- megatron.initialize.parse_args = parse_args_decorator(megatron.initialize.parse_args) -- megatron.arguments.parse_args = parse_args_decorator(megatron.arguments.parse_args) -- megatron.global_vars.build_tokenizer = build_tokenizer -+# def 
exe_adaptor(): -+# import megatron -+# megatron.utils.ALL_MODULE_WRAPPER_CLASSNAMES = ALL_MODULE_WRAPPER_CLASSNAMES -+# megatron.initialize.parse_args = parse_args_decorator(megatron.initialize.parse_args) -+# megatron.arguments.parse_args = parse_args_decorator(megatron.arguments.parse_args) -+# megatron.global_vars.build_tokenizer = build_tokenizer - -- import megatron.training -- megatron.training.get_model = get_model_wrapper(megatron.training.get_model) -- megatron.training.build_pretraining_data_loader = build_pretraining_data_loader -+# import megatron.training -+# megatron.training.get_model = get_model_wrapper(megatron.training.get_model) -+# megatron.training.build_pretraining_data_loader = build_pretraining_data_loader - -- megatron.model.GPTModel = GPTModel -- megatron.model.transformer.SwitchMLP = SwitchMLP -- megatron.model.transformer.ParallelTransformer.__init__ = parallel_transformer_init -- megatron.model.transformer.ParallelTransformer.state_dict_for_save_checkpoint \ -- = state_dict_for_save_checkpoint_wrapper( -- megatron.model.transformer.ParallelTransformer.state_dict_for_save_checkpoint) -- megatron.model.language_model.TransformerLanguageModel.forward = (seq_length_wrapper( -- megatron.model.language_model.TransformerLanguageModel.forward)) -+# megatron.model.GPTModel = GPTModel -+# megatron.model.transformer.SwitchMLP = SwitchMLP -+# megatron.model.transformer.ParallelTransformer.__init__ = parallel_transformer_init -+# megatron.model.transformer.ParallelTransformer.state_dict_for_save_checkpoint \ -+# = state_dict_for_save_checkpoint_wrapper( -+# megatron.model.transformer.ParallelTransformer.state_dict_for_save_checkpoint) -+# megatron.model.language_model.TransformerLanguageModel.forward = (seq_length_wrapper( -+# megatron.model.language_model.TransformerLanguageModel.forward)) - -- megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward = vocab_embedding_wrapper( -- megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward) -- megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__ = norm_wrapper( -- megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__) -+# megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward = vocab_embedding_wrapper( -+# megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward) -+# megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__ = norm_wrapper( -+# megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__) - -- megatron.checkpointing._load_base_checkpoint = _load_base_checkpoint_wrapper( -- megatron.checkpointing._load_base_checkpoint) -- megatron.training.load_checkpoint = load_checkpoint_wrapper( -- megatron.checkpointing.load_checkpoint) -+# megatron.checkpointing._load_base_checkpoint = _load_base_checkpoint_wrapper( -+# megatron.checkpointing._load_base_checkpoint) -+# megatron.training.load_checkpoint = load_checkpoint_wrapper( -+# megatron.checkpointing.load_checkpoint) diff --git a/debug/weight_convert/llama.py b/debug/weight_convert/llama.py deleted file mode 100644 index 0ae2173c6cd5a1b1ffcb8fd1ee56e58aa05fe646..0000000000000000000000000000000000000000 --- a/debug/weight_convert/llama.py +++ /dev/null @@ -1,560 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the License); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -from collections.abc import Mapping -import concurrent.futures -import os -import gc -import sys -import shutil -import torch -import torch_npu - - -def add_arguments(parser): - group = parser.add_argument_group(title='Megatron saver') - group.add_argument('--target-tensor-parallel-size', type=int, - help='Target tensor model parallel size, defaults to the tensor parallel size ' - 'in the input checkpoint if provided by the loader, otherwise to 1') - group.add_argument('--target-pipeline-parallel-size', type=int, - help='Target tensor model parallel size, default to the pipeline parall size ' - 'in the input checkpoint if provided by the loader, otherwise to 1') - group.add_argument("--w-pack", type=bool, - help='True is w_pack weight for llm', - default=False) - - -def save_huggingface_llama(args, model, model_args): - hf2mg_map = {} - for name_param_m in model.named_parameters(): - layer_num = name_param_m[0].split(".")[3] if len( - name_param_m[0].split(".")) > 3 else name_param_m[0].split(".")[1] - nh = model_args.num_attention_heads - ng = ( - model_args.checkpoint_args.num_query_groups - if model_args.checkpoint_args.group_query_attention - else model_args.num_attention_heads - ) - repeats = nh // ng - if name_param_m[0] == "language_model.embedding.word_embeddings.weight": - hf2mg_map["model.embed_tokens.weight"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.post_attention_norm.weight": - hf2mg_map[f"model.layers.{layer_num}.post_attention_layernorm.weight"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.input_norm.weight": - hf2mg_map[f"model.layers.{layer_num}.input_layernorm.weight"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.post_attention_norm.weight": - hf2mg_map[f"model.layers.{layer_num}.post_attention_layernorm.weight"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.query_key_value.weight": - qkv_weight = name_param_m[1].reshape( - ng, - repeats + 2, - name_param_m[1].shape[0] // ng // (repeats + 2), - name_param_m[1].shape[1], - ) - w = qkv_weight.shape[-1] - qw = qkv_weight[:, :repeats, ...].reshape(-1, w) - kw = qkv_weight[:, repeats: repeats + 1, ...].reshape(-1, w) - vw = qkv_weight[:, repeats + 1:, ...].reshape(-1, w) - if args.w_pack: - qkv = torch.cat((qw, kw, vw), dim=0) - hf2mg_map[f"model.layers.{layer_num}.self_attn.W_pack.weight"] = qkv - else: - hf2mg_map[f"model.layers.{layer_num}.self_attn.q_proj.weight"] = qw - hf2mg_map[f"model.layers.{layer_num}.self_attn.k_proj.weight"] = kw - hf2mg_map[f"model.layers.{layer_num}.self_attn.v_proj.weight"] = vw - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.query_key_value.bias": - bias_weight = name_param_m[1].reshape( - ng, repeats + - 2, name_param_m[1].shape[0] // ng // (repeats + 2) - ) - w = bias_weight.shape[-1] - qw = bias_weight[:, :repeats, ...].reshape(-1) - kw = bias_weight[:, repeats: repeats + 1, ...].reshape(-1) - vw = 
bias_weight[:, repeats + 1:, ...].reshape(-1) - hf2mg_map[f"model.layers.{layer_num}.self_attn.q_proj.bias"] = qw - hf2mg_map[f"model.layers.{layer_num}.self_attn.k_proj.bias"] = kw - hf2mg_map[f"model.layers.{layer_num}.self_attn.v_proj.bias"] = vw - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.dense.bias": - hf2mg_map[f"model.layers.{layer_num}.self_attn.dense.bias"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.self_attention.dense.weight": - hf2mg_map[f"model.layers.{layer_num}.self_attn.o_proj.weight"] = name_param_m[1] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_h_to_4h.weight": - proj_read_h_half = name_param_m[1].shape[0] // 2 - hf2mg_map[f"model.layers.{layer_num}.mlp.gate_proj.weight"] = name_param_m[1][:proj_read_h_half, ...] - hf2mg_map[f"model.layers.{layer_num}.mlp.up_proj.weight"] = name_param_m[1][proj_read_h_half:, ...] - continue - if name_param_m[0] == f"language_model.encoder.layers.{layer_num}.mlp.dense_4h_to_h.weight": - hf2mg_map[f"model.layers.{layer_num}.mlp.down_proj.weight"] = name_param_m[1] - continue - if name_param_m[0] == "language_model.encoder.final_norm.weight": - hf2mg_map[f"model.norm.weight"] = name_param_m[1] - continue - if name_param_m[0] == "language_model.output_layer.weight": - hf2mg_map[f"lm_head.weight"] = name_param_m[1] - continue - - gc.collect() - file_format = "pytorch_model-{:05d}-of-{:05d}.bin" - file_list = os.listdir(args.output_model_dir) - output_mg2hg_path = os.path.join(args.output_model_dir, 'mg2hg') - os.makedirs(output_mg2hg_path, exist_ok=True) - for filename in file_list: - if filename.startswith("pytorch_model-") and filename.endswith(".bin"): - try: - start_index = len("pytorch_model-") - end_index = filename.index("-of-") - index1 = int(filename[start_index:end_index]) - index2 = int(filename[end_index + len("-of-"): -len(".bin")]) - if 0 <= index1 <= index2 <= 99999: - formatted_filename = file_format.format(index1, index2) - file_path = os.path.join( - args.output_model_dir, formatted_filename) - hf_model = torch.load( - file_path, map_location=torch.device('cpu')) - for key in hf_model.keys(): - if key in hf2mg_map.keys(): - hf_model[key].data.copy_(hf2mg_map[key]) - output_file_path = os.path.join( - output_mg2hg_path, formatted_filename) - print(f'save weight to {output_file_path}') - torch.save(hf_model, output_file_path) - else: - print("Invalid file format:", filename) - except ValueError: - print("Unable to save file:", filename) - elif (filename.endswith('.json') or filename.endswith('.mode')) and 'safetensors' not in filename: - source_file = os.path.join(args.output_model_dir, filename) - destination_file = os.path.join(output_mg2hg_path, filename) - shutil.copyfile(source_file, destination_file) - - -def save_model_checkpoint(queue, args): - from pretrain_gpt import model_provider - from ascendspeed import megatron_adaptor - from megatron.arguments import validate_args - from modellink.utils import parse_args - from megatron.checkpointing import save_checkpoint - from megatron.global_vars import set_global_variables, get_args - from megatron.core.enums import ModelType - from megatron.tokenizer.tokenizer import _vocab_size_with_padding - from megatron import fused_kernels - from megatron.core import mpu - - def queue_get(name=None): - val = queue.get() - if val == "exit": - raise RuntimeError('Loader exited, exiting saver') - if name is not None and args.checking and 
val["name"] != name: - val_name = val["name"] - raise RuntimeError(f'Unexpected message. Expecting "{name}" but got "{val_name}". Exiting saver.') - if name is not None: - print(f"received {name}") - return val - - def check_message(msg): - if not args.checking: - return - msg_name = msg.pop("name") - if len(msg.keys()) > 0: - print(f"Unexpected values in {msg_name}:") - for key in msg.keys(): - print(f" {key}") - raise RuntimeError(f"Exiting. If you want to ignore this, use the argument --no-checking.") - - md = queue_get() - - if args.target_tensor_parallel_size is None: - if hasattr(md, 'previous_tensor_parallel_size'): - args.target_tensor_parallel_size = md.previous_tensor_parallel_size - else: - print("loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. " - "Default to 1.") - args.target_tensor_parallel_size = 1 - - if args.target_pipeline_parallel_size is None: - if hasattr(md, 'previous_pipeline_parallel_size'): - args.target_pipeline_parallel_size = md.previous_pipeline_parallel_size - else: - print("loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. " - "Default to 1.") - args.target_pipeline_parallel_size = 1 - - # Arguments do sanity checks on the world size, but we don't care, - # so trick it into thinking we are plenty of processes - if args.target_tensor_parallel_size is not None and args.target_pipeline_parallel_size is not None: - os.environ["WORLD_SIZE"] = f'{args.target_tensor_parallel_size * args.target_pipeline_parallel_size}' - - # We want all arguments to come from us - sys.argv = ['script.py', - '--num-layers', str(md.num_layers), - '--hidden-size', str(md.hidden_size), - '--seq-length', str(md.seq_length), - '--num-attention-heads', str(md.num_attention_heads), - '--max-position-embeddings', str(md.max_position_embeddings), - '--position-embedding-type', str(md.position_embedding_type), - '--tokenizer-type', str(md.tokenizer_type), - '--tensor-model-parallel-size', str( - args.target_tensor_parallel_size), - '--pipeline-model-parallel-size', str( - args.target_pipeline_parallel_size), - '--no-masked-softmax-fusion', - '--no-bias-gelu-fusion', - '--no-bias-dropout-fusion', - '--no-async-tensor-model-parallel-allreduce', - '--use-cpu-initialization', - '--micro-batch-size', '1', - '--no-load-optim', - '--no-load-rng', - '--no-save-optim', - '--no-save-rng', - '--no-initialization', - '--save-interval', '1', - '--save', args.output_model_dir, - '--fp16' - ] - - if md.make_vocab_size_divisible_by is not None: - sys.argv.extend(['--make-vocab-size-divisible-by', - str(md.make_vocab_size_divisible_by)]) - if md.output_layer: - sys.argv.append('--untie-embeddings-and-output-weights') - if not md.linear_bias: - sys.argv.append('--disable-bias-linear') - - margs = parse_args() - margs.w_pack = args.w_pack - - if hasattr(md, 'checkpoint_args'): - # These are arguments that we are either changing, or cause problems for validation if they are set - # Note that some of these deal with T5 so will need to be changed if we support T5. 
- args_to_keep = ['tensor_model_parallel_size', 'pipeline_model_parallel_size', 'world_size', 'params_dtype', - 'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size', - 'masked_softmax_fusion', 'bias_gelu_fusion', 'bias_dropout_fusion', - 'sequence_parallel', 'async_tensor_model_parallel_allreduce', - 'no_load_optim', 'no_load_rng', 'no_save_optim', 'no_save_rng', - 'vocab_file', 'tokenizer_model', - 'save_interval', 'save', - 'perform_initialization', 'use_cpu_initialization', - 'recompute_granularity', 'recompute_num_layers', 'recompute_method', - 'encoder_num_layers', 'encoder_seq_length', - 'distribute_saved_activations', - 'train_iters', 'lr_decay_iters', 'lr_warmup_iters', 'lr_warmup_fraction', - 'start_weight_decay', 'end_weight_decay'] - - for arg, value in vars(md.checkpoint_args).items(): - if arg in args_to_keep: - continue - if not hasattr(margs, arg): - print( - f"Checkpoint had argument {arg} but new arguments does not have this.") - continue - if getattr(margs, arg) != value: - print( - f"Overwriting default {arg} value {getattr(margs, arg)} with value from checkpoint {value}.") - setattr(margs, arg, value) - - validate_args(margs) - - set_global_variables(margs, build_tokenizer=False) - - # margs means megatron args - margs = get_args() - - margs.model_type = ModelType.encoder_or_decoder - - if hasattr(md, 'consumed_train_samples'): - margs.consumed_train_samples = md.consumed_train_samples - margs.consumed_valid_samples = md.consumed_valid_samples - print(f"Setting consumed_train_samples to {margs.consumed_train_samples}" - f" and consumed_valid_samples to {margs.consumed_valid_samples}") - else: - print("consumed_train_samples not provided.") - - def get_models(count, dtype, pre_process, post_process): - models = [model_provider(pre_process, post_process).to( - dtype) for _ in range(count)] - return models - - # fake initializing distributed - mpu.set_tensor_model_parallel_world_size(args.target_tensor_parallel_size) - mpu.set_pipeline_model_parallel_world_size( - args.target_pipeline_parallel_size) - mpu.set_tensor_model_parallel_rank(0) - mpu.set_pipeline_model_parallel_rank(0) - # Embeddings - # ----------- - embeddings_msg = queue_get("embeddings") - - pos_embed = None - if md.position_embedding_type == 'learned_absolute': - pos_embed = embeddings_msg.pop("position embeddings") - orig_word_embed = embeddings_msg.pop("word embeddings") - orig_word_embed_n_w, orig_word_embed_n_b = None, None - if "word embeddings norm_w" in embeddings_msg and "word embeddings norm_b" in embeddings_msg: - orig_word_embed_n_w = embeddings_msg.pop("word embeddings norm_w") - orig_word_embed_n_b = embeddings_msg.pop("word embeddings norm_b") - check_message(embeddings_msg) - - # Deal with padding - if md.true_vocab_size is not None: - # figure out what our padded vocab size is - orig_vocab_size = orig_word_embed.shape[0] - margs.padded_vocab_size = _vocab_size_with_padding( - md.true_vocab_size, margs) - - # Cut out extra padding we don't need - if orig_vocab_size > margs.padded_vocab_size: - full_word_embed = orig_word_embed[0:margs.padded_vocab_size, :] - - # Expanding embedding to larger size by replicating final entry - elif orig_vocab_size < margs.padded_vocab_size: - padding_size = margs.padded_vocab_size - orig_vocab_size - - full_word_embed = torch.cat(( - orig_word_embed, - orig_word_embed[-1].unsqueeze(0).expand(padding_size, -1))) - - # Same size! 
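The branch here pads or trims the embedding table to `margs.padded_vocab_size`. A rough standalone restatement of the rule `_vocab_size_with_padding` applies (the canonical helper lives in Megatron; this sketch is only illustrative):

```python
def padded_vocab_size(orig_vocab_size: int,
                      divisible_by: int,
                      tp_size: int) -> int:
    # Round up to a multiple of divisible_by * tp_size so every
    # tensor-parallel rank receives an equal, aligned embedding shard.
    multiple = divisible_by * tp_size
    return ((orig_vocab_size + multiple - 1) // multiple) * multiple

# e.g. a 32000-token vocab, divisible-by 128, 4 TP ranks -> 32256 rows
print(padded_vocab_size(32000, 128, 4))
```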
- else: - full_word_embed = orig_word_embed - else: - print("Original vocab size not specified, leaving embedding table as-is. " - "If you've changed the tensor parallel size this could cause problems.") - margs.padded_vocab_size = orig_word_embed.shape[0] - full_word_embed = orig_word_embed - - # Split into new tensor model parallel sizes - out_word_embed = torch.chunk( - full_word_embed, args.target_tensor_parallel_size, dim=0) - - # Make models for first pipeline stage and fill in embeddings - mpu.set_pipeline_model_parallel_rank(0) - post_process = args.target_pipeline_parallel_size == 1 - models = get_models(args.target_tensor_parallel_size, - md.params_dtype, True, post_process) - for tp_rank, model in enumerate(models): - model.language_model.embedding.word_embeddings.weight.data.copy_( - out_word_embed[tp_rank]) - if orig_word_embed_n_w is not None: - model.language_model.embedding.word_embeddings.norm.weight.data.copy_( - orig_word_embed_n_w) - model.language_model.embedding.word_embeddings.norm.bias.data.copy_( - orig_word_embed_n_b) - if pos_embed is not None: - model.language_model.embedding.position_embeddings.weight.data.copy_( - pos_embed) - else: - if hasattr(model.language_model.embedding, 'position_embeddings'): - raise ValueError("model should have position_embeddings") - - # Transformer layers - # ------------------- - total_layer_num = 0 - for pp_rank in range(args.target_pipeline_parallel_size): - # For later pipeline parallel ranks, make the new models - if pp_rank > 0: - mpu.set_pipeline_model_parallel_rank(pp_rank) - post_process = pp_rank == args.target_pipeline_parallel_size - 1 - models = get_models(args.target_tensor_parallel_size, - md.params_dtype, False, post_process) - - encoder_layer_num = len(models[0].language_model.encoder.layers) - for layer in range(encoder_layer_num): - msg = queue_get(f"transformer layer {total_layer_num}") - - # duplicated tensors - input_norm_weight = msg.pop("input norm weight") - if md.norm_has_bias: - input_norm_bias = msg.pop("input norm bias") - post_norm_weight = msg.pop("post norm weight") - if md.norm_has_bias: - post_norm_bias = msg.pop("post norm bias") - if md.linear_bias: - dense_bias = msg.pop("dense bias") - mlp_l1_bias = msg.pop("mlp l1 bias") - - if args.add_qkv_bias: - qkv_bias = torch.chunk( - msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) - if args.add_dense_bias: - dense_bias = msg.pop("dense bias") - - qkv_org = msg.pop("qkv weight") - qkv_weight = torch.chunk( - qkv_org, args.target_tensor_parallel_size, dim=0) - - # Split up the parallel tensors - dense_weight = torch.chunk( - msg.pop("dense weight"), args.target_tensor_parallel_size, dim=1) - mlp_l1_weight = torch.chunk( - msg.pop("mlp l1 weight"), args.target_tensor_parallel_size, dim=1) - - # Special handling for swiglu - if md.swiglu: - mlp_l0_weight_W = torch.chunk( - msg.pop("mlp l0 weight W"), args.target_tensor_parallel_size, dim=0) - mlp_l0_weight_V = torch.chunk( - msg.pop("mlp l0 weight V"), args.target_tensor_parallel_size, dim=0) - mlp_l0_weight = [] - for weights in zip(mlp_l0_weight_W, mlp_l0_weight_V): - mlp_l0_weight.append(torch.cat(weights, dim=0)) - else: - mlp_l0_weight = torch.chunk( - msg.pop("mlp l0 weight"), args.target_tensor_parallel_size, dim=0) - - if md.linear_bias: - qkv_bias = torch.chunk( - msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) - if md.swiglu: - mlp_l0_bias_W = torch.chunk( - msg.pop("mlp l0 bias W"), args.target_tensor_parallel_size, dim=0) - mlp_l0_bias_V = torch.chunk( - msg.pop("mlp 
l0 bias V"), args.target_tensor_parallel_size, dim=0) - mlp_l0_weight = [] - for weights in zip(mlp_l0_weight_W, mlp_l0_weight_V): - mlp_l0_weight.append(torch.cat(weights, dim=0)) - else: - mlp_l0_bias = torch.chunk( - msg.pop("mlp l0 bias"), args.target_tensor_parallel_size, dim=0) - - # Save them to the model - for tp_rank in range(args.target_tensor_parallel_size): - layer_encoder = models[tp_rank].language_model.encoder.layers[layer] - layer_encoder.input_norm.weight.data.copy_(input_norm_weight) - if md.norm_has_bias: - layer_encoder.input_norm.bias.data.copy_(input_norm_bias) - layer_encoder.self_attention.query_key_value.weight.data.copy_( - qkv_weight[tp_rank]) - layer_encoder.self_attention.dense.weight.data.copy_(dense_weight[tp_rank]) - layer_encoder.post_attention_norm.weight.data.copy_(post_norm_weight) - if md.norm_has_bias: - layer_encoder.post_attention_norm.bias.data.copy_(post_norm_bias) - layer_encoder.mlp.dense_h_to_4h.weight.data.copy_(mlp_l0_weight[tp_rank]) - layer_encoder.mlp.dense_4h_to_h.weight.data.copy_(mlp_l1_weight[tp_rank]) - if md.linear_bias: - layer_encoder.self_attention.query_key_value.bias.data.copy_( - qkv_bias[tp_rank]) - layer_encoder.self_attention.dense.bias.data.copy_(dense_bias) - layer_encoder.mlp.dense_h_to_4h.bias.data.copy_(mlp_l0_bias[tp_rank]) - layer_encoder.mlp.dense_4h_to_h.bias.data.copy_(mlp_l1_bias) - if args.add_qkv_bias: - layer_encoder.self_attention.query_key_value.bias.data.copy_( - qkv_bias[tp_rank]) - if args.add_dense_bias: - layer_encoder.self_attention.dense.bias.data.copy_(dense_bias) - - total_layer_num = total_layer_num + 1 - check_message(msg) - - if post_process: - msg = queue_get("final norm") - final_norm_weight = msg.pop("weight") - if md.norm_has_bias: - final_norm_bias = msg.pop("bias") - for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].language_model.encoder.final_norm.weight.data.copy_( - final_norm_weight) - if md.norm_has_bias: - models[tp_rank].language_model.encoder.final_norm.bias.data.copy_( - final_norm_bias) - if pp_rank != 0 and not md.output_layer: - # Copy word embeddings to final pipeline rank - models[tp_rank].word_embeddings.weight.data.copy_( - out_word_embed[tp_rank]) - del final_norm_weight - if md.norm_has_bias: - del final_norm_bias - check_message(msg) - - if md.output_layer: - msg = queue_get("output layer") - if not hasattr(models[0].language_model, 'output_layer'): - raise AttributeError( - "ERROR: got an output layer, but model does not have one") - output_layer_weight = torch.chunk( - msg.pop("weight"), args.target_tensor_parallel_size, dim=0) - for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].language_model.output_layer.weight.data.copy_( - output_layer_weight[tp_rank]) - del output_layer_weight - check_message(msg) - - msg = queue_get() - if msg != "done" and msg["name"] == "pooler": - if not hasattr(models[0].language_model, 'pooler'): - raise AttributeError( - "ERROR: got a pooler, but model does not have one") - print("received pooler") - pooler_weight = msg.pop("weight") - pooler_bias = msg.pop("bias") - for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].language_model.pooler.dense.weight.data.copy_( - pooler_weight) - models[tp_rank].language_model.pooler.dense.bias.data.copy_( - pooler_bias) - del pooler_weight - del pooler_bias - check_message(msg) - msg = queue_get() - - if msg != "done" and msg["name"] == "lm head": - if not hasattr(models[0], 'lm_head'): - raise RuntimeError("ERROR: got an lm head, but 
model does not have one") - print("received lm head") - lm_head_dense_weight = msg.pop("dense weight") - lm_head_dense_bias = msg.pop("dense bias") - lm_head_norm_weight = msg.pop("norm weight") - if md.norm_has_bias: - lm_head_norm_bias = msg.pop("norm bias") - for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].lm_head.dense.weight.data.copy_( - lm_head_dense_weight) - models[tp_rank].lm_head.dense.bias.data.copy_( - lm_head_dense_bias) - models[tp_rank].lm_head.norm.weight.data.copy_( - lm_head_norm_weight) - if md.norm_has_bias: - models[tp_rank].lm_head.norm.bias.data.copy_( - lm_head_norm_bias) - check_message(msg) - msg = queue_get() - - if msg != "done" and msg["name"] == "binary head": - if not hasattr(models[0], 'binary_head'): - raise RuntimeError("ERROR: got a binary head, but model does not have one") - print("received binary head") - binary_head_weight = msg.pop("weight") - binary_head_bias = msg.pop("bias") - for tp_rank in range(args.target_tensor_parallel_size): - models[tp_rank].binary_head.weight.data.copy_( - binary_head_weight) - models[tp_rank].binary_head.bias.data.copy_( - binary_head_bias) - check_message(msg) - msg = queue_get() - - if msg != "done": - print("ERROR: got some more data but was expecting to be done") - - for tp_rank in range(args.target_tensor_parallel_size): - mpu.set_tensor_model_parallel_rank(tp_rank) - save_huggingface_llama(args, models[tp_rank], md) diff --git a/debug/weight_convert/load_utils.py b/debug/weight_convert/load_utils.py deleted file mode 100644 index a041d61cff122cc0a7218f80d19ff81c4512f7b7..0000000000000000000000000000000000000000 --- a/debug/weight_convert/load_utils.py +++ /dev/null @@ -1,371 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the License); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import sys -import types -import argparse -import importlib -import torch -import torch.multiprocessing as mp - - -def add_arguments(parser): - group = parser.add_argument_group(title='Megatron loader') - - group.add_argument('--true-vocab-size', type=int, default=None, - help='original size of vocab, if specified will trim padding from embedding table.') - group.add_argument('--vocab-file', type=str, default=None, - help='Path to the vocab file. 
If specified will use this to get vocab size and ' - 'trim padding from the embedding table.') - parser.add_argument('--add-qkv-bias', action='store_true', - help='Add bias for attention qkv', default=False, - ) - parser.add_argument('--add-dense-bias', action='store_true', - help='Add bias for attention dense', default=False, - ) - parser.add_argument('--embed-layernorm', action='store_true', - help='Add embed layernorm for word embedding', default=False, - ) - parser.add_argument('--params-dtype', type=str, - help='Set weight dtype', default='fp16', - ) - - -def _load_checkpoint(queue, args): - - # Search in directory above this - sys.path.append(os.path.abspath( - os.path.join(os.path.dirname(__file__), - os.path.pardir))) - try: - from pretrain_gpt import model_provider - from ascendspeed import megatron_adaptor - from megatron.arguments import validate_args - from modellink.utils import parse_args - from megatron.global_vars import set_args, set_global_variables - from megatron.checkpointing import load_args_from_checkpoint - from megatron.checkpointing import load_checkpoint as load_checkpoint_mg - from megatron.model import module - from megatron.core import mpu - from megatron.core.enums import ModelType - except ModuleNotFoundError: - print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") - queue.put("exit") - if args.input_model_dir: - print(f">>>{args.input_model_dir}") - else: - print("NO") - # We want all arguments to come from us - sys.argv = ['script.py', - '--no-masked-softmax-fusion', - '--no-bias-gelu-fusion', - '--no-bias-dropout-fusion', - '--no-async-tensor-model-parallel-allreduce', - '--use-cpu-initialization', - '--micro-batch-size', '1', - '--no-load-optim', - '--no-load-rng', - '--no-save-optim', - '--no-save-rng', - '--no-initialization', - '--load', args.input_model_dir - ] - margs = parse_args() - margs.embed_layernorm = args.embed_layernorm - margs, checkpoint_args = load_args_from_checkpoint(margs) - margs.add_qkv_bias = args.add_qkv_bias - margs.add_dense_bias = args.add_dense_bias - margs.fp16 = True - if args.add_dense_bias: - margs.skip_bias_add = False - - # Arguments do sanity checks on the world size, but we don't care, - # so trick it into thinking we are plenty of processes - margs.world_size = margs.tensor_model_parallel_size * \ - margs.pipeline_model_parallel_size - - margs = validate_args(margs) - - def check_for_arg(arg_name, default=None): - if getattr(margs, arg_name, None) is None: - if default is not None: - setattr(margs, arg_name, default) - else: - print( - f"Checkpoint does not specify the argument {arg_name}. 
Exiting.") - print(f"Arguments: {margs}") - queue.put("exit") - - check_for_arg('tensor_model_parallel_size') - check_for_arg('pipeline_model_parallel_size') - check_for_arg('num_layers') - check_for_arg('hidden_size') - check_for_arg('seq_length') - check_for_arg('num_attention_heads') - check_for_arg('max_position_embeddings') - check_for_arg('position_embedding_type') - check_for_arg('tokenizer_type') - check_for_arg('iteration') - check_for_arg('bert_binary_head') - check_for_arg('disable_bias_linear', False) - check_for_arg('params_dtype') - check_for_arg('swiglu', False) - - margs.model_type = ModelType.encoder_or_decoder - # supress warning about torch.distributed not being initialized - module.MegatronModule.embedding_warning_printed = True - - consumed_train_samples = None - consumed_valid_samples = None - - def get_models(count, dtype): - nonlocal consumed_train_samples - nonlocal consumed_valid_samples - model_array_len = margs.virtual_pipeline_model_parallel_size - if model_array_len is None: - model_array_len = 1 - models = [[] for _ in range(model_array_len)] - pre_process = mpu.is_pipeline_first_stage() - post_process = mpu.is_pipeline_last_stage() - for rank in range(count): - mpu.set_tensor_model_parallel_rank(rank) - if margs.virtual_pipeline_model_parallel_size is not None: - model_ = [] - for i in range(margs.virtual_pipeline_model_parallel_size): - mpu.set_virtual_pipeline_model_parallel_rank(i) - # Set pre_process and post_process only after virtual rank is set. - pre_process = mpu.is_pipeline_first_stage() - post_process = mpu.is_pipeline_last_stage() - this_model = model_provider( - pre_process=pre_process, - post_process=post_process - ).to(dtype) - model_.append(this_model) - else: - pre_process = mpu.is_pipeline_first_stage() - post_process = mpu.is_pipeline_last_stage() - model_rank = 0 - model_ = [model_provider(pre_process, post_process).to(dtype)] - margs.consumed_train_samples = 0 - margs.consumed_valid_samples = 0 - load_checkpoint_mg(model_, None, None) - - if consumed_train_samples is not None: - if margs.consumed_train_samples != consumed_train_samples: - return None - else: - consumed_train_samples = margs.consumed_train_samples - if consumed_valid_samples is not None: - if margs.consumed_valid_samples != consumed_valid_samples: - return None - else: - consumed_valid_samples = margs.consumed_valid_samples - for vp_rank in range(model_array_len): - models[vp_rank].append(model_[vp_rank]) - return models - - set_global_variables(margs, build_tokenizer=False) - mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) - mpu.set_pipeline_model_parallel_world_size( - margs.pipeline_model_parallel_size) - mpu.set_virtual_pipeline_model_parallel_world_size( - margs.virtual_pipeline_model_parallel_size) - - # Get true (non-padded) vocab size - if args.true_vocab_size is not None: - true_vocab_size = args.true_vocab_size - elif args.vocab_file is not None: - vb_file = open(args.vocab_file) - vocab = json.load(vb_file) - true_vocab_size = len(vocab) - if args.true_vocab_size is not None and true_vocab_size != args.true_vocab_size: - print( - "Both --true-vocab-size and --vocab-file specified and the vocab size does not match, aborting.") - queue.put("exit") - vb_file.close() - else: - true_vocab_size = None - - # short aliases - tp_size = margs.tensor_model_parallel_size - pp_size = margs.pipeline_model_parallel_size - vp_size = margs.virtual_pipeline_model_parallel_size - if vp_size is None: - vp_size = 1 - - # Layernorm has bias; RMSNorm does 
not. - if hasattr(checkpoint_args, 'normalization'): - norm_has_bias = checkpoint_args.normalization == "LayerNorm" - else: - # older models only supported LayerNorm - norm_has_bias = True - - # metadata - md = types.SimpleNamespace() - md.model_type = 'GPT' - md.num_layers = margs.num_layers - md.hidden_size = margs.hidden_size - md.seq_length = margs.seq_length - md.num_attention_heads = margs.num_attention_heads - md.max_position_embeddings = margs.max_position_embeddings - md.tokenizer_type = margs.tokenizer_type - md.iteration = margs.iteration - md.params_dtype = margs.params_dtype - md.bert_binary_head = margs.bert_binary_head - md.output_layer = margs.untie_embeddings_and_output_weights - md.position_embedding_type = margs.position_embedding_type - md.linear_bias = margs.add_bias_linear - md.norm_has_bias = norm_has_bias - md.swiglu = margs.swiglu - md.previous_tensor_parallel_size = margs.tensor_model_parallel_size - md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size - md.true_vocab_size = true_vocab_size - md.make_vocab_size_divisible_by = margs.make_vocab_size_divisible_by - md.checkpoint_args = checkpoint_args - md.embed_layernorm = margs.embed_layernorm - - # Get first pipe stage - mpu.set_pipeline_model_parallel_rank(0) - all_models = [get_models(tp_size, md.params_dtype)] - models = all_models[0][0] - - md.consumed_train_samples = consumed_train_samples - md.consumed_valid_samples = consumed_valid_samples - queue.put(md) - - def queue_put(name, msg): - print(f"sending {name}") - msg["name"] = name - queue.put(msg) - - # Send embeddings - message_word_embedding = [] - for tp_rank in range(tp_size): - message_word_embedding.append(models[tp_rank].language_model.embedding.word_embeddings.weight.data) - message = {"word embeddings": torch.cat(message_word_embedding, dim=0)} - if md.position_embedding_type == 'learned_absolute': - message["position embeddings"] = models[0].language_model.embedding.position_embeddings.weight.data - if md.embed_layernorm: - message["word embeddings norm_w"] = models[0].language_model.embedding.word_embeddings.norm.weight.data - message["word embeddings norm_b"] = models[0].language_model.embedding.word_embeddings.norm.bias.data - queue_put("embeddings", message) - - total_layer_num = 0 - for vp_rank in range(vp_size): - mpu.set_virtual_pipeline_model_parallel_rank(vp_rank) - for pp_rank in range(pp_size): - if pp_rank > 0: - mpu.set_pipeline_model_parallel_rank(pp_rank) - if vp_rank == 0: - all_models.append(get_models(tp_size, md.params_dtype)) - models = all_models[pp_rank][vp_rank] - for layer_num, _ in enumerate(models[0].language_model.encoder.layers): - message = {} - - # Get non-parallel tensors from tp_rank 0 - layer = models[0].language_model.encoder.layers[layer_num] - message["input norm weight"] = layer.input_norm.weight.data - if norm_has_bias: - message["input norm bias"] = layer.input_norm.bias.data - message["post norm weight"] = layer.post_attention_norm.weight.data - if norm_has_bias: - message["post norm bias"] = layer.post_attention_norm.bias.data - if md.linear_bias: - message["dense bias"] = layer.self_attention.dense.bias.data - message["mlp l1 bias"] = layer.mlp.dense_4h_to_h.bias.data - if args.add_dense_bias: - message["dense bias"] = layer.self_attention.dense.bias.data - - # Grab all parallel tensors for this layer - qkv_weight = [] - qkv_bias = [] - dense_weight = [] - mlp_l0_weight = [] - mlp_l0_bias = [] - mlp_l1_weight = [] - for tp_rank, model in enumerate(models): - layer = 
model.language_model.encoder.layers[layer_num] - qkv_weight.append( - layer.self_attention.query_key_value.weight.data) - dense_weight.append(layer.self_attention.dense.weight.data) - mlp_l0_weight.append(layer.mlp.dense_h_to_4h.weight.data) - mlp_l1_weight.append(layer.mlp.dense_4h_to_h.weight.data) - if md.linear_bias: - qkv_bias.append( - layer.self_attention.query_key_value.bias.data) - mlp_l0_bias.append(layer.mlp.dense_h_to_4h.bias.data) - if args.add_qkv_bias: - qkv_bias.append( - layer.self_attention.query_key_value.bias.data) - - # Handle gated linear units - if md.swiglu: - # concat all the first halves ('W's) and all the second halves ('V's) - for tp_rank in range(tp_size): - mlp_l0_weight[tp_rank] = torch.chunk( - mlp_l0_weight[tp_rank], 2, dim=0) - message["mlp l0 weight W"] = torch.cat( - [w[0] for w in mlp_l0_weight], dim=0) - message["mlp l0 weight V"] = torch.cat( - [w[1] for w in mlp_l0_weight], dim=0) - else: - message["mlp l0 weight"] = torch.cat(mlp_l0_weight, dim=0) - - # simple concat of the rest - message["qkv weight"] = torch.cat(qkv_weight, dim=0) - message["dense weight"] = torch.cat(dense_weight, dim=1) - message["mlp l1 weight"] = torch.cat(mlp_l1_weight, dim=1) - if md.linear_bias: - message["qkv bias"] = torch.cat(qkv_bias, dim=0) - if md.swiglu: - for tp_rank in range(tp_size): - mlp_l0_bias[tp_rank] = torch.chunk( - mlp_l0_bias[tp_rank], 2, dim=0) - message["mlp l0 bias W"] = torch.cat( - [b[0] for b in mlp_l0_bias], dim=0) - message["mlp l0 bias V"] = torch.cat( - [b[1] for b in mlp_l0_bias], dim=0) - else: - message["mlp l0 bias"] = torch.cat(mlp_l0_bias, dim=0) - if args.add_qkv_bias: - message["qkv bias"] = torch.cat(qkv_bias, dim=0) - - queue_put(f"transformer layer {total_layer_num}", message) - - total_layer_num = total_layer_num + 1 - - # Send final norm from tp_rank 0 - message = { - "weight": models[0].language_model.encoder.final_norm.weight.data, - } - if norm_has_bias: - message["bias"] = models[0].language_model.encoder.final_norm.bias.data - queue_put("final norm", message) - - if md.output_layer: - message_weight = [] - for tp_rank in range(tp_size): - message_weight.append(models[tp_rank].language_model.output_layer.weight.data) - message = {"weight": torch.cat(message_weight, dim=0)} - queue_put("output layer", message) - - queue.put("done") - - -def load_checkpoint(queue, args): - try: - _load_checkpoint(queue, args) - except: - queue.put("exit") - raise diff --git a/plugins/mindstudio-vscode-plugins/ReadMe.md b/plugins/mindstudio-vscode-plugins/ReadMe.md new file mode 100644 index 0000000000000000000000000000000000000000..37e928d9d99da4140c41a6e78708a8f28963668c --- /dev/null +++ b/plugins/mindstudio-vscode-plugins/ReadMe.md @@ -0,0 +1,95 @@ +# MindStudio Operator Debug VSCode Plugin + +### 介绍 +MindStudio Operator Debug VSCode Plugin插件基于MindStudio Debugger(msdebug调试器)提供的底层调试昇腾算子能力,支持远程调试C/C++与昇腾算子程序。 +MindStudio Operator Debug VSCode Plugin插件[源码仓](https://gitee.com/ascend/aot)和[下载链接](https://ascend-package.obs.cn-north-4.myhuaweicloud.com:443/mindstudio-operator-tools/MindStudio-Operator-Debug-VSCode-Plugin-0.0.1.vsix) + +### 特性 +* 断点调试(设置/删除/禁用/启用断点) +* 单步执行调试(逐行执行/内部执行/跳出函数)/继续/暂停/重启 +* 查看变量/寄存器/堆栈/断点信息,监视器支持执行表达式 +* 内存查询 +* 核切换 + +### 规格要求 + +* VSCode (1.88版本及以上) +* VSCode已安装远程登录插件和Hex Editor插件 +* 支持平台: Linux +* 调试昇腾算子程序时,Linux环境已安装CANN工具包(含算子调试器) + +### 约束 +* 参考CANN工具包的约束要求 + +### 功能 + +#### 快速使用 + + +1. 打开VSCode IDE界面,安装远程登录插件(如Remote-SSH) +2. IDE界面远程SSH登录算子开发环境,打开已编译的算子工程文件 +3. 
安装[Hex Editor](https://marketplace.visualstudio.com/items?itemName=ms-vscode.hexeditor)插件和[MindStudio Operator Debug VSCode Plugin](https://ascend-package.obs.cn-north-4.myhuaweicloud.com:443/mindstudio-operator-tools/MindStudio-Operator-Debug-VSCode-Plugin-0.0.1.vsix)插件 + +* IDE界面插件市场界面离线本地安装 +``` + - 插件.vsix文件上传linux环境 + - 展开IDE界面左侧边栏的插件菜单 + - 点击右上角...图标,选择Install from VSIX...选项,选择本地的插件文件进行安装 +``` + +* IDE界面终端命令行离线本地安装 +``` + - 插件.vsix文件上传linux环境 + - IDE界面点开终端命令行 + - 输入命令安装插件文件: code --install-extension /xx_dir/MindStudio-Operator-Debug-VSCode-Plugin-0.0.1.vsix +``` +4. 点开debug侧边工具栏,若未生成launch.json文件,则根据提示点击生成launch.json文件(弹窗中调试器选择MSDebug) +5. 在`.vscode/launch.json`中添加调试配置(格式参考下面推荐配置) +6. debug侧边工具栏中,选择已配置的调试器,点击`debug`或按`F5`启动调试 + +- **launch方式推荐配置** +``` +{ + "configurations": [ + { + "name": "xx-debug", + "type": "msdebug-mi", + "request": "launch", + "cwd": "${workspaceFolder}", + "target": "${workspaceFolder}/xx_operator", + "msdebugMiPath": "/xx/x/msdebug-mi", + "environmentScripts": [ + "/xxx/xx/Ascend/ascend-toolkit/set_env.sh" + ], + "environment": [{ + "name": "LD_LIBRARY_PATH", + "value": "/xx/x/lib:${LD_LIBRARY_PATH}" + }] + } + ] +} +``` +* `name` 调试插件的名称 +* `type` 固定值`"msdebug-mi"` +* `request` 固定值`"launch"` +* `cwd` 调试器启动的工作目录 +* `target` (必要配置项)被调试的算子可执行文件的路径 +* `msdebugMiPath` 调试器的路径 +* `environmentScripts` 包含设置调试器的环境变量的脚本数组(脚本每行形如:`"export XX_KEY=XX_VALUE"`),依次加载数组中各脚本 +* `environment` 包含自定义环境变量的对象数组,在加载完`environmentScripts`脚本中的环境变量后依次加载`environment`中的环境变量,格式需满足:`[{"name": "xxx", "value":"yyy"}]` + +#### 断点调试(设置/删除/禁用/启用断点) +* 可以在算子程序行号显示处设置、删除、禁用、启用断点,也可以在debug侧边工具栏中的底部断点工具栏执行相同操作。 + +#### 单步执行调试(逐行执行/内部执行/跳出函数)/继续/暂停/重启 +* 可以单击顶部调试工具栏上的图标控制程序,包括单步执行、步入、步出、继续、暂停、重启或停止程序的操作。 + +#### 查看变量/寄存器/堆栈/断点信息,监视器支持执行表达式 +* 启动调试后,程序会停止在断点,当进程处于停止状态时,可以在IDE界面左侧查看当前线程的变量、堆栈、监视器和断点信息。其中监视器支持执行表达式。 + +#### 内存查询 +* 查看变量的内存需要预先安装`Hex Editor`插件 +* 调试停在断点时,光标移到变量右侧显示`0110`的内存按钮,点击按钮弹窗查看变量地址对应的内存值 + +#### 核切换 +* 调试算子程序Kernel侧代码时,IDE右下角显示当前调试占用NPU卡的核ID,点击该按钮(形如:`kernel:aiv 10`),弹窗显示**所有可用核ID**,根据提示将**待切核ID**输入弹窗并回车进行切核,输入格式例如`aiv 15` diff --git a/plugins/tensorboard-plugins/ OWNERS b/plugins/tensorboard-plugins/ OWNERS new file mode 100644 index 0000000000000000000000000000000000000000..34c383beaf138da92df0991b472135496450a827 --- /dev/null +++ b/plugins/tensorboard-plugins/ OWNERS @@ -0,0 +1,9 @@ +options: + no_parent_owners: true +approvers: +- wo-wenjie +- ly-qianxiao +reviewers: +- wo-wenjie +- ly-qianxiao +- leo920320 diff --git a/plugins/tensorboard-plugins/tb_plugin/torch_tb_profiler/profiler/op_agg.py b/plugins/tensorboard-plugins/tb_plugin/torch_tb_profiler/profiler/op_agg.py index f1fc6117befb1e9522603ca07007c47e9b993104..08a3f0d7061dc332a78ec97a6ff085bf1840a47d 100644 --- a/plugins/tensorboard-plugins/tb_plugin/torch_tb_profiler/profiler/op_agg.py +++ b/plugins/tensorboard-plugins/tb_plugin/torch_tb_profiler/profiler/op_agg.py @@ -145,11 +145,15 @@ class ModuleAggregator: stack_lists_group_by_name: Dict[str, List[OperatorAgg]] = defaultdict(list) stack_lists_group_by_name_input: Dict[str, List[OperatorAgg]] = defaultdict(list) for agg in agg_result[2].values(): - assert (len(agg.callstacks) == 1) + if len(agg.callstacks) != 1: + logger.error(f'Incorrect length of callstacks, expected 1 but got {len(agg.callstacks)}') + break if list(agg.callstacks)[0]: stack_lists_group_by_name[agg.name].append(agg) for agg in agg_result[3].values(): - assert (len(agg.callstacks) == 1) + if len(agg.callstacks) != 1: + logger.error(f'Incorrect length of callstacks, expected 1 but got 
{len(agg.callstacks)}') + break if list(agg.callstacks)[0]: key = agg.name + '###' + str(agg.input_shape) stack_lists_group_by_name_input[key].append(agg) diff --git a/profiler/MANIFEST.in b/profiler/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..0550da458f399209a4002b47706e5d741c990af3 --- /dev/null +++ b/profiler/MANIFEST.in @@ -0,0 +1,7 @@ +recursive-include profiler/advisor/ * +recursive-include profiler/cli/ * +recursive-include profiler/prof_common/ * +recursive-include profiler/compare_tools/ * +recursive-include profiler/cluster_analyse/ * +global-exclude */__pycache__/* +global-exclude *.pyc diff --git a/profiler/OWNERS b/profiler/OWNERS new file mode 100644 index 0000000000000000000000000000000000000000..0c09fb8ce494186cdbc03cdc136edc1452ae1de0 --- /dev/null +++ b/profiler/OWNERS @@ -0,0 +1,11 @@ +options: + no_parent_owners: true +approvers: +- xhahn +- aerfaliang +- chenhao_1209 +- feng123www +reviewers: +- sunboquan +- stby +- Seanesmhxocism diff --git a/profiler/README.md b/profiler/README.md index e010c2c3cf81246795d620aedf82ac8475c6a481..dff28625719f0e720f00499361aa2e8ca432aba8 100644 --- a/profiler/README.md +++ b/profiler/README.md @@ -1,14 +1,14 @@ # 性能工具 -ATT工具针对训练&大模型场景,提供端到端调优工具:用户采集到性能数据后,由ATT工具提供统计、分析以及相关的调优建议。 +ATT工具针对训练&大模型场景,提供端到端性能调优工具:用户采集到性能数据后,由ATT性能工具提供统计、分析以及相关的调优建议。 -### NPU Profiling数据采集 +## NPU性能数据采集 -目前ATT工具主要支持Ascend PyTorch Profiler接口的性能数据采集,请参考官方文档:[Ascend PyTorch Profiler数据采集与分析](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/modeldevpt/ptmigr/AImpug_0067.html)。 +目前ATT工具主要支持Ascend PyTorch Profiler接口的性能数据采集,请参考官方文档:[Ascend PyTorch Profiler数据采集与分析](https://www.hiascend.com/document/detail/zh/canncommercial/80RC1/devaids/auxiliarydevtool/atlasprofiling_16_0006.html)。 -Ascend PyTorch Profiler接口支持AscendPyTorch 5.0.RC2或更高版本,支持的PyThon和CANN软件版本配套关系请参见《CANN软件安装指南》中的“[安装PyTorch](https://www.hiascend.com/document/detail/zh/canncommercial/63RC2/envdeployment/instg/instg_000041.html)”。 +Ascend PyTorch Profiler接口支持AscendPyTorch 1.11.0或更高版本,支持的PyThon和CANN软件版本配套关系请参见“[安装PyTorch框架](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/configandinstg/instg/insg_0006.html)”。 -#### 采集方式一:通过with语句进行采集 +### 采集方式一:通过with语句进行采集 ```python import torch_npu @@ -35,7 +35,7 @@ with torch_npu.profiler.profile( prof.step() ``` -#### 采集方式二:start,stop方式进行采集 +### 采集方式二:start,stop方式进行采集 ```python import torch_npu @@ -64,7 +64,7 @@ for epoch, data in enumerate(dataloader): prof.stop() ``` -#### NPU性能数据目录结构 +### NPU性能数据目录结构 ascend pytorch profiler数据目录结构如下: @@ -79,7 +79,99 @@ ascend pytorch profiler数据目录结构如下: |- * _ascend_pt ``` -### 子功能介绍 +## 工具安装 + +性能工具的安装方式包括:**下载whl包安装**和**源代码编译安装**。 + +#### 下载whl包安装 + +1. 
whl包获取。 + + 请通过下表链接下载profiler工具whl包。 + + | profiler版本 | 发布日期 | 下载链接 | 校验码 | + | ------------ | ---------- | ------------------------------------------------------------ | ------------------------------------------------------------ | + | 1.1.1 | 2024-06-20 | [msprof_analyze-1.1.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.1.1/msprof_analyze-1.1.1-py3-none-any.whl) | 76aad967a3823151421153d368d4d2f8e5cfbcb356033575e0b8ec5acea8e5e4 | + | 1.1.0 | 2024-05-28 | [msprof_analyze-1.1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.1.0/msprof_analyze-1.1.0-py3-none-any.whl) | b339f70e7d1e45e81f289332ca64990a744d0e7ce6fdd84a8d82e814fa400698 | + | 1.0 | 2024-05-10 | [msprof_analyze-1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/profiler/package/1.0/msprof_analyze-1.0-py3-none-any.whl) | 95b2f41c8c8e8afe4887b738c8cababcb4f412e1874483b6adae4a025fcbb7d4 | + + + +2. whl包校验。 + + 1. 根据以上下载链接下载whl包到Linux安装环境。 + + 2. 进入whl包所在目录,执行如下命令。 + + ``` + sha256sum {name}.whl + ``` + + {name}为whl包名称。 + + 若回显呈现对应版本whl包一致的**校验码**,则表示下载了正确的性能工具whl安装包。示例如下: + + ``` + sha256sum msprof_analyze-1.0-py3-none-any.whl + xx *msprof_analyze-1.0-py3-none-any.whl + ``` + +3. whl包安装。 + + 执行如下命令进行安装。 + + ``` + pip3 install ./msprof_analyze-{version}-py3-none-any.whl + ``` + + 若为覆盖安装,请在命令行末尾增加“--force-reinstall”参数强制安装,例如: + + ``` + pip3 install ./msprof_analyze-{version}-py3-none-any.whl --force-reinstall + ``` + + 提示如下信息则表示安装成功。 + + ``` + Successfully installed msprof_analyze-{version} + ``` + +#### 源代码编译安装 + +1. 安装依赖。 + + 编译前需要安装wheel。 + + ```bash + pip3 install wheel + ``` + +2. 下载源码。 + + ```bash + git clone https://gitee.com/ascend/att.git + ``` + +3. 编译whl包。 + + ```bash + cd att/profiler + python3 setup.py bdist_wheel + ``` + + 以上命令执行完成后在att/profiler/dist目录下生成性能工具whl安装包`msprof_analyze-{version}-py3-none-any.whl`。 + +4. 安装。 + + 执行如下命令进行性能工具安装。 + + ```bash + cd dist + pip3 install ./msprof_analyze-{version}-py3-none-any.whl --force-reinstall + ``` + +## 工具使用 + | 工具名称 | 说明 | | ------------------------------------------------------------ | ------------------------------------------------------------ | | [compare_tools(性能比对工具)](https://gitee.com/ascend/att/tree/master/profiler/compare_tools) | 提供NPU与GPU性能拆解功能以及算子、通信、内存性能的比对功能。 | diff --git a/profiler/advisor/README.md b/profiler/advisor/README.md index 722243cdc263433431ead52ef67a7257f6dfeb22..283aa2943881262ffbefaeb7025cf301c17b18fa 100644 --- a/profiler/advisor/README.md +++ b/profiler/advisor/README.md @@ -1,16 +1,55 @@ -# 性能分析工具 +# advisor -性能分析工具是将Ascend PyTorch Profiler采集的性能数据进行分析,并输出性能调优建议的工具 。使用方式如下: +msprof-analyze的advisor功能是将Ascend PyTorch Profiler或者msprof采集的PyTorch场景性能数据进行分析,并输出性能调优建议(当前暂不支持对db格式文件分析)。 + +## 工具使用(命令行方式) + +1. 参见《[性能工具](../README.md)》完成工具安装。建议安装最新版本。 + +2. 执行分析。 + + - 总体性能瓶颈 + + ```bash + msprof-analyze advisor all -d [待分析性能数据文件所在路径] -bp [基准性能数据文件所在路径] + ``` + + - 计算瓶颈 + + ```bash + msprof-analyze advisor computation -d [待分析性能数据文件所在路径] + ``` + + - 调度瓶颈 + + ```bash + msprof-analyze advisor schedule -d [待分析性能数据文件所在路径] + ``` + + + -d(必选):待分析性能数据文件所在路径。 + + -bp(可选):基准性能数据文件所在路径。 + + 单卡场景需要指定到性能数据文件`*_ascend_pt`目录;多卡或集群场景需要指定到`*_ascend_pt`目录的父目录层级。 + +3. 查看结果。 + + 分析结果打屏展示并生成html和csv文件。 + +## 工具使用(Jupyter Notebook方式) + +Jupyter Notebook使用方式如下: 下列以Windows环境下执行为例介绍。 -1. 在环境下安装jupyter notebook工具。 +1. 在环境下安装Jupyter Notebook工具。 ```bash pip install jupyter notebook ``` - jupyter notebook工具的具体安装和使用指导请至jupyter notebook工具官网查找。 + Jupyter Notebook工具的具体安装和使用指导请至Jupyter Notebook工具官网查找。 2. 
在环境下安装ATT工具。 @@ -20,7 +59,7 @@ 安装环境下保存Ascend PyTorch Profiler采集的性能数据。 -3. 进入att\profiler\advisor目录执行如下命令启动jupyter notebook工具。 +3. 进入att\profiler\advisor目录执行如下命令启动Jupyter Notebook工具。 ```bash jupyter notebook @@ -28,13 +67,13 @@ 执行成功则自动启动浏览器读取att\profiler\advisor目录,如下示例: - ![jupyter_report](img/jupyter_report.PNG) + ![jupyter_report](./img/jupyter_report.PNG) - 若在Linux环境下则回显打印URL地址,即是打开jupyter notebook工具页面的地址,需要复制URL,并使用浏览器访问(若为远端服务器则需要将域名“**localhost**”替换为远端服务器的IP),进入jupyter notebook工具页面。 + 若在Linux环境下则回显打印URL地址,即是打开Jupyter Notebook工具页面的地址,需要复制URL,并使用浏览器访问(若为远端服务器则需要将域名“**localhost**”替换为远端服务器的IP),进入Jupyter Notebook工具页面。 4. 每个.ipynb文件为一项性能数据分析任务,选择需要的.ipynb打开,并在*_path参数下拷贝保存Ascend PyTorch Profiler采集的性能数据的路径。如下示例: - ![advisor_result](img/advisor_result.PNG) + ![advisor_result](./img/advisor_result.PNG) 5. 单击运行按钮执行性能数据分析。 diff --git a/profiler/advisor/__init__.py b/profiler/advisor/__init__.py index 0428ee03f05fac6a068642ccd7c36d56d219ea81..e79018ed05c6d1cdeb56feaa6182f048e3c8e06f 100644 --- a/profiler/advisor/__init__.py +++ b/profiler/advisor/__init__.py @@ -13,3 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. + +from profiler.advisor.interface.interface import Interface \ No newline at end of file diff --git a/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py b/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py index 2b6e5270f278276521b20eae225b0c004a77a2f7..336bef7dd8553eb82586d52260443a7d01e84ab0 100644 --- a/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py +++ b/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py @@ -15,11 +15,13 @@ from common_func_advisor.constant import Constant from advice_factory.advice_factory import AdviceFactory from compute_advice.npu_fused_advice import NpuFusedAdvice +from compute_advice.npu_slow_advice import NpuSlowAdvice class ComputeAdviceFactory(AdviceFactory): ADVICE_LIB = { Constant.NPU_FUSED: NpuFusedAdvice, + Constant.NPU_SLOW: NpuSlowAdvice, } def __init__(self, collection_path: str): diff --git a/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py b/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py index 8cd9acab4c43cc5eff89b6e8c3bdd3ab4a72fc4b..e9be4675963a9cd48da3b4cd91ee646f8e82468b 100644 --- a/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py +++ b/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py @@ -46,7 +46,8 @@ class ClusterAdviceBase(AdviceBase): def cluster_analyze(self): parameter = { - Constant.COLLECTION_PATH: self.collection_path + Constant.COLLECTION_PATH: self.collection_path, + Constant.ANALYSIS_MODE: "all" } try: Interface(parameter).run() diff --git a/profiler/advisor/advisor_backend/cluster_advice/kernel_cluster_advice.py b/profiler/advisor/advisor_backend/cluster_advice/kernel_cluster_advice.py index e2ca914a79451b5bf5fdbcbba14e1f2606cc7cd5..6fa83c765f5fe1f4ac20dcc62895fe0450e338ce 100644 --- a/profiler/advisor/advisor_backend/cluster_advice/kernel_cluster_advice.py +++ b/profiler/advisor/advisor_backend/cluster_advice/kernel_cluster_advice.py @@ -12,7 +12,7 @@ class KernelClusterAdvice(ClusterAdviceBase): COLUMNS_TO_CAL = ["Duration(us)"] CAL_FUN = ['mean', 'var', 'max', 'min', 'count', 'sum'] - def __init__(self, collection_path: str): + def __init__(self, collection_path: str, kwargs: dict = None): super().__init__(collection_path) self.all_kernel_data = pd.DataFrame() diff --git 
a/profiler/advisor/advisor_backend/cluster_advice/slow_link_advice.py b/profiler/advisor/advisor_backend/cluster_advice/slow_link_advice.py index e350e08f39c087198962f0317926787acbceb406..f8a625242f3939602cbb7b8391cd8062e21fe01b 100644 --- a/profiler/advisor/advisor_backend/cluster_advice/slow_link_advice.py +++ b/profiler/advisor/advisor_backend/cluster_advice/slow_link_advice.py @@ -33,7 +33,7 @@ class SlowLinkAdvice(ClusterAdviceBase): SDMA = "SDMA" RDMA = "RDMA" - def __init__(self, collection_path: str): + def __init__(self, collection_path: str, kwargs: dict = None): super().__init__(collection_path) self.rank_bw_dict = defaultdict(lambda: { self.RDMA_TIME_MS: 0, diff --git a/profiler/advisor/advisor_backend/cluster_advice/slow_rank_advice.py b/profiler/advisor/advisor_backend/cluster_advice/slow_rank_advice.py index 516554583240878f211dba01d4f92c0a17a79cdc..4e789fb7fb688626df7e8f5b25b84e4955d6c2a3 100644 --- a/profiler/advisor/advisor_backend/cluster_advice/slow_rank_advice.py +++ b/profiler/advisor/advisor_backend/cluster_advice/slow_rank_advice.py @@ -26,7 +26,7 @@ class SlowRankAdvice(ClusterAdviceBase): RATIO_THRESHOLD = 0.05 BOTTLENECK_LIST = ['Computing', 'Communication', "Free"] - def __init__(self, collection_path: str): + def __init__(self, collection_path: str, kwargs: dict = None): super().__init__(collection_path) def load_step_time(self): diff --git a/profiler/advisor/advisor_backend/common_func_advisor/constant.py b/profiler/advisor/advisor_backend/common_func_advisor/constant.py index 34879db9f2c078854aab6cfe658fc46865b885df..46a7fb24c2dade75c157f18118f29233eb924b88 100644 --- a/profiler/advisor/advisor_backend/common_func_advisor/constant.py +++ b/profiler/advisor/advisor_backend/common_func_advisor/constant.py @@ -15,11 +15,104 @@ from enum import Enum +class CsvTitle: + MODEL_NAME = "Model Name" + MODEL_ID = "Model ID" + TASK_ID = "Task ID" + STREAM_ID = "Stream ID" + INFER_ID = "Infer ID" + TASK_START_TIME = "Task Start Time(us)" + TASK_WAIT_TIME = "Task Wait Time(us)" + BLOCK_DIM = "Block Dim" + MIX_BLOCK_DIM = "Mix Block Dim" + HF32_ELIGIBLE = "HF32 Eligible" + INPUT_SHAPES = "Input Shapes" + INPUT_DATA_TYPES = "Input Data Types" + INPUT_FORMATS = "Input Formats" + OUTPUT_SHAPES = "Output Shapes" + OUTPUT_DATA_TYPES = "Output Data Types" + OUTPUT_FORMATS = "Output Formats" + CONTEXT_ID = "Context ID" + AICORE_TIME = "aicore_time(us)" + AIC_TOTAL_CYCLES = "aic_total_cycles" + AIC_MAC_TIME = "aic_mac_time(us)" + AIC_MAC_RATIO = "aic_mac_ratio" + AIC_SCALAR_TIME = "aic_scalar_time(us)" + AIC_SCALAR_RATIO = "aic_scalar_ratio" + AIC_MTE1_TIME = "aic_mte1_time(us)" + AIC_MTE1_RATIO = "aic_mte1_ratio" + AIC_MTE2_TIME = "aic_mte2_time(us)" + AIC_MTE2_RATIO = "aic_mte2_ratio" + AIC_FIXPIPE_TIME = "aic_fixpipe_time(us)" + AIC_FIXPIPE_RATIO = "aic_fixpipe_ratio" + AIC_ICACHE_MISS_RATE = "aic_icache_miss_rate" + AIV_TIME = "aiv_time(us)" + AIV_TOTAL_CYCLES = "aiv_total_cycles" + AIV_VEC_TIME = "aiv_vec_time(us)" + AIV_VEC_RATIO = "aiv_vec_ratio" + AIV_SCALAR_TIME = "aiv_scalar_time(us)" + AIV_SCALAR_RATIO = "aiv_scalar_ratio" + AIV_MTE2_TIME = "aiv_mte2_time(us)" + AIV_MTE2_RATIO = "aiv_mte2_ratio" + AIV_MTE3_TIME = "aiv_mte3_time(us)" + AIV_MTE3_RATIO = "aiv_mte3_ratio" + AIV_ICACHE_MISS_RATE = "aiv_icache_miss_rate" + CUBE_UTILIZATION = "cube_utilization( %)" + TASK_DURATION_SUM = "Task Duration Sum(us)" + TASK_DURATION_MEAN = "Task Duration Mean(us)" + TASK_DURATION_STD = "Task Duration Std(us)" + TASK_DURATION_RATIO = "Task Duration Ratio(100%)" + SIZE = "size(MB)" + 
THROUGHPUT = "throughput(GB/s)" + COLOR = "color" + GAP = "Gap(us)" + DURATION_SUM = "Duration Sum(us)" + COUNT = "Count" + MAX_DURATION = "Max Duration(us)" + MIN_DURATION = "Min Duration(us)" + AVG_DURATION = "Avg Duration(us)" + DURATION_RATIO = "Duration Ratio" + INDEX = "Index" + + +# 定义CSV_TITILE_V1类,继承自CSV_TITILE类, 适配旧版csv +class CsvTitleV1(CsvTitle): + OP_NAME = "Op Name" + OP_TYPE = "OP Type" + TASK_TYPE = "Task Type" + TASK_DURATION = "Task Duration(us)" + + +# 定义CSV_TITILE_V1类,继承自CSV_TITILE类, 适配新版csv +class CsvTitleV2(CsvTitle): + OP_NAME = "Name" + OP_TYPE = "Type" + TASK_TYPE = "Accelerator Core" + TASK_DURATION = "Duration(us)" + + class Constant: + DTYPE_SIZE_MAP = {"int8": 1, "uint8": 1, + "int16": 2, "uint16": 2, + "int32": 4, "uint32": 4, + "int64": 8, "uint64": 8, + "float16": 2, + "bfloat16": 2, + "bf16": 2, + "dt_bf16": 2, + "float32": 4, + "float": 4, + "float64": 8, + "complex64": 8, + "complex128": 16, + "bool": 1} + TP_THRESHOLD = 1150 MAX_INPUT_MODE_LEN = 30 MAX_INPUT_ADVICE_LEN = 30 SMALL_OP_DUR_RATIO = 0.2 SMALL_OP_NUM_RATIO = 0.2 + BYTE_UNIT_TRANS = 1024 + UNIT_TRANS = 1000 # mode list COMPUTE = "compute" @@ -35,6 +128,7 @@ class Constant: # compute NPU_FUSED = "npu_fused" + NPU_SLOW = "npu_slow" # timeline OPTIM = "optimizer" @@ -108,3 +202,24 @@ class Constant: ("Cast", "Mul", "MaskedFill", "SoftmaxV2", "Cast"): "torch_npu.npu_scaled_masked_softmax", ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Mul"): "torch_npu.npu_rotary_mul", ("Cast", "Square", "ReduceMeanD", "Add", "Rsqrt", "Mul", "Cast", "Mul"): "torch_npu.npu_rms_norm"} + TITLE = CsvTitleV2 + + @classmethod + def update_title(cls): + cls.TITLE = CsvTitleV1 + + +class CoreType: + AIV = "AI_VECTOR_CORE" + AIC = "AI_CORE" + AICPU = "AI_CPU" + MIX_AIV = "MIX_AIV" + MIX_AIC = "MIX_AIC" + HCCL = "HCCL" + + +class PerfColor(Enum): + WHITE = 0 + GREEN = 1 + YELLOW = 2 + RED = 3 diff --git a/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py b/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py index 08ef02876561001b9721e365c9aa6934057674de..8171f06ee235fc02da715044b4d310087c36c102 100644 --- a/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py +++ b/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import os from abc import abstractmethod from dataclasses import dataclass from dataclasses import field from typing import Dict from typing import List +import pandas as pd + from common_func.file_manager import FileManager @@ -89,9 +91,34 @@ class TraceViewJson: self.cann_dur_events: Dict[str, DurationEvent] = dict() self.ascend_hardware_dur_events: Dict[str, DurationEvent] = dict() self.torch_2_npu_flow_events: Dict[str, FlowEvent] = dict() - traces = FileManager.read_json_file(path) self._load_obj(traces) + + def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str: + if ts_col not in data.columns.tolist(): + print("[ERROR] No {} col found in data columns.".format(ts_col)) + return "" + row = data.loc[index_id] + timestamp = row[ts_col] + flow_event = self.get_torch_2_npu_flow_event(timestamp) + if not flow_event.valid(): + print("[ERROR] Get flow event failed for pattern {}.".format(row['pattern'])) + return "" + flow_event_s_key = flow_event.s_point_ts + python_dur_events = self.get_python_dur_events_contain_ts(flow_event_s_key) + if not python_dur_events: + print("[ERROR] No python dur event found for pattern {}.".format(row['pattern'])) + return "" + # 保持新老版本callstack兼容性 + if python_dur_events[0].args.get("Call stack"): + # 旧版本 + call_stack_list = python_dur_events[0].args.get("Call stack").split(";") + else: + python_dur_events.sort(key=lambda e: e.ts) + # 新版本 + call_stack_list = [event.name for event in python_dur_events if event.cat == "python_function"] + call_stack = "\n".join(call_stack_list) + return call_stack def get_torch_2_npu_flow_event(self, end_time) -> FlowEvent: if not self.torch_2_npu_flow_events or not self.torch_2_npu_flow_events.get(end_time): diff --git a/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py b/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py index 5411610a7f4229c6f01c04e352d380f3a2864784..c85c14d618ceda199c9c376abc27a3581eed97b8 100644 --- a/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py +++ b/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py @@ -28,18 +28,10 @@ class CSVAnalyzer: def process(self): df = pd.read_csv(self._path, dtype={"Start Time(us)": str}) - - - pool = multiprocessing.Pool(multiprocessing.cpu_count()) - # 数据预解析 - result = pool.map(self.update_op_row, df.iterrows()) - pool.close() - - preparse_df = pd.DataFrame(result) # 分析是否存在可融合的算子 - op_type_list = preparse_df["Type"].tolist() - duration_list = preparse_df["Duration(us)"].tolist() - start_times = preparse_df["Start Time(us)"].tolist() + op_type_list = df["Type"].tolist() + duration_list = df["Duration(us)"].tolist() + start_times = df["Start Time(us)"].tolist() # 去除末尾的\t分隔符 start_times = [start_time[:-1] for start_time in start_times] result_list = [] @@ -50,10 +42,6 @@ class CSVAnalyzer: "index", "first_timestamp"] return data_frame - @staticmethod - def update_op_row(row): - return OpPerfFactory.build(row[1]).update() - @staticmethod def find_all_sub_lists(op_type_list, duration_list, start_times, expect_sub_list): # 创建一个空字典,用来存储子列表和它们的出现次数和起始位置 diff --git a/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py b/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py index 2442807fd10b7942177990d2283ad34c369659bd..7bcbed5a75807b57a55787c743cfaaff55a68589 100644 --- a/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py +++ b/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py @@ -12,19 +12,29 
@@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import functools from typing import Dict + from common_func_advisor.constant import Constant +from common_func_advisor.constant import CoreType +from common_func_advisor.constant import PerfColor class OpPerfFactory: @classmethod def build(cls, op_row: Dict): - return OpPerf(op_row) + if op_row.get(Constant.TITLE.TASK_TYPE) == CoreType.AIV: + return VecOpPerf(op_row) + elif op_row.get(Constant.TITLE.TASK_TYPE) == CoreType.AIC: + return CubeOpPerf(op_row) + else: + return OpPerf(op_row) class OpPerf: def __init__(self, op_row: Dict): + if "OP Type" in op_row.keys(): + Constant.update_title() self.row = op_row self.model_name = op_row.get("Model Name") self.model_id = op_row.get("Model ID") @@ -75,6 +85,112 @@ class OpPerf: self.aiv_mte3_ratio = op_row.get("aiv_mte3_ratio") self.aiv_icache_miss_rate = op_row.get("aiv_icache_miss_rate") self.cube_utilization = op_row.get("cube_utilization( %)") + + @staticmethod + def get_dtype_size(dtype_str: str): + return Constant.DTYPE_SIZE_MAP.get(dtype_str.lower(), 0) + + @staticmethod + def get_element_count(shape: list): + return functools.reduce(lambda x, y: int(x) * int(y), shape) + + @staticmethod + def shape_to_tuple(shape_str: str) -> tuple: + if not isinstance(shape_str, str): + return [] + shape_str = shape_str.strip('"') + split_shape = shape_str.strip(';') + if not split_shape: + return [] + pairs = split_shape.split(';') + shape_result = [] + for pair in pairs: + pair = pair.strip(";") + elements = pair.split(',') + elements = tuple(int(element) if "" != element else 0 for element in elements) + shape_result.append(elements) + return tuple(shape_result) + + @staticmethod + def dtype_to_tuple(dtypes_str: str) -> tuple: + if not isinstance(dtypes_str, str): + return [] + dtypes_str = dtypes_str.strip('"') + split_dtypes = dtypes_str.strip(';') + if not split_dtypes: + return [] + pairs = split_dtypes.split(';') + return tuple(pairs) + + def get_mac_ratio(self): + return self.aic_mac_ratio + + def get_size(self, shapes_str, dtypes_str): + shapes = self.shape_to_tuple(shapes_str) + dtypes = self.dtype_to_tuple(dtypes_str) + if len(shapes) > len(dtypes): + print(f"[ERROR] The size of shape is greater than that of dtypes.") + return 0 + if len(shapes) < len(dtypes): + shapes = list(shapes) + shapes.extend([(1,)] * (len(dtypes) - len(shapes))) + all_size = 0 + for index, shape in enumerate(shapes): + element_count = self.get_element_count(shape) + dtype_size = self.get_dtype_size(dtypes[index]) + all_size += element_count * dtype_size + return all_size + + def get_calc_size(self): + # input and output bytes (MB) + if not self.input_shapes or not self.output_shapes: + print("[ERROR] There is no tensor data, do not assess vector op performance.") + return 0 + intput_size = self.get_size(self.input_shapes, self.input_data_types) + output_size = self.get_size(self.output_shapes, self.output_data_types) + return (intput_size + output_size) / (Constant.BYTE_UNIT_TRANS * Constant.BYTE_UNIT_TRANS) + + def get_throughput(self): + # throughput(GB/s) + if not self.task_duration or abs(self.task_duration) < 1e-6: + print("[ERROR] There is no task_duration, do not assess vector op performance.") + return 0 + return self.row[Constant.TITLE.SIZE] / Constant.BYTE_UNIT_TRANS / self.task_duration * Constant.UNIT_TRANS * Constant.UNIT_TRANS + + def get_perf_color(self): + return 
PerfColor.WHITE def update(self): + self.row[Constant.TITLE.SIZE] = self.get_calc_size() + self.row[Constant.TITLE.THROUGHPUT] = self.get_throughput() + self.row[Constant.TITLE.COLOR] = self.get_perf_color().name return self.row + + +class VecOpPerf(OpPerf): + def get_perf_color(self) -> PerfColor: + throughput = self.row[Constant.TITLE.THROUGHPUT] + op_duration = self.task_duration + tp_threshold = Constant.TP_THRESHOLD + if throughput == 0: + return PerfColor.WHITE + if throughput < tp_threshold / 2 and op_duration > 20: + return PerfColor.RED + elif tp_threshold / 2 <= throughput < tp_threshold: + return PerfColor.YELLOW + else: + return PerfColor.GREEN + + +class CubeOpPerf(OpPerf): + def get_perf_color(self) -> PerfColor: + aic_mac_ratio = self.get_mac_ratio() + if not aic_mac_ratio: + print("[WARNING] There is no aic_mac_ratio, do not assess cube op performance.") + return PerfColor.WHITE + elif aic_mac_ratio < 0.6: + return PerfColor.RED + elif 0.6 <= aic_mac_ratio < 0.8: + return PerfColor.YELLOW + else: + return PerfColor.GREEN diff --git a/profiler/advisor/advisor_backend/compute_advice/npu_slow_advice.py b/profiler/advisor/advisor_backend/compute_advice/npu_slow_advice.py new file mode 100644 index 0000000000000000000000000000000000000000..caff1c792c2171c33a4dd876b0741d6c215c5766 --- /dev/null +++ b/profiler/advisor/advisor_backend/compute_advice/npu_slow_advice.py @@ -0,0 +1,82 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
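Before the new `npu_slow_advice` module below, a worked restatement of the color rules that `VecOpPerf`/`CubeOpPerf` above implement; the sample values are invented, and `TP_THRESHOLD` mirrors the 1150 GB/s constant defined earlier:

```python
TP_THRESHOLD = 1150.0  # GB/s, mirrors Constant.TP_THRESHOLD

def vec_color(throughput_gbs: float, duration_us: float) -> str:
    # Vector ops: judged by achieved memory throughput.
    if throughput_gbs == 0:
        return "WHITE"               # nothing measurable, skip assessment
    if throughput_gbs < TP_THRESHOLD / 2 and duration_us > 20:
        return "RED"                 # well under half the target and long-running
    if TP_THRESHOLD / 2 <= throughput_gbs < TP_THRESHOLD:
        return "YELLOW"
    return "GREEN"

def cube_color(aic_mac_ratio: float) -> str:
    # Cube ops: judged by MAC utilization ratio.
    if not aic_mac_ratio:
        return "WHITE"
    if aic_mac_ratio < 0.6:
        return "RED"
    if aic_mac_ratio < 0.8:
        return "YELLOW"
    return "GREEN"

print(vec_color(400.0, 35.0), cube_color(0.72))  # RED YELLOW
```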
+from abc import ABC +import multiprocessing + +import pandas as pd + +from compute_advice.compute_advice_base import ComputeAdviceBase +from compute_advice.npu_fused.op_perf import OpPerfFactory +from common_func_advisor.constant import Constant +from common_func_advisor.constant import PerfColor +from advisor_backend.common_func_advisor.trace_view_json import TraceViewJson + + +class NpuSlowAdvice(ComputeAdviceBase, ABC): + OP_PERF_SHEET = "op_perf" + + def __init__(self, collection_path: str): + super().__init__(collection_path) + self.kernel_details_path = "" + self.data = pd.DataFrame() + + @staticmethod + def save_to_excel(data: pd.DataFrame, file_path: str) -> None: + writer = pd.ExcelWriter(file_path, engine="xlsxwriter", mode="w") + data.index.name = Constant.TITLE.INDEX + data.to_excel(writer, index=True, sheet_name=NpuSlowAdvice.OP_PERF_SHEET) + NpuSlowAdvice.color_sheet(data, writer.book, writer.sheets[NpuSlowAdvice.OP_PERF_SHEET]) + writer.sheets[NpuSlowAdvice.OP_PERF_SHEET].freeze_panes = "A2" + writer.close() + + @staticmethod + def color_sheet(data: pd.DataFrame, workbook, worksheet): + color_rgb = { + PerfColor.GREEN.name: workbook.add_format({'bg_color': '#C6EFCE'}), + PerfColor.YELLOW.name: workbook.add_format({'bg_color': '#FFEB9C'}), + PerfColor.RED.name: workbook.add_format({'bg_color': '#FFC7CE'}), + } + for row in data.iterrows(): + color = row[1][Constant.TITLE.COLOR] + fill_format = color_rgb.get(color) + if not fill_format: + continue + worksheet.set_row(row[0] + 1, None, fill_format) + + @staticmethod + def update_op_row(row: tuple): + return OpPerfFactory.build(row[1]).update() + + def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str: + if not self.has_callstack(): + print("There is no call stack info, please set 'with_stack=True'") + return "" + trace_json = TraceViewJson(self.trace_view_path) + return trace_json.get_call_stack(data, index_id, ts_col) + + def run(self): + if not self.path_check(): + return self.data + self.process() + return self.data + + def process(self): + self.data = pd.read_csv(self.kernel_details_path, dtype={"Start Time(us)": str}) + # 去除末尾的\t分隔符 + self.data["Start Time(us)"] = self.data["Start Time(us)"].apply(lambda x: x[:-1]) + pool = multiprocessing.Pool(multiprocessing.cpu_count()) + result = pool.map(self.update_op_row, self.data.iterrows()) + pool.close() + self.data = pd.DataFrame(result) diff --git a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py index 7cbf7d807e0498a4f17d7f1ee78b38fd2212e94e..f5bfc351f2820ac8d797798fd959577da8062ea4 100644 --- a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py +++ b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py @@ -21,9 +21,9 @@ from compare_interface.comparison_interface import ComparisonInterface class OverallSummaryAdvice(AdviceBase): advice_map = { - "Computing Time": "if you want more detailed advice please go to compute_perf_analysis.ipynb.", - "Uncovered Communication Time": "if you want more detailed advice please go to cluster_perf_analysis.ipynb.", - "Free Time": "if you want more detailed advice please go to timeline_perf_analysis.ipynb." 
+ "Computing Time": "if you want more detailed advice please use msprof-analyze advisor computation.", + "Uncovered Communication Time": "if you want more detailed advice, please use msprof-analyze advisor schedule.", + "Free Time": "if you want more detailed advice please use msprof-analyze advisor schedule." } time_name_map = { "Computing Time": "computing", @@ -39,7 +39,7 @@ class OverallSummaryAdvice(AdviceBase): performance_time_dict = { "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', 'Flash Attention Time(Backward)(Num)', 'Other Time'], - "Uncovered Communication Time": [], + "Uncovered Communication Time(Wait Time)": [], "Free Time": ['SDMA Time(Num)'] } @@ -112,6 +112,7 @@ class OverallSummaryAdvice(AdviceBase): if time_value == Constant.INVALID_VALUE: continue duration, _ = self.split_duration_and_num(time_value) + time_category = time_category.split("(")[0] time_category_dict[time_category] = duration self.get_sub_category_time(time_category, time_list, duration) self.cur_data["overall_data"] = time_category_dict @@ -145,7 +146,7 @@ class OverallSummaryAdvice(AdviceBase): overall_data = self.cur_data.get("overall_data") if not overall_data: return - e2e_time = sum([data for data in overall_data.values()]) + e2e_time = '%.3f' % sum([data for data in overall_data.values()]) overall_bottleneck = f"The Model E2E Time is {e2e_time}s.\n" comparison_bottleneck = "" for time_type, time_value in overall_data.items(): @@ -160,7 +161,9 @@ class OverallSummaryAdvice(AdviceBase): if not self._has_base_collection: continue # add comparison bottleneck - base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type, self._base_data)) + time_type_origin = "Uncovered Communication Time(Wait Time)" \ + if time_type == "Uncovered Communication Time" else time_type + base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data)) if time_value > base_duration: ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration)) comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n" diff --git a/profiler/advisor/analyzer/__init__.py b/profiler/advisor/analyzer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/base_analyzer.py b/profiler/advisor/analyzer/base_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..5f4bd3202cd2071088f25564a7d4b14144a34826 --- /dev/null +++ b/profiler/advisor/analyzer/base_analyzer.py @@ -0,0 +1,94 @@ +import logging +from functools import wraps +from typing import Dict, List, Union +from abc import abstractmethod, ABCMeta + +from profiler.advisor.common import constant +from profiler.advisor.common.version_control import VersionControl +from profiler.advisor.dataset.dataset import Dataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.display.html.render import HTMLRender + +logger = logging.getLogger() + + +class BaseAnalyzer(VersionControl, metaclass=ABCMeta): + _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION + + dataset_cls_list = [] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + self.n_processes = n_processes + self.cann_version = kwargs.get("cann_version", constant.DEFAULT_CANN_VERSION) + self.torch_version = kwargs.get("torch_version", constant.DEFAULT_TORCH_VERSION) + self.html_render = HTMLRender() + self.collection_path = 
collection_path + self.kwargs = kwargs + self.dataset_list: Dict[str, List[Dataset]] = {} + self.init_dataset_list() + self.result = OptimizeResult() + self.record_list: Dict[str, List] = {} + + @classmethod + def check_data(cls, data_list: tuple): + """ + check if all data in data list is contained + :param data_list: data list to check + :return: func ptr if check success + """ + + def decorate(func): + + @wraps(func) + def wrapper(self, **kwargs): + data = self.dataset_list + if data is None: + return None + for data_key in data_list: + if data_key not in data: + return None + + logger.info("Enable analysis %s with %s", self.__class__.__name__, ",".join(data_list)) + return func(self) + + return wrapper + + return decorate + + @abstractmethod + def optimize(self, **kwargs): + pass + + @abstractmethod + def make_record(self): + pass + + @abstractmethod + def make_render(self): + pass + + def init_dataset_list(self)->None: + dataset_cls_list = self.dataset_cls_list + if len(dataset_cls_list) == 0: + logger.warning(f"Analyser: %s don't rely on any dataset!", self.__class__.__name__) + return + + for dataset_cls in dataset_cls_list: + if dataset_cls and callable(dataset_cls): + dataset = dataset_cls(collection_path=self.collection_path, data=self.dataset_list, **self.kwargs) + key = dataset_cls.get_key() + if key not in self.dataset_list: + self.dataset_list[key] = [] + self.dataset_list[key].append(dataset) + + @staticmethod + def get_first_data_by_key(data, key) -> Union[Dataset, None]: + """ + get the first member from data with key + :param data: input data + :param key: data key + :return: the first dataset in dataset list + """ + if key in data and len(data[key]) > 0: + return data[key][0] + return None diff --git a/profiler/advisor/analyzer/cluster/__init__.py b/profiler/advisor/analyzer/cluster/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/cluster/slow_link_analyser.py b/profiler/advisor/analyzer/cluster/slow_link_analyser.py new file mode 100644 index 0000000000000000000000000000000000000000..846b79a50f31abb8445a0e5c2e82aaaf3c8ee23d --- /dev/null +++ b/profiler/advisor/analyzer/cluster/slow_link_analyser.py @@ -0,0 +1,122 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
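(`BaseAnalyzer.check_data` above is a parameterized decorator that makes `optimize` return `None` whenever a required dataset was never loaded. A self-contained sketch of the same pattern; `DemoAnalyzer` and the dataset key are illustrative, not from the repo.)

```python
import logging
from functools import wraps

logger = logging.getLogger()


def check_data(required_keys: tuple):
    """Skip the wrapped analysis unless every required dataset key is present."""
    def decorate(func):
        @wraps(func)
        def wrapper(self, **kwargs):
            for key in required_keys:
                if key not in self.dataset_list:
                    return None  # dataset missing: silently skip this analyzer
            logger.info("Enable analysis %s with %s",
                        self.__class__.__name__, ",".join(required_keys))
            return func(self)
        return wrapper
    return decorate


class DemoAnalyzer:
    def __init__(self, datasets: dict):
        self.dataset_list = datasets  # e.g. {"ProfilingDataset": [...]}

    @check_data(("ProfilingDataset",))
    def optimize(self, **kwargs):
        return "analysis ran"


print(DemoAnalyzer({}).optimize())                                 # None
print(DemoAnalyzer({"ProfilingDataset": [object()]}).optimize())   # analysis ran
```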
+ +from collections import defaultdict +from typing import Dict, List +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.common import constant +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.dataset.cluster.cluster_dataset import ClusterCommunicationDataSet + + +class SlowLinkAnalyzer(BaseAnalyzer): + RDMA_TIME_MS = "RDMA time(ms)" + RDMA_SIZE_MB = "RDMA size(mb)" + SDMA_TIME_MS = "SDMA time(ms)" + SDMA_SIZE_MB = "SDMA size(mb)" + RDMA_BANDWIDTH = "RDMA bandwidth(GB/s)" + SDMA_BANDWIDTH = "SDMA bandwidth(GB/s)" + COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info" + TRANSIT_TIME = "Transit Time(ms)" + TRANSIT_SIZE = "Transit Size(MB)" + SDMA = "SDMA" + RDMA = "RDMA" + SLOW_LINK_ANALYSIS = "slow_link_analysis" + dataset_cls_list = [ClusterCommunicationDataSet] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) + key = ClusterCommunicationDataSet.get_key() + self.communication_data_class = self.get_first_data_by_key(self.dataset_list, key) + self.rank_bw_dict = self.communication_data_class.get_data() + self.result = OptimizeResult() + self.bottelneck = '' + self.suggestion = '' + self.format_datas = [] + + def optimize(self, **kwargs): + if self.rank_bw_dict is None: + print("slow_link 分析失败,原因是数据加载失败,请检查你的cluster_analysis_outpu文件夹, \ + 如不关心这类数据请忽略") + return self.result + self.process() + self.format_datas = self.format_details() + self.make_record() + self.make_render() + return self.result + + def process(self): + if self.rank_bw_dict: + self.produce_bottleneck(self.RDMA_BANDWIDTH) + self.produce_bottleneck(self.SDMA_BANDWIDTH) + + def produce_bottleneck(self, link_type: str): + data_list = [rank_dict.get(link_type, 0) for rank_id, rank_dict in self.rank_bw_dict.items()] + avg_bw = round(sum(data_list) / len(data_list), 3) + if avg_bw == 0: + return + self.bottelneck += f'{link_type}: \n' \ + f' The average is {avg_bw}, \n' \ + f' while the maximum is {round(max(data_list), 3)}GB/s \n' \ + f' and the minimum is {round(min(data_list), 3)}GB/s. \n' \ + f' the difference is {round(max(data_list) - min(data_list), 3)}GB/s. 
\n' + + def format_details(self): + if not self.rank_bw_dict: + return { + "headers": [], + "data": [] + } + + details_dict = {} + headers = list({k for rank_bw_value in self.rank_bw_dict.values() for k in rank_bw_value.keys()}) + headers.sort() + data_list = [[rank_id] + [rank_bw.get(k, 0) for k in headers] for rank_id, rank_bw in self.rank_bw_dict.items()] + data_list.sort(key = lambda x: x[0]) # 按rank_id排序 + + details_dict["headers"] = ["rank_id"] + headers + details_dict["data"] = data_list + + return details_dict + + def make_record(self): + """ + make record for what and how to optimize + """ + optimization_item = OptimizeItem( + SlowLinkAnalyzer.SLOW_LINK_ANALYSIS, + self.bottelneck, + self.suggestion + ) + self.result.add(OptimizeRecord(optimization_item)) + + for i, data in enumerate(self.format_datas["data"]): + self.result.add_detail(SlowLinkAnalyzer.SLOW_LINK_ANALYSIS, self.format_datas["headers"], data) + + def make_render(self): + result_for_html = { + "Description" : self.bottelneck, + "suggestion" : self.suggestion, + "details" : [self.format_datas] + } + + self.html_render.render_template(key="cluster", + title=SlowLinkAnalyzer.SLOW_LINK_ANALYSIS, + template_dir="templates", + template_name="cluster_analysis.html", + cann_version=self.cann_version, + torch_version=self.torch_version, + result=result_for_html) \ No newline at end of file diff --git a/profiler/advisor/analyzer/cluster/slow_rank_analyser.py b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py new file mode 100644 index 0000000000000000000000000000000000000000..4215b514a215a2a350571746ff9cb90c3c9956eb --- /dev/null +++ b/profiler/advisor/analyzer/cluster/slow_rank_analyser.py @@ -0,0 +1,112 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
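(`SlowLinkAnalyzer.produce_bottleneck` above reduces each link type to its average, maximum, minimum, and max-min gap across ranks. A toy reproduction of that reduction; the shape of `rank_bw_dict` is inferred from the diff and the bandwidth values are made up.)

```python
rank_bw_dict = {
    0: {"RDMA bandwidth(GB/s)": 18.2, "SDMA bandwidth(GB/s)": 21.4},
    1: {"RDMA bandwidth(GB/s)": 12.7, "SDMA bandwidth(GB/s)": 21.1},
}


def summarize(link_type: str) -> str:
    data = [bw.get(link_type, 0) for bw in rank_bw_dict.values()]
    avg_bw = round(sum(data) / len(data), 3)
    if avg_bw == 0:
        return ""  # no traffic of this link type: nothing to report
    return (f"{link_type}: avg {avg_bw}GB/s, max {round(max(data), 3)}GB/s, "
            f"min {round(min(data), 3)}GB/s, gap {round(max(data) - min(data), 3)}GB/s")


print(summarize("RDMA bandwidth(GB/s)"))
```

A large max-min gap on one link type is what the analyzer reports as a slow-link bottleneck.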
+ +from collections import defaultdict +from typing import Dict, List +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.common import constant +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.dataset.cluster.cluster_dataset import ClusterStepTraceTimeDataSet + + +class SlowRankAnalyzer(BaseAnalyzer): + SLOW_RANK_ANALYSIS = "slow_rank_analysis" + RANK = "rank" + RATIO_THRESHOLD = 0.05 + BOTTLENECK_LIST = ['Computing', 'Communication', "Free"] + dataset_cls_list = [ClusterStepTraceTimeDataSet] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) + key = ClusterStepTraceTimeDataSet.get_key() + self.step_trace_class = self.get_first_data_by_key(self.dataset_list, key) + self.step_trace_dict = self.step_trace_class.get_data() + self.result = OptimizeResult() + self.bottelneck = '' + self.suggestion = '' + self.format_datas = [] + + def optimize(self, **kwargs): + if self.step_trace_dict is None: + print("slow_rank 分析失败,原因是数据加载失败,请检查你的cluster_analysis_outpu文件夹 \ + 如不关心这类数据请忽略") + return self.result + self.process() + self.format_datas = self.format_details() + self.make_record() + self.make_render() + return self.result + + def process(self): + total_time_list = [sum(data_tuple) for rank_id, data_tuple in self.step_trace_dict.items()] + if total_time_list: + mean_total_time = sum(total_time_list) / len(total_time_list) + for i in range(len(self.BOTTLENECK_LIST)): + self.produce_bottleneck(self.step_trace_dict, i, mean_total_time) + + def produce_bottleneck(self, step_dict: dict, produce_type: int, mean_total_time: float): + data_list = [data_tuple[produce_type] for rank_id, data_tuple in step_dict.items()] + max_ratio = self.compute_max_gap_ratio(data_list, mean_total_time) + if max_ratio > self.RATIO_THRESHOLD: + self.bottelneck += f'{self.BOTTLENECK_LIST[produce_type]} \n' \ + f' has some issues in the cluster, \n' \ + f' because the max difference of {self.BOTTLENECK_LIST[produce_type]} time \n' \ + f' has reached {round(max_ratio * mean_total_time / 1000, 3)}ms. 
\n' + + def make_record(self): + """ + make record for what and how to optimize + """ + optimization_item = OptimizeItem( + SlowRankAnalyzer.SLOW_RANK_ANALYSIS, + self.bottelneck, + self.suggestion + ) + self.result.add(OptimizeRecord(optimization_item)) + for i, data in enumerate(self.format_datas["data"]): + self.result.add_detail(SlowRankAnalyzer.SLOW_RANK_ANALYSIS, self.format_datas["headers"], data) + + def format_details(self): + details_dict = {} + headers = ["rank_id", "compute", "communication", "free"] + data_list = [] + for key,value in self.step_trace_dict.items(): + data_list.append([key] + value) + details_dict["headers"] = headers + details_dict["data"] = data_list + return details_dict + + def make_render(self): + result_for_html = { + "Description" : self.bottelneck, + "suggestion" : self.suggestion, + "details" : [self.format_datas] + } + + self.html_render.render_template(key="cluster", + title=SlowRankAnalyzer.SLOW_RANK_ANALYSIS, + template_dir="templates", + template_name="cluster_analysis.html", + cann_version=self.cann_version, + torch_version=self.torch_version, + result=result_for_html) + + @staticmethod + def compute_max_gap_ratio(data: list, mean: float): + if mean == 0: + return 0 + else: + return (max(data) - min(data)) / mean diff --git a/profiler/advisor/analyzer/communication/__init__.py b/profiler/advisor/analyzer/communication/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/communication/bandwidth/__init__.py b/profiler/advisor/analyzer/communication/bandwidth/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/communication/environment/__init__.py b/profiler/advisor/analyzer/communication/environment/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/computation/__init__.py b/profiler/advisor/analyzer/computation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/computation/aicpu/__init__.py b/profiler/advisor/analyzer/computation/aicpu/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..4eca1c6c0278349cf4068544d2a53d8de7f0d5e1 --- /dev/null +++ b/profiler/advisor/analyzer/computation/aicpu/aicpu_checker.py @@ -0,0 +1,278 @@ +import copy +import os +from functools import partial +from typing import List, Dict, Optional + +import yaml +from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker, logger +from profiler.advisor.analyzer.schedule.fusion_ops.timeline_api_stack_checker import OpStackFinder +from profiler.advisor.common import constant +from profiler.advisor.dataset.dataset import Dataset +from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset + + +class AicpuChecker(OperatorChecker): + _CHECKER = "aicpu operator" + _PROBLEM = "AICPU operator" + _MIN_TASK_DURATION = 20 + _description = f"Some operators and task 
duration exceed {_MIN_TASK_DURATION} us, such as :\n" + _SUGGESTION: List[str] = ["Modify code to avoid aicpu operator"] + STACK_INFO_ITEMS = "stack_info" + SUGGESTION_INFO_ITEMS = "suggestions" + _ITEMS = [ + "op_name", "op_type", "task_duration", "input_shapes", "input_data_types", "input_formats", "output_shapes", + "output_data_types", "output_formats" + ] + + def __init__(self, cann_version): + super(AicpuChecker, self).__init__(cann_version=cann_version) + self.aicpu_rules: Dict = {} + self.aicpu_checker: Dict = {} + self.load_aicpu_rules() + + def _check_data(self, profiling_data: ProfilingDataset) -> bool: + if not self._check_summary(profiling_data): + return False + return True + + def _check_operator(self, op_info) -> bool: + return op_info.task_type == constant.AI_CPU + + def load_aicpu_rules(self, rule_path="rules/aicpu_rules.yaml") -> Dict: + if not os.path.isabs(rule_path): + rule_path = os.path.join(os.path.dirname(__file__), + "../../../", rule_path) + + if not os.path.exists(rule_path): + logger.warning("Skip analyze aicpu issues, because %s does not exist.", rule_path) + return {} + with open(rule_path, 'r') as f: + self.aicpu_rules = yaml.safe_load(f) + self.filter_aicpu_rules(self.aicpu_rules) + for checker_name, check_rule in self.aicpu_rules.items(): + if not isinstance(check_rule, (list, dict,)): + continue + + if checker_name not in AICPU_CHECKER.keys(): + logger.warning("Skip %s, which is not support now.", checker_name) + continue + + self.aicpu_checker[checker_name] = AICPU_CHECKER[checker_name](check_rule) + + def filter_aicpu_rules(self, aicpu_rules): + support_checkers = [] + for checkers in aicpu_rules['CommonChecker']: + for key, value in checkers.items(): + if key == 'DataTypeChecker' and self.cann_version in value['cann_version']: + support_checkers.append(checkers) + aicpu_rules['CommonChecker'] = support_checkers + return + + def check_aicpu_attr(self, op_info) -> List[str]: + suggestions = [] + for _, checker in self.aicpu_checker.items(): + suggestions.extend(checker.check(op_info)) + return suggestions + + def check(self, profiling_data: ProfilingDataset) -> bool: + """ + check if any operator need optimize + :param profiling_data: profiling datasest + :return: true or false + """ + + if not self._check_data(profiling_data): + return False + op_summary = profiling_data.op_summary + + def get_opeartor_stack_info(api_stack_finder: OpStackFinder, op_name_list: list) -> list: + data: Dict[str, Dataset] = {} + event_dataset = TimelineEventDataset(collection_path=profiling_data.collection_path, data=data, task_type=constant.AI_CPU) + + # disable multiprocessing, avoid cost time of enable new process for light task + api_stack_finder.get_api_stack_by_op(event_dataset, op_name_list, constant.AI_CPU, + disable_multiprocess=True) + return api_stack_finder._stack_record + + self._op_list = [] + total_task_duration = 0.0 + max_task_duration = 0.0 + for op_info in op_summary.op_list: + if self._check_operator(op_info): + self._op_list.append(op_info) + + task_duration = float(op_info.task_duration) + total_task_duration += task_duration + max_task_duration = max(max_task_duration, task_duration) + if (not self._op_list) or (max_task_duration < self._MIN_TASK_DURATION): + return False + + # 获取所有算子堆栈的信息 + op_name_list = [] + for op in self._op_list: + if op.op_name not in op_name_list: + op_name_list.append(op.op_name) + api_stack_finder = OpStackFinder() + stack_record = get_opeartor_stack_info(api_stack_finder, op_name_list) + + # task_id 到 stack 信息的对应 + 
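# i.e., build a task_id -> stack-info mapping: both lists below are sorted by task_id, then each AICPU operator is annotated with its Python call stack and per-rule suggestions via dict lookup.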
self._op_list.sort(key=lambda x: int(x.task_id)) + stack_record.sort(key=lambda x: x[0]) + task_id_to_stack = dict() + for stack in stack_record: + task_id_to_stack[stack[0]] = stack[-1] + + # 算子追加堆栈属性 + for op in self._op_list: + stack = task_id_to_stack.get(int(op.task_id)) + op.add_attr(self.STACK_INFO_ITEMS, stack) + suggestions = self.check_aicpu_attr(op) + op.add_attr(self.SUGGESTION_INFO_ITEMS, suggestions) + + # double 类型算子判断 + double_type_ai_cpu_operator = [] + for op in self._op_list: + if not op.has_attr("input_data_types"): + logger.warning( + "Skip checking of input data in AICPU checker because of not containing input_data_dtypes in op summary") + break + if op.has_attr( + "input_data_types") and "DOUBLE" in op.input_data_types and op.op_name not in double_type_ai_cpu_operator: + double_type_ai_cpu_operator.append(op.op_name) + if bool(double_type_ai_cpu_operator): + self._SUGGESTION.append("Try to convert double type operator to float, such as {}".format( + ",".join(double_type_ai_cpu_operator))) + return True + + def make_render(self, html_render, record): + html_render.render_template(key="computation", + template_dir="templates", + template_name="operator_ai_cpu.html", + format_result=self.format_operator_result(record, constant.OPERATOR_LIST_UNLIMIT)) + + def format_operator_result(self, record, limit): + """ + Format operator result to html + :param record: profiling check record + :param limit: Limit number of operator statistics lists. + :return: + """ + optimization_item = record.optimization_item + release_suggestion_list = [] + for suggestion in optimization_item.suggestion: + release_suggestion_list.append(suggestion.replace('\n', '
<br>')) + logger.debug("suggestion list is %s", release_suggestion_list) + format_result = {"record": record.__dict__, "suggestion": '<br>
'.join(release_suggestion_list), + "task_duration": round(record.statistics_item.task_duration, 2)} + + statistic = self.group_by(copy.deepcopy(self._op_list), op_key='op_type', + limit=limit) + format_result["statistic"] = statistic + stack_key_list = ["stack_info", "input_data_types", "output_data_types"] + if statistic: + for key, info in statistic: + op_info_list = self.group_by_list(info.get("op_info_list"), stack_key_list, limit) + info["op_info_list"] = op_info_list + return format_result + + def group_by_list(self, op_list, op_key_list: List = ["stack_info", "input_data_types", "output_data_types"], + limit: int = constant.OPERATOR_LIST_UNLIMIT): + if op_list is None: + op_list = [] + + # op_key_list 合并添加合并的属性,作为 groupby 的 key value + op_key = '+'.join(op_key_list) # str, json + for op_info in op_list: + attribute = "" + for _op in op_key_list: + if op_info.get_attr(_op): + attribute += op_info.get_attr(_op) + op_info.add_attr(op_key, attribute) + + return self.group_by(op_list, op_key=op_key, limit=limit) + + +class BaserChecker: + def __init__(self, *args, **kwargs): + self.checker_list = [] + + def build(self): + raise NotImplementedError + + def check(self, op_info) -> List[str]: + suggestions = [] + for checker in self.checker_list: + suggestion = checker(op_info) + if suggestion is not None: + suggestions.append(suggestion) + return suggestions + + +class CommonChecker(BaserChecker): + def __init__(self, check_rules: List[Dict] = None): + super(CommonChecker, self).__init__() + self.check_rules = check_rules if check_rules is not None else [] + self.supported_checker = dict(DataTypeChecker=self.datatype_checker) + self.build() + + @staticmethod + def datatype_checker(check_item: Dict, op_info) -> Optional[str]: + supported_op_type = check_item.get('op_type', []) + suggestion = check_item.get('suggestion', "") + valid_inputs = check_item.get('input', []) + valid_outputs = check_item.get('output', []) + ignore_type = check_item.get('ignore_type', []) + op_type = getattr(op_info, 'op_type', "UNKNOWN") + if "__ALL__" in supported_op_type or \ + op_type.lower() in supported_op_type: + if op_type.lower() in ignore_type: + return None + + op_input_dtype = getattr(op_info, 'input_data_types', "").split(";") + op_input_dtype = [item.lower() for item in op_input_dtype] + op_output_dtype = getattr(op_info, 'output_data_types', "").split(";") + op_output_dtype = [item.lower() for item in op_output_dtype] + input_dtype_diff = set(op_input_dtype).difference(set(valid_inputs)) + output_dtype_diff = set(op_output_dtype).difference(set(valid_outputs)) + unsupported_dtype_diff = input_dtype_diff.union(output_dtype_diff) + if not unsupported_dtype_diff: + return None + + return suggestion.format(",".join(unsupported_dtype_diff).upper(), + op_type, + ",".join(valid_inputs).upper()) + + def build(self): + for check in self.check_rules: + (check_func, check_rule), = check.items() + if check_func not in self.supported_checker: + logger.warning("Skip %s, which has not been implemented.", check_func) + continue + self.checker_list.append(partial(self.supported_checker.get(check_func), check_rule)) + + +class ExampleGuideChecker(BaserChecker): + def __init__(self, check_rules: List[Dict] = None): + super(ExampleGuideChecker, self).__init__() + self.check_rules = check_rules if check_rules is not None else [] + self.build() + + def build(self): + def _guide_url(check_item: Dict, op_info) -> Optional[str]: + supported_op_type = check_item.get('op_type', []) + url = check_item.get('url', "") + suggestion 
= check_item.get('suggestion', "") + + if getattr(op_info, 'op_type', "UNKNOWN").lower() in supported_op_type: + return suggestion if "{}" not in suggestion else suggestion.format(url) + + for check in self.check_rules: + (_, check_rule), = check.items() + self.checker_list.append(partial(_guide_url, check_rule)) + + +AICPU_CHECKER = { + "CommonChecker": CommonChecker, + "ExampleGuideChecker": ExampleGuideChecker +} diff --git a/profiler/advisor/analyzer/computation/bound/__init__.py b/profiler/advisor/analyzer/computation/bound/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/computation/bound/block_dim_checker.py b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..a7d7ddd93c70e59dc0d10318fdac06fdc581f70c --- /dev/null +++ b/profiler/advisor/analyzer/computation/bound/block_dim_checker.py @@ -0,0 +1,75 @@ +import logging + +from typing import List + +from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker +from profiler.advisor.common import constant +from profiler.advisor.config.config import Config +from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset + +logger = logging.getLogger() + + +class BlockDimChecker(OperatorChecker): + _SUGGESTION: List[str] = [] + _CHECKER = "block dim" + _PROBLEM = "block dim" + _description = "some operator does not make full use of {} ai core" + _ITEMS = [ + "op_name", "op_type", "task_type", "task_duration", "income", "block_dim", "mix_block_dim", "input_shapes", + "input_data_types", "input_formats", "output_shapes", "output_data_types", "output_formats" + ] + + def pre_check(self, profiling_data) -> bool: + return not self.is_dynamic_shape(profiling_data) + + def _check_data(self, data): + self.format_suggestion_content(data) + if not self._check_summary(data): + return False + if not Config().get_config("ai_core_num"): + logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "ai core num in info.json file") + return False + summary = data.op_summary + op_info = summary.op_list[0] + if not hasattr(op_info, "block_dim"): + logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "block dim in op summary") + return False + if Config().get_config("ai_core_num"): + self._aicore_num = int(Config().get_config("ai_core_num")) + if Config().get_config("aiv_num"): + self._aiv_num = int(Config().get_config("aiv_num")) + self._description = self._description.format(self._aicore_num) + if self._aiv_num: + self._description += f" or {self._aiv_num} ai vector core" + self._description += f";\n Top-{OperatorChecker._MAX_TUNE_OP_NUM} operator of " \ + "task duration are as follows:\n" + return True + + def make_render(self, html_render, record): + html_render.render_template(key="computation", + template_dir="templates", + template_name="operator_block_dim.html", + format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK)) + + def _check_operator(self, op_info) -> bool: + if op_info.task_type not in ["AI_CORE", "AI_VECTOR_CORE", "MIX_AIC"]: + return False + block_dim = int(op_info.block_dim) + core_num = self.get_core_num(op_info) + if block_dim % core_num == 0: + return False + if op_info.task_type == "MIX_AIC" and hasattr(op_info, "mix_block_dim") \ + and self._aiv_num and int(op_info.mix_block_dim) % self._aiv_num == 0: + return False + return True + + def get_core_num(self, op_info): + """ + get core num of 
task type + """ + if op_info.task_type == "AI_CORE" or not self._aiv_num: + core_num = self._aicore_num + else: + core_num = self._aiv_num + return core_num diff --git a/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..a22b380f974b14207d6d7be262cd49f0ba0fbe99 --- /dev/null +++ b/profiler/advisor/analyzer/computation/bound/operator_bound_checker.py @@ -0,0 +1,53 @@ +import logging +from typing import List + +from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker +from profiler.advisor.common import constant +from profiler.advisor.config.config import Config +from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset +from profiler.advisor.utils.utils import to_percent + +logger = logging.getLogger() + + +class OperatorBoundChecker(OperatorChecker): + _MIN_TASK_DURATION = 20 # min task duration 20us + _CHECKER = "operator no bound" + _PROBLEM = "operator no bound" + _SUGGESTION: List[str] = [] + _description = ( + f"There is no mte, cube, vector, scalar ratio is more than {to_percent(Config().operator_bound_ratio)};\n" + + f"Top task duration operators need to be tuned are as follows: \n") + _ITEMS = [ + "op_name", "op_type", "task_type", "task_duration", "vec_ratio", "mac_ratio", "scalar_ratio", "mte1_ratio", + "mte2_ratio", "mte3_ratio", "block_dim", "input_shapes", "input_data_types", "input_formats", "output_shapes", + "output_data_types", "output_formats" + ] + + def pre_check(self, profiling_data) -> bool: + return not self.is_dynamic_shape(profiling_data) + + def _check_data(self, data): + self.format_suggestion_content(data) + if not self._check_summary(data): + return False + for op_info in data.op_summary.op_list: + return self._check_operator(op_info) + + logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "ratio in op summary") + return False + + def _check_operator(self, op_info) -> bool: + bound_list = ["vec_ratio", "mac_ratio", "scalar_ratio", "mte1_ratio", "mte2_ratio", "mte3_ratio"] + ratio_list = [self.get_ratio(op_info, attr) for attr in bound_list] + if not any(ratio_list): + return False # no data, skip check + if any(ratio and ratio > Config().operator_bound_ratio for ratio in ratio_list): + return False + return True + + def make_render(self, html_render, record): + html_render.render_template(key="computation", + template_dir="templates", + template_name="operator_no_bound.html", + format_result=self.format_operator_result(record, constant.OPERATOR_OUT_TOPK)) diff --git a/profiler/advisor/analyzer/computation/op_compile/__init__.py b/profiler/advisor/analyzer/computation/op_compile/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..86d3bac4ff8cb163d23a6365307b855839b12a6a --- /dev/null +++ b/profiler/advisor/analyzer/computation/op_compile/dynamic_shape_checker.py @@ -0,0 +1,65 @@ +import copy +import logging +from typing import List + +from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker +from profiler.advisor.common import constant +from profiler.advisor.dataset.profiling.info_collection import OpInfo +from 
profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord + +logger = logging.getLogger() + + +class DynamicShapeChecker(OperatorChecker): + ENABLE_COMPILED_SUGGESTION = "Optimize by enabling compiled operator, such as:\n" \ + "`torch_npu.npu.set_compile_mode(jit_compile=False)`\n" + _SUGGESTION: List[str] = [ENABLE_COMPILED_SUGGESTION] + _CHECKER = "dynamic shape operator" + _PROBLEM = "Dynamic shape operator" + _description = f"Found all operators are dynamic shape" + _op_list: List[OpInfo] = [] + _tune_op_list: List[str] = [] # record op name to be tuned, and save to tune_ops_file.cfg + _op_views: List = [] + + def __init__(self, cann_version) -> None: + super().__init__(cann_version=cann_version) + + def check(self, profiling_database) -> bool: + return self.is_dynamic_shape(profiling_database) + + def make_record(self, profiling_database) -> OptimizeRecord: + """ + make record for what and how to optimize + """ + + optimization_item = OptimizeItem( + self._PROBLEM, + self._description, + self._SUGGESTION + ) + statistics_item = StatisticsItem("", "", 1) + return OptimizeRecord(optimization_item, statistics_item) + + def format_operator_result(self, record, limit=-1): + """ + Format operator result to html + :param record: profiling check record + :param limit: Limit number of operator statistics lists. + :return: + """ + optimization_item = record.optimization_item + release_suggestion_list = [] + for suggestion in optimization_item.suggestion: + release_suggestion = copy.deepcopy(suggestion) + if release_suggestion == DynamicShapeChecker.ENABLE_COMPILED_SUGGESTION: + release_suggestion += \ + f"for details please refer to link : LINK" + release_suggestion_list.append(release_suggestion.replace('\n', '
<br>')) + format_result = {"record": record.__dict__, "suggestion": '<br>
'.join(release_suggestion_list)} + return format_result + + def make_render(self, html_render, record): + html_render.render_template(key="computation", + template_dir="templates", + template_name="operator_dynamic_shape.html", + format_result=self.format_operator_result(record)) diff --git a/profiler/advisor/analyzer/computation/operator_checker.py b/profiler/advisor/analyzer/computation/operator_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..0f47650943a7355b494bd766214d10526c46c0fa --- /dev/null +++ b/profiler/advisor/analyzer/computation/operator_checker.py @@ -0,0 +1,307 @@ +import copy +import logging +from textwrap import fill +from typing import List + +from profiler.advisor.common import constant +from profiler.advisor.common.version_control import VersionControl +from profiler.advisor.config.config import Config +from profiler.advisor.dataset.profiling.info_collection import OpInfo +from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset +from profiler.advisor.result.item import OptimizeItem, StatisticsItem, OptimizeRecord +from profiler.advisor.utils.utils import safe_division + +logger = logging.getLogger() + + +class OperatorChecker(VersionControl): + _SUPPORT_VERSIONS = constant.SUPPORTED_CANN_VERSION + _MAX_TUNE_OP_NUM = constant.OPERATOR_OUT_TOPK + _MIN_TASK_DURATION = 0 + _MIN_TASK_DURATION_RATIO = 1.0 + _MIN_TOTAL_DURATION_RATIO = 1.0 + _CHECKER = str() + _PROBLEM = str() + _description = str() + STACK_INFO_ITEMS = "" + _ITEMS: List[str] = [] + _SUGGESTION: List[str] = [] + SKIP_CHECK_MSG = "Skip %s checker because of not containing %s" + _tune_op_info_list: List[OpInfo] = [] + PyTorch_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE, such as:\n" \ + f"'aoe --job_type=2 --model_path=$user_dump_path " \ + f"--tune_ops_file={Config().tune_ops_file}'\n" + MSLite_OPERATOR_TUNE_SUGGESTION = f"Optimize operator by AOE in mindspore lite framework, such as:\n" \ + f"converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \ + f"--modelFile=$user_model.onnx --outputFile=user_model --configFile=./config.txt\n" + _tune_op_list: List[str] = [] + + def __init__(self, cann_version: str): + self.cann_version = cann_version + self._op_list: List[OpInfo] = [] + + def check(self, profiling_data: ProfilingDataset) -> bool: + """ + check if any operator need optimize + :param profiling_data: profiling datasest + :return: true or false + """ + if not self._check_data(profiling_data): + return False + + summary = profiling_data.op_summary + total_task_duration = 0.0 + max_task_duration = 0.0 + for op_info in summary.op_list: + if not self._check_operator(op_info): + continue + task_duration = float(op_info.task_duration) + total_task_duration += task_duration + max_task_duration = max(max_task_duration, task_duration) + self._op_list.append(op_info) + if task_duration > self._MIN_TASK_DURATION: + self._tune_op_info_list.append(op_info) + + if any([ + max_task_duration > self._MIN_TASK_DURATION, + round(safe_division(max_task_duration, summary.get_total_task_duration()), + 4) > self._MIN_TASK_DURATION_RATIO, + round(safe_division(total_task_duration, summary.get_total_task_duration()), 4) > + self._MIN_TOTAL_DURATION_RATIO, + ]): + self._op_list.sort(key=lambda x: float(x.get_attr("task_duration")), reverse=True) + self._tune_op_info_list.sort(key=lambda x: float(x.get_attr("task_duration")), reverse=True) + for op in self._op_list: + if op.op_name not in self._tune_op_list and len(self._tune_op_list) < 
constant.OPERATOR_OUT_TOPK: + self._tune_op_list.append(op.op_name) + return True + return False + + def make_record(self, profiling_data: ProfilingDataset): + """ + Make record for what and how to optimize + :param profiling_data: profiling data + :return: optimize record + """ + task_duration_list = [float(op_info.get_attr("task_duration")) for op_info in self._op_list if + hasattr(op_info, "get_attr")] + total_cost_time = sum(task_duration_list) + total_task_duration = profiling_data.op_summary.get_total_task_duration() + count = len(task_duration_list) + statistics_item = StatisticsItem(total_task_duration, total_cost_time, count, self.get_incomes()) + optimization_item = OptimizeItem( + self._PROBLEM, + self._get_description(self._description, self.get_op_type_list(self._op_list)[:self._MAX_TUNE_OP_NUM]), + self._SUGGESTION + ) + return OptimizeRecord(optimization_item, statistics_item) + + def _get_description(self, description, op_type_list=None): + if not op_type_list: + return description + + desc_suffix = [] + for i in range(len(op_type_list)): + if i % 3 == 0 and i != 0: + desc_suffix.append("\n") + + desc_suffix.append(f"{op_type_list[i]}") + + if i < len(op_type_list) - 1: + desc_suffix.append(", ") + + description += "".join(desc_suffix) + return description + + def pre_check(self, profiling_data) -> bool: + return True + + def is_dynamic_shape(self, profiling_database: ProfilingDataset) -> bool: + less_than_cann800_list = [constant.CANN_VERSION_C30, constant.CANN_VERSION_C13, constant.CANN_VERSION_C15] + # CANN 8.0.0 之前从 ge_info 中获取 op_state 属性,进行动态 shape 逻辑判断 + if self.cann_version in less_than_cann800_list: + if hasattr(profiling_database, "ge_info"): + ge_info = profiling_database.ge_info + static_shape_operators = ge_info.get_static_shape_operators() + if len(static_shape_operators) == 0: + return True + else: + logger.warning( + "Skip dynamic shape check because of not containing ge_info.db file in host filefloder.\n" + "To enable dynamic shape check, please try to set data_simplification=False in experimental_config.\n" + "More details please refer to link : %s", constant.ASCEND_PROFILER_URL) + else: + # CANN 8.0.0 之后 op_state 属性从 op_summary 文件中获取 + if hasattr(profiling_database, "op_summary"): + static_shape_operators = profiling_database.op_summary.get_static_shape_operators() + if len(static_shape_operators) == 0: + return True + else: + logger.warning( + "Skip dynamic shape check because of not containing op_summary.csv file in current filefloder." + ) + return False + + def format_operator_result(self, record, limit): + """ + Format operator result to html + :param record: profiling check record + :param limit: Limit number of operator statistics lists. + :return: + """ + optimization_item = record.optimization_item + release_suggestion_list = [] + for suggestion in optimization_item.suggestion: + release_suggestion = copy.deepcopy(suggestion) + if release_suggestion == OperatorChecker.PyTorch_OPERATOR_TUNE_SUGGESTION: + release_suggestion += \ + (f"for details please refer to link : LINK") + elif release_suggestion == OperatorChecker.MSLite_OPERATOR_TUNE_SUGGESTION: + release_suggestion += \ + (f"\nThe config file for MSLite AOE usage is as follows:\n" \ + f"[ascend_context]\n" \ + f"aoe_mode=\"operator tuning\"\n" \ + f"--tune_ops_file={Config().tune_ops_file}\n" + f"\nFor details please refer to link : LINK") + release_suggestion_list.append(release_suggestion.replace('\n', '
<br>')) + format_result = {"record": record.__dict__, + "suggestion": fill('<br>
'.join(release_suggestion_list), width=200), + "task_duration": round(record.statistics_item.task_duration, 2)} + statistic = self.group_by(copy.deepcopy(self._op_list), limit=limit) + format_result["statistic"] = statistic + return format_result + + def group_by(self, op_list, op_key="op_type", + limit: int = constant.OPERATOR_LIST_UNLIMIT): + """ + group by Profiling.OpInfo's attribute key, then return top limit tuple by duration + :param op_list: input a OpInfo list + :param op_key: group by Profiling.OpInfo's attribute key + :param limit: top limit num, if you do not need to limit the length of tuple, input -1(int) + :return: + """ + if op_list is None: + op_list = [] + statistic = {} # str, json + for op_info in op_list: + if statistic.get(op_info.get_attr(op_key)): + statistic[op_info.get_attr(op_key)]["summary"]["total_duration"] = float( + statistic[op_info.get_attr(op_key)]["summary"]["total_duration"]) + float( + op_info.get_attr("task_duration", constant.DEFAULT_DURATION_ZERO)) + statistic[op_info.get_attr(op_key)]["summary"]["counts"] += 1 + stack_info = op_info.get_attr("stack_info") + if stack_info: + op_info.stack_info = stack_info.replace('\r\n', '
<br/>') + statistic[op_info.get_attr(op_key)]["op_info_list"].append(op_info) + else: + statistic[op_info.get_attr(op_key)] = {"summary": {}, "op_info_list": []} + statistic[op_info.get_attr(op_key)]["summary"]["op_type"] = op_info.get_attr( + "op_type", constant.DEFAULT_OPERATOR_TYPE) + statistic[op_info.get_attr(op_key)]["summary"]["total_duration"] = float( + op_info.get_attr("task_duration", constant.DEFAULT_DURATION_ZERO)) + statistic[op_info.get_attr(op_key)]["summary"]["counts"] = 1 + stack_info = op_info.get_attr("stack_info") + if stack_info: + op_info.stack_info = stack_info.replace('\r\n', '<br/>
') + statistic[op_info.get_attr(op_key)]["op_info_list"] = [op_info] + + if statistic: + for op_key in statistic.keys(): + statistic[op_key]["summary"]["total_duration"] = round( + statistic[op_key]["summary"]["total_duration"], 2) + # Grouped by op_type, sorted by total_duration, and obtained the top 10 operators that take the most time. + if limit > 0: + statistic = sorted( + statistic.items(), key=lambda kv: kv[1]["summary"]["total_duration"], reverse=True)[:limit] + else: + statistic = sorted(statistic.items(), key=lambda kv: kv[1]["summary"]["total_duration"], reverse=True) + else: + logger.warning("%s checker do not has results to format html", str(self.__class__.__name__)) + return statistic + + def _check_data(self, profiling_data): + return True + + def _check_operator(self, op_info): + return False + + def _get_income(self, _op_info: OpInfo) -> float: + return 0 + + def get_tune_op_list(self): + """ + get tune op list + :return: tune op list + """ + return self._tune_op_list + + def get_views(self, _graph_data): + """Get node views.""" + return [] + + @classmethod + def get_name(cls): + """ + get name of checker + :return: checker name + """ + return cls._PROBLEM + + def get_incomes(self) -> float: + """get incomes""" + incomes = 0.0 + for op_info in self._op_list: + income = self._get_income(op_info) + setattr(op_info, "income", round(income, 2)) + incomes += income + return incomes + + def get_op_type_list(self, op_list: List[OpInfo]): + """get op type list""" + op_type_list = [] + for op_info in op_list: + if op_info.op_type not in op_type_list: + op_type_list.append(op_info.op_type) + return op_type_list + + def _check_summary(self, data: ProfilingDataset): + if not hasattr(data, "op_summary"): + logger.warning(self.SKIP_CHECK_MSG, self._CHECKER, "op summary") + return False + return True + + @staticmethod + def get_ratio(op_info: OpInfo, attr: str) -> float: + if not op_info.has_attr(attr): + return 0 + value = op_info.get_attr(attr) + if not value or value == "N/A": + return 0 + return float(value) + + def get_details(self) -> list: + """ + get details of operator to be optimized + :return: detail list + """ + op_list = self._op_list + if not op_list or not (self._ITEMS + [self.STACK_INFO_ITEMS]): + return [] + details = [] + attrs = [attr for attr in (self._ITEMS + [self.STACK_INFO_ITEMS]) if op_list[0].has_attr(attr)] + details.append(attrs) + op_list = sorted(op_list, key=lambda x: float(x.get_attr("task_duration")), reverse=True) + for op_info in op_list: + content = [ + op_info.get_attr(attr) if attr != "aicore_time" + else op_info.get_float_attr(attr, strict_mode=True) + + op_info.get_float_attr("aiv_time", strict_mode=True) for attr in attrs + ] + details.append(content) + return details + + def format_suggestion_content(self, profiling_data: ProfilingDataset) -> None: + if profiling_data.PROF_TYPE == constant.ASCEND_PYTORCH_PROFILER: + self._SUGGESTION.append(self.PyTorch_OPERATOR_TUNE_SUGGESTION) + elif profiling_data.PROF_TYPE == constant.MSLITE: + self._SUGGESTION.append(self.MSLite_OPERATOR_TUNE_SUGGESTION) diff --git a/profiler/advisor/analyzer/computation/profiling_analyzer.py b/profiler/advisor/analyzer/computation/profiling_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..8682617700702055628a31982b0eafab9feb336d --- /dev/null +++ b/profiler/advisor/analyzer/computation/profiling_analyzer.py @@ -0,0 +1,89 @@ +import logging +from abc import ABC +from typing import Dict, List + +from profiler.advisor.analyzer.base_analyzer 
import BaseAnalyzer +from profiler.advisor.common import constant +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.analyzer.computation.aicpu.aicpu_checker import AicpuChecker +from profiler.advisor.analyzer.computation.bound.block_dim_checker import BlockDimChecker +from profiler.advisor.analyzer.computation.bound.operator_bound_checker import OperatorBoundChecker +from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker +from profiler.advisor.analyzer.computation.op_compile.dynamic_shape_checker import DynamicShapeChecker +from profiler.advisor.analyzer.computation.operator_checker import OperatorChecker +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset +from profiler.advisor.utils.utils import get_supported_subclass + +logger = logging.getLogger() + + +class ProfilingAnalyzer(BaseAnalyzer, ABC): + dataset_cls_list = [ProfilingDataset] + + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = OperatorChecker(self.cann_version) + self.html_render = HTMLRender() + self.result = OptimizeResult() + + @BaseAnalyzer.check_data((ProfilingDataset.get_key(),)) + def optimize(self, **kwargs) -> OptimizeResult: + """ + optimize operator + :param data: input datasets + :return: result + """ + profiling_data = self.get_first_data_by_key(self.dataset_list, ProfilingDataset.get_key()) + checker = self.checker + if not checker.pre_check(profiling_data): + return self.result + if checker.check(profiling_data): + # add record + record = checker.make_record(profiling_data) + checker.make_render(self.html_render, record) + self.result.add(record) + # add details + details = checker.get_details() + if details: + for i, detail in enumerate(details): + if i == 0: + # the first row is header + self.result.add_detail(checker.get_name(), headers=detail) + else: + self.result.add_detail(checker.get_name(), detail=detail) + # add tune op list + tune_op_list = checker.get_tune_op_list() + if tune_op_list: + self.result.add_tune_op_list(tune_op_list) + + return self.result + + def make_record(self): + pass + + def make_render(self): + pass + + +class DynamicShapeAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = DynamicShapeChecker(self.cann_version) + + +class BlockDimAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = BlockDimChecker(self.cann_version) + + +class OperatorBoundAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = OperatorBoundChecker(self.cann_version) + +class AicpuAnalyzer(ProfilingAnalyzer): + def __init__(self, collection_path, **kwargs) -> None: + super().__init__(collection_path, **kwargs) + self.checker = AicpuChecker(self.cann_version) \ No newline at end of file diff --git a/profiler/advisor/analyzer/dataloader/__init__.py b/profiler/advisor/analyzer/dataloader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/graph_fusion/__init__.py b/profiler/advisor/analyzer/graph_fusion/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py b/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..326be83b8d49088b1563ccd8c08b68a4aa3001ef --- /dev/null +++ b/profiler/advisor/analyzer/graph_fusion/graph_fusion_analyzer.py @@ -0,0 +1,49 @@ +from typing import List +from functools import partial + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.dataset.graph_dataset import GraphDataset +from profiler.advisor.analyzer.graph_fusion.graph_fusion_checker import GraphFusionRules +from profiler.advisor.dataset.profiling.profiling_dataset import ProfilingDataset +from profiler.advisor.display.html.render import HTMLRender + + +class FusionOPAnalyzer(BaseAnalyzer): + """ + fusion optimizer + """ + RULES = dict(graph_dataset=partial(GraphFusionRules, "rules/op_fusion_pass.yaml")) + dataset_cls_list = [GraphDataset, ProfilingDataset] + + def __init__(self, collection_path, **kwargs) -> None: + super(FusionOPAnalyzer, self).__init__(collection_path, **kwargs) + self.result = OptimizeResult() + self.html_render = HTMLRender() + + @BaseAnalyzer.check_data((GraphDataset.get_key(),)) + def optimize(self, **kwargs): + """ + :return: result + """ + self._check(self.dataset_list.get("GraphDataset"), self.dataset_list.get("ProfilingDataset")) + return self.result + + def _check(self, graph_data: List[GraphDataset], + profiling_data: List[ProfilingDataset] = None) -> None: + if len(graph_data) == 0 or graph_data[0].is_empty(): + return + for _, rule in self.RULES.items(): + checker = rule() + if profiling_data is None: + checker.find_fusion_matched_issues(graph_data) + else: + checker.find_fusion_matched_issues_with_times(graph_data, profiling_data) + checker.make_record(self.result) + checker.make_render(self.html_render) + + def make_record(self): + pass + + def make_render(self): + pass diff --git a/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py b/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..e64020fdfe2ace37172e82ed562db1b66971d3d6 --- /dev/null +++ b/profiler/advisor/analyzer/graph_fusion/graph_fusion_checker.py @@ -0,0 +1,207 @@ +import logging +from typing import List + +from tqdm import tqdm + +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord, StatisticsItem +from profiler.advisor.common.graph.graph import Graph +from profiler.advisor.common.graph.graph_parser import QueryGraphParser +from profiler.advisor.dataset.graph_dataset import GraphDataset +from profiler.advisor.common.graph.graph_match import find_isomorphisms + +logger = logging.getLogger() + + +class GraphFusionRules: + def __init__(self, fusion_rules: str): + self.fusion_rules = fusion_rules + self.candidates = [] + self.task_duration_list = [] + + @staticmethod + def build_query_graph(query_graphs) -> List[Graph]: + for _, query_graph in query_graphs.fusion_rules.items(): + for sub_graph in query_graph: + graph = Graph(*sub_graph) + graph.build() + yield graph + + def find_fusion_matched_issues(self, graphs: List[GraphDataset]): + query_graphs = QueryGraphParser(self.fusion_rules) + with tqdm(total=query_graphs.num_rules, leave=False, ncols=100, unit=" rules") as pbar: 
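# each fusion rule from op_fusion_pass.yaml is expanded into a query graph and matched against the latest computation graph by subgraph isomorphism; every hit is recorded as a fusion candidate.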
+ pbar.set_description(f"Searching Isomorphic Subgraph") + for query_graph in self.build_query_graph(query_graphs): + query_candidates = find_isomorphisms(query_graph.graph, graphs[0].graphs[-1].graph) + pbar.update(1) + if len(query_candidates) > 0: + self.candidates.append(query_candidates) + + def find_fusion_matched_issues_with_times(self, graphs: List[GraphDataset], profiling): + self.find_fusion_matched_issues(graphs) + if len(self.candidates) == 0 or len(profiling) == 0: + return + + if not hasattr(profiling[0], 'op_summary') or profiling[0].op_summary is None: + if hasattr(profiling[0], 'msprof'): + self.match_time_from_msprof(profiling[0].msprof) + return + else: + logger.warning("Skip analyze operator because of not containing op summary.") + return + + self.match_time_from_summary(profiling[0].op_summary) + time_duration_sum = [] + for task_duration in self.task_duration_list: + time_duration_sum.append(sum([sum(duration) for duration in task_duration])) + time_duration_index = sorted(range(len(time_duration_sum)), + key=time_duration_sum.__getitem__, + reverse=True) + self.task_duration_list = [self.task_duration_list[i] for i in time_duration_index] + self.candidates = [self.candidates[i] for i in time_duration_index] + + def match_time_from_summary(self, op_summary): + op_dict = op_summary.task_dict + for candidates in self.candidates: + candidate_duration = [] + for candidate in candidates: + duration_list = [] + for node in candidate.values(): + if node.op_name not in op_dict or op_dict[node.op_name][0].op_type.lower() != node.op_type.lower(): + logger.warning("Operator %s is missing in op summary, which will be set to 0.", node.op_name) + duration_list.append(0.0) + continue + duration_list.append(float(op_dict[node.op_name][0].task_duration)) + candidate_duration.append(duration_list) + self.task_duration_list.append(candidate_duration) + + def match_time_from_msprof(self, msprof): + op_dict = dict() + for task in msprof.tasks: + if "item_id" not in task.args: + continue + op_dict[task.args["item_id"]] = {"task_duration": task.dur} + for candidates in self.candidates: + candidate_duration = [] + for candidate in candidates: + duration_list = [] + for node in candidate.values(): + if node.op_name not in op_dict: + logger.warning("Operator %s is missing in msprof, which will be set to 0.", node.op_name) + duration_list.append(0.0) + continue + duration_list.append(float(op_dict[node.op_name].get("task_duration"))) + candidate_duration.append(duration_list) + self.task_duration_list.append(candidate_duration) + + def make_render(self, html_render): + if not self.candidates: + return + + candidates_list = [] + for case_id, nodes in enumerate(self.candidates): + candidate_dict = dict() + candidate_dict['counts'] = len(nodes) + candidate_dict['matches'] = [] + has_time_info = False + if self.task_duration_list: + has_time_info = True + candidate_dict['total_duration'] = round(sum(sum(duration) for duration in + self.task_duration_list[case_id]), 2) + for node_index, refer_node in enumerate(nodes): + match = [] + index = 0 + pass_name = ','.join(item.op_type for item in refer_node.keys()) + for query_node, host_node in refer_node.items(): + fusion_pattern = query_node.op_pass + + if 'op_pass' not in candidate_dict: + candidate_dict['op_pass'] = fusion_pattern + if 'fusion_pattern' not in candidate_dict: + candidate_dict['fusion_pattern'] = pass_name + match_attr = dict() + match_attr['op_name'] = host_node.op_name + match_attr['dtype'] = query_node.op_type + if has_time_info: + 
match_attr['duration'] = round(self.task_duration_list[case_id][node_index][index], 2) + index += 1 + match.append(match_attr) + match_attr = dict() + match_attr['op_name'] = "-" + match_attr['dtype'] = "-" + if has_time_info: + match_attr['duration'] = round(sum(self.task_duration_list[case_id][node_index]), 2) + match.append(match_attr) + candidate_dict['matches'].append(match) + candidates_list.append(candidate_dict) + html_render.render_template(key="computation", + template_dir="templates", + template_name="fusion.html", + candidates=candidates_list) + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self.candidates: + return + + optimization_item = OptimizeItem( + "fusion issue", + f"Found {len(self.candidates)} fusion issues", + ["Check fusion issues detail in att_advisor*.html"] + ) + total_time = 0.0 + for candidate in self.task_duration_list: + for duration in candidate: + total_time += sum(duration) + statistics_item = StatisticsItem(0, + total_time, + sum([len(candidate) for candidate in self.candidates]) + ) + result.add(OptimizeRecord(optimization_item, statistics_item)) + + record_title = [ + "issue_id", "graph_name", "op_name", "fusion_structure", "fusion_pattern", + "op_type", "input_shape", "input_format", + "input_dtype", "output_shape", "output_format", "output_dtype" + ] + result.add_detail('fusion issues', headers=record_title) + + for case_id, nodes in enumerate(self.candidates): + for _, refer_node in enumerate(nodes): + pass_name = ','.join(item.op_type for item in refer_node.keys()) + for query_node, host_node in refer_node.items(): + fusion_pattern = query_node.op_pass + detail = [ + case_id, + host_node.graph_name, + host_node.op_name, + pass_name, + fusion_pattern, + query_node.op_type, + self.get_attr_shape(host_node, "input", "shape"), + self.get_attr_type(host_node, "input", "format"), + self.get_attr_type(host_node, "input", "dtype"), + self.get_attr_shape(host_node, "output", "shape"), + self.get_attr_type(host_node, "output", "format"), + self.get_attr_type(host_node, "output", "dtype"), + ] + result.add_detail('fusion issues', detail=detail) + + @staticmethod + def get_attr_shape(node, type_name: str, attr_name: str) -> str: + attr_shape = [] + node_attrs = getattr(node, type_name, []) + for attrs in node_attrs: + attr = getattr(attrs, attr_name, []) + attr_shape.append(",".join(attr)) + return ";".join(attr_shape) + + @staticmethod + def get_attr_type(node, type_name: str, attr_name: str) -> str: + attr_type = [] + node_attrs = getattr(node, type_name, []) + for attr in node_attrs: + attr_type.append(getattr(attr, attr_name, "")) + return ";".join(attr_type) diff --git a/profiler/advisor/analyzer/overall/__init__.py b/profiler/advisor/analyzer/overall/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/overall/overall_analyzer.py b/profiler/advisor/analyzer/overall/overall_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..916a396b3d096dc788954cbc8e8ba9755cd15f4e --- /dev/null +++ b/profiler/advisor/analyzer/overall/overall_analyzer.py @@ -0,0 +1,45 @@ +import logging +from typing import Dict, List + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.result.result import OptimizeResult +from profiler.compare_tools.compare_backend.utils.constant import 
Constant +from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface + +logger = logging.getLogger() + + +class OverallSummaryAnalyzer(BaseAnalyzer): + + def __init__(self, profiling_path, benchmark_profiling_path=None, **kwargs): + self.benchmark_profiling_path = benchmark_profiling_path or profiling_path + self.profiling_path = profiling_path + self.html_render = HTMLRender() + self.result = OptimizeResult() + + def optimize(self, **kwargs): + compare_result = ComparisonInterface(self.benchmark_profiling_path, self.profiling_path).compare( + Constant.OVERALL_COMPARE) + + headers = compare_result.get('Model Profiling Time Distribution').get("headers", []) + rows = compare_result.get('Model Profiling Time Distribution').get("rows", []) + + self.make_record() + self.make_render(headers=headers, rows=rows) + return compare_result + + def make_record(self): + pass + + def make_render(self, **kwargs): + headers = kwargs.get("headers") + rows = kwargs.get("rows") + + if not headers or not rows: + logger.info("Empty headers or rows, skip render overall analysis html") + self.html_render.render_template(key="overall", + template_dir="templates", + template_name="overall_analysis.html", + headers=kwargs.get("headers"), + rows=kwargs.get("rows")) diff --git a/profiler/advisor/analyzer/overall/overall_summary_analyzer.py b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..c74ae0510331fb9ba8a1794bd724710ba19cfabf --- /dev/null +++ b/profiler/advisor/analyzer/overall/overall_summary_analyzer.py @@ -0,0 +1,262 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import copy + +import logging +from typing import Dict, List + +from profiler.advisor.display.html.render import HTMLRender +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.compare_tools.compare_backend.utils.constant import Constant +from profiler.advisor.common import constant as const +from profiler.compare_tools.compare_interface.comparison_interface import ComparisonInterface +from profiler.advisor.utils.utils import get_file_path_from_directory, load_parameter + + +class OverallSummaryAnalyzer(BaseAnalyzer): + OVERALL_SUMMARY_ANALYZER = "overall_summary_analysis" + advice_map = { + "Computing Time": "if you want more detailed advice please go to att_advisor_*.html", + "Uncovered Communication Time": "if you want more detailed advice please go to att_advisor_*.html", + "Free Time": "if you want more detailed advice please go to att_advisor_*.html" + } + time_name_map = { + "Computing Time": "computing", + "Uncovered Communication Time": "communication", + "Free Time": "free", + 'Cube Time(Num)': 'Cube Time', + 'Vector Time(Num)': 'Vector Time', + 'Flash Attention Time(Forward)(Num)': 'Flash Attention Time(Forward)', + 'Flash Attention Time(Backward)(Num)': 'Flash Attention Time(Backward)', + 'Other Time': "Other Computing Time", + 'SDMA Time(Num)': 'SDMA Time' + } + performance_time_dict = { + "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', + 'Flash Attention Time(Backward)(Num)', 'Other Time'], + "Uncovered Communication Time(Wait Time)": [], + "Free Time": ['SDMA Time(Num)'] + } + + def __init__(self, collection_path: str, n_processes: int = 1, **kwargs): + profile_path = get_profile_path(collection_path) + super().__init__(profile_path, n_processes, **kwargs) + self.base_collection_path = kwargs.get("base_collection_path", "") + self._has_base_collection = False + self._is_minimal_profiling = False + self.cur_data = {} + self.cur_data_table = {} + self.cur_bottleneck = {} + self.cur_advices = "" + self._headers = [] + self._base_data = [] + self._comparison_data = [] + self.html_render = HTMLRender() + self.result = OptimizeResult() + self.bottleneck_str = "" + self.bottleneck_table = {} + + @staticmethod + def split_duration_and_num(time_value: str) -> tuple: + split_data = time_value.split("s") # time value example: 0.229s(1756) + duration, num = 0.0, None + if len(split_data) >= 2: + try: + num = int(split_data[1].strip("()")) + except ValueError: + pass + if len(split_data) >= 1: + try: + duration = float(split_data[0]) + except ValueError: + print(f"[WARNING] Invalid time value: {time_value}.") + return duration, num + + @staticmethod + def calculate_ratio(dividend, divisor): + if not divisor: + return float("inf") + return dividend / divisor + + def path_check(self): + if self.base_collection_path: + if os.path.exists(self.base_collection_path): + self._has_base_collection = True + else: + print(f"[WARNING] Invalid path which not exists: {self.base_collection_path}.") + return os.path.exists(self.collection_path) + + def process(self): + base_collection_path = self.base_collection_path if self._has_base_collection else self.collection_path + result_data = ComparisonInterface(base_collection_path, self.collection_path).compare(Constant.OVERALL_COMPARE) + for data in result_data.values(): + self._headers = data.get("headers", []) + rows = data.get("rows", []) + 
if len(rows) == 2: + self._base_data = rows[0] + self._comparison_data = rows[1] + if not self._headers or not self._comparison_data: + return + self._is_minimal_profiling = 'E2E Time(Not minimal profiling)' not in self._headers + if self._has_base_collection: + self.cur_data["comparison_result"] = result_data + time_category_dict = {} + for time_category, time_list in self.performance_time_dict.items(): + time_value = self.get_time_value(time_category, self._comparison_data) + if time_value == Constant.INVALID_VALUE: + continue + duration, _ = self.split_duration_and_num(time_value) + time_category = time_category.split("(")[0] + time_category_dict[time_category] = duration + self.get_sub_category_time(time_category, time_list, duration) + self.cur_data["overall_data"] = time_category_dict + + def get_time_value(self, header_name: str, data_list: list): + try: + data_index = self._headers.index(header_name) + except ValueError: + return Constant.INVALID_VALUE + try: + time_value = data_list[data_index] + except IndexError: + return Constant.INVALID_VALUE + return time_value + + def get_sub_category_time(self, category: str, time_list: list, total_duration: float): + sub_time_dict = {} + for time_name in time_list: + time_value = self.get_time_value(time_name, self._comparison_data) + if time_value == Constant.INVALID_VALUE: + continue + sub_time_dict.setdefault(f"{category} Subtype", []).append(self.time_name_map.get(time_name, "")) + duration, num = self.split_duration_and_num(time_value) + sub_time_dict.setdefault(f"Duration(s)", []).append(duration) + sub_time_dict.setdefault(f"Duration Ratio", []).append( + "{:.2%}".format(self.calculate_ratio(duration, total_duration))) + sub_time_dict.setdefault(f"Kernel Number", []).append(num) + self.cur_data[self.time_name_map.get(category)] = sub_time_dict + + def identify_bottleneck(self): + overall_data = self.cur_data.get("overall_data") + if not overall_data: + return + e2e_time = '%.3f' % sum([data for data in overall_data.values()]) + overall_bottleneck = f"The Model E2E Time is {e2e_time}s.\n" + comparison_bottleneck = "" + for time_type, time_value in overall_data.items(): + # add subtype time bottleneck + self.cur_bottleneck[self.time_name_map.get(time_type)] = f"{time_type} is {time_value}s.\n" + # add overall bottleneck + overall_bottleneck += f" -- {time_type} is {time_value}s\n" + if time_type == "Free Time" and self._is_minimal_profiling and self.calculate_ratio(time_value, + e2e_time) > 0.1: + overall_bottleneck += "percentage of free time exceed the threshold 10%." 
+ if not self._has_base_collection: + continue + # add comparison bottleneck + time_type_origin = "Uncovered Communication Time(Wait Time)" \ + if time_type == "Uncovered Communication Time" else time_type + base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data)) + if time_value > base_duration: + ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration)) + comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n" + self.cur_bottleneck["overall_data"] = overall_bottleneck + if comparison_bottleneck: + self.cur_bottleneck["comparison_result"] = comparison_bottleneck + def optimize(self, **kwargs): + if self.path_check(): + self.process() + self.identify_bottleneck() + self.format_bottleneck() + self.format_cur_data() + self.make_record() + self.make_render() + return self.result + + def format_bottleneck(self): + result = '' + headers = [] + data_list = [] + data = [] + for key, value in self.cur_bottleneck.items(): + if not value: + continue + result += f'{key}: {value} \n' + headers.append(key) + data.append(value) + data_list.append(data) + self.bottleneck_str = result + self.bottleneck_table["headers"] = headers + self.bottleneck_table["data"] = data_list + + def format_cur_data(self): + if not self.cur_data: + return + for data_type, data in self.cur_data.items(): + if not data: + continue + if data_type not in list(self.time_name_map.values()): + data_list = list(data.values()) + else: + data_list = [','.join(map(str, value)) for value in data.values()] + headers = list(data.keys()) + data_table = {"headers": headers, "data": [data_list]} + self.cur_data_table[data_type] = copy.deepcopy(data_table) + + + def make_record(self): + """ + make record for what and how to optimize + """ + if not self.bottleneck_str and not self.cur_advices: + return + optimization_item = OptimizeItem( + OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, + self.bottleneck_str, + self.cur_advices + ) + self.result.add(OptimizeRecord(optimization_item)) + + self.result.add_detail(const.BOTTLENECK, self.bottleneck_table["headers"], self.bottleneck_table["data"][0]) + for data_type, data_dict in self.cur_data_table.items(): + if data_dict: + self.result.add_detail(const.DATA + data_type, data_dict["headers"], data_dict["data"][0]) + + def make_render(self): + if not self.bottleneck_str and not self.cur_advices: + return + result_for_html = { + "Description" : self.bottleneck_str, + "suggestion" : self.cur_advices, + "details" : [self.bottleneck_table] + } + + self.html_render.render_template(key="overall", + title=OverallSummaryAnalyzer.OVERALL_SUMMARY_ANALYZER, + template_dir="templates", + template_name="cluster_analysis.html", + cann_version=self.cann_version, + torch_version=self.torch_version, + result=result_for_html) + +def get_profile_path(collection_path): + for root, dirs, files in os.walk(collection_path): + for file in files: + if file.startswith("profiler_info"): + return root + return "" \ No newline at end of file diff --git a/profiler/advisor/analyzer/schedule/__init__.py b/profiler/advisor/analyzer/schedule/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/schedule/dispatch/__init__.py b/profiler/advisor/analyzer/schedule/dispatch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..0e62a3ff0c8eebc0cf7b5b89953b8a0842df9c9d --- /dev/null +++ b/profiler/advisor/analyzer/schedule/dispatch/timeline_op_dispatch_analyzer.py @@ -0,0 +1,107 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging + + +from profiler.advisor.common import constant as const +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.display.html.render import HTMLRender + +logger = logging.getLogger() + + +class OpDispatchAnalyzer(BaseAnalyzer): + dataset_cls_list = [TimelineEventDataset] + """ + operator dispatch optimizer + """ + + def __init__(self, collection_path, n_processes: int = 1, **kwargs) -> None: + super().__init__(collection_path, n_processes, **kwargs) + key = TimelineEventDataset.get_key() + self.dataset = self.get_first_data_by_key(self.dataset_list, key) + self.result = OptimizeResult() + self.html_render = HTMLRender() + self._op_compile = None + self._issues_record = [] + self.optimization_item = [] + + def optimize(self, **kwargs): + """ + optimize operator + :param data: input datasets + :return: result + """ + self.get_op_compile_info(self.dataset) + self.make_record(self.result) + self.make_render(self.html_render) + return self.result + + def get_op_compile_info(self, event_dataset: TimelineEventDataset): + """ + :Param event_dataset: dataset of timeline event + """ + if hasattr(event_dataset, "ops_compile"): + self._op_compile = getattr(event_dataset, "ops_compile") + if not self._op_compile or self._op_compile.total_count < const.MAX_OP_COMPILE_NUM: + return + + self._issues_record.append(['operator dispatch', + const.OP_COMPILE_ID, + self._op_compile.total_count, + self._op_compile.total_time]) + else: + logger.debug("Skip operator compile checker, because no op_compile attr find.") + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self._op_compile or len(self._issues_record) <= 0: + return + desc = f"Found {self._op_compile.total_count} operator compile issues." 
+ suggestion = (f"Please use `torch_npu.npu.set_compile_mode(jit_compile=False)` to disable jit compile " + f"in dynamic shape usage.") + self.optimization_item.append(OptimizeItem("Operator dispatch", desc, [suggestion])) + for optimization in self.optimization_item: + result.add(OptimizeRecord(optimization)) + record_title = ["Issues", "op name", "counts", "total time"] + result.add_detail('operator dispatch', headers=record_title) + for op_info in self._issues_record: + result.add_detail('operator dispatch', detail=op_info) + + def make_render(self, html_render): + issues = [] + optimizations = [] + for optimization in self.optimization_item: + optimizations.append(dict( + description=optimization.description, + suggestion=optimization.suggestion[0] + )) + for record in self._issues_record: + issues.append(dict(issue=record[0], + op_name=record[1], + counts=record[2], + total_time=record[3])) + html_render.render_template(key="schedule", + template_dir="templates", + template_name="operator_dispatch.html", + issues=issues, + optimizers=optimizations) diff --git a/profiler/advisor/analyzer/schedule/free_event/__init__.py b/profiler/advisor/analyzer/schedule/free_event/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/__init__.py b/profiler/advisor/analyzer/schedule/fusion_ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..c1eb24b8e1e11ac167a7eb9333867167a57dd524 --- /dev/null +++ b/profiler/advisor/analyzer/schedule/fusion_ops/fusion_ops_analyzer.py @@ -0,0 +1,271 @@ +import multiprocessing +import logging +import re + +from tqdm import tqdm + +from profiler.advisor.analyzer.base_analyzer import BaseAnalyzer +from profiler.advisor.common import constant as const +from profiler.advisor.common.analyzer_scopes import SupportedScopes +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.utils.utils import format_timeline_result +from profiler.advisor.common.timeline.fusion_ops_db import init_timeline_ops_db + +logger = logging.getLogger() + + +class TimelineFusionOpsAnalyzer(BaseAnalyzer): + dataset_cls_list = [TimelineEventDataset] + + def __init__(self, collection_path, n_processes: int = 1, **kwargs): + super().__init__(collection_path, n_processes, **kwargs) + self._matched_op_index = {} if self.n_processes <= 1 else multiprocessing.Manager().dict() + self.matched_op_stacks = {} + self.empty_stacks = True + key = TimelineEventDataset.get_key() + self.timeline_event_dataset = self.get_first_data_by_key(self.dataset_list, key) + + def optimize(self, **kwargs): + for mode in [const.ATEN.lower(), const.OPTIMIZER.lower()]: + + for op_combined, npu_apis in tqdm(getattr(init_timeline_ops_db(self.cann_version, self.torch_version), + f"_{mode}_op_api_map").items(), leave=False, ncols=100, + desc="Scanning timeline for affinity apis"): + for npu_api in npu_apis.split("/"): + self.find_fusion_ops(self.timeline_event_dataset, op_combined, npu_api, mode) + + 
self.query_stack(self.timeline_event_dataset) + + logger.info("Finish timeline analysis") + self.make_record() + self.make_render() + return self.result + + def find_fusion_ops(self, event_dataset, ops: str, npu_api: str, mode: str): + """ + :Param event_dataset: dataset of timeline event + :Param ops: operator combination with '-' as separator , e.g. permute-reshape + :Param npu_api: api of torch_npu, generally more efficient than torch api + :Param mode: aten or dequeue or optimizer + :Return: json of op_name and called times and detail stacks + """ + op_rule_pattern, enable_regex = self._format_rule_to_pattern(ops) + if not enable_regex: + self._match_ops(event_dataset, op_rule_pattern, npu_api, mode) + else: + try: + self._match_ops_with_regex(event_dataset, op_rule_pattern, npu_api, mode) + except Exception as e: + logger.warning("Failed to find fusion operators with regex %s, reason is %s", ops, e) + + def _match_ops(self, event_dataset, ops: str, npu_api: str, mode: str): + """ match operator based on fusion operators rule(without regex), + only strictly equals of op name list means matched + :Param event_dataset: dataset of timeline event + :Param ops: operator combination with '-' as separator , e.g. permute-reshape + :Param npu_api: api of torch_npu, generally more efficient than torch api + :Param mode: aten or dequeue or optimizer + """ + op_list = ops.split(const.OP_SEP) + + matched_op_index = set() + api_ops_matched = False + + for index, event in enumerate(getattr(event_dataset, mode)): + if self._replace_op_name_prefix(event.name, mode) != op_list[0]: + continue + tmp_dequeue_event_names = [self._replace_op_name_prefix(event.name, mode) for event in + getattr(event_dataset, mode)[index: index + len(op_list)]] + if tmp_dequeue_event_names != op_list: + continue + api_ops_matched = True + matched_op_index.add(event.dataset_index) + + if api_ops_matched: + self._matched_op_index[npu_api + f":{ops}"] = matched_op_index + + def _match_ops_with_regex(self, event_dataset, op_rule_pattern: str, npu_api: str, + mode: str): + """ match operator based on fusion operators rule(with regex), + using regex to support condition like 'a = torch.mul(xxx) if xxx else torch.add(xxx)' + :Param event_dataset: dataset of timeline event + :Param op_rule_pattern: fusion operators rule with regex definition , e.g. add-mul{0,10}, add-mul* + :Param npu_api: api of torch_npu, generally more efficient than torch api + :Param mode: aten or dequeue or optimizer + """ + matched_op_index = set() + total_op_name = "".join([f"{const.OP_SEP}{self._replace_op_name_prefix(event.name, mode)}{const.OP_SEP}" + for event in + getattr(event_dataset, mode)]) + + matched_pattern_index_tuple = [(x.start(0), x.end(0)) for x in re.finditer(op_rule_pattern, total_op_name)] + # convert list of index tuple to a whole list: [(3, 25), ...] -> [3, 25, ...] 
+ total_ops_split_points = [num for sublist in matched_pattern_index_tuple for num in sublist] + + api_ops_matched = len(total_ops_split_points) != 0 + + op_index = [] + if 0 not in total_ops_split_points: + total_ops_split_points = [0] + total_ops_split_points + if len(list(total_op_name)) not in total_ops_split_points: + total_ops_split_points.append(len(list(total_op_name))) + + # convert total ops name like "-add-mul-xxx-div-" to small pieces like [["add", "mul"], [...], ["div"]] + # by the regex index and then calculate the real index for matched fusion operators in event dataset + for l, r in zip(total_ops_split_points, total_ops_split_points[1:]): + matched_op_flag = True if (l, r) in matched_pattern_index_tuple else False + matched_ops_list = total_op_name[l: r].strip(const.OP_SEP).split(const.OP_SEP + const.OP_SEP) + op_index.append([matched_op_flag, len(matched_ops_list)]) + for i, _ in enumerate(op_index): + if i > 0: + # calculate cumsum for indexing matched operator + op_index[i][1] = op_index[i][1] + op_index[i - 1][1] + op_index = [[False, 0]] + op_index + + for i, _ in enumerate(op_index): + if not op_index[i][0]: + continue + index = op_index[i - 1][1] + matched_op_index.add(index) + + if index > len(getattr(event_dataset, mode)) - 1: + continue + dataset_index = getattr(event_dataset, mode)[index].get("dataset_index") + matched_op_index.add(dataset_index) + + if api_ops_matched: + self._matched_op_index[npu_api + f":{op_rule_pattern}"] = sorted(list(matched_op_index)) + + def make_record(self): + """ + make record for what and how to optimize + """ + if not self.matched_op_stacks: + return + + desc = f"Found {len(format_timeline_result(self.matched_op_stacks))} apis to be replaced" \ + f" based on the runtime env cann-{self.cann_version} and torch-{self.torch_version}" + suggestion = "Please replace training api according to sub table 'Affinity training api'" + if self.empty_stacks: + desc += ", but with no stack" + suggestion = const.TIMELINE_EMPTY_STACKS_PROMPT.format( + timeline_profiling_doc_url=const.TIMELINE_WITH_STACK_DOC_URL + ) + + optimization_item = OptimizeItem( + SupportedScopes.TIMELINE_FUSION_OPS, + desc, + [suggestion] + ) + + self.result.add(OptimizeRecord(optimization_item)) + + record_title = ["Affinity API", "Code stacks", "Stack called counts"] + self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, headers=record_title) + + for api_name, stacks_info in format_timeline_result(self.matched_op_stacks).items(): + if not stacks_info: + detail = [api_name, "null", "null"] + self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, detail=detail) + else: + for stack in stacks_info: + detail = [api_name, *stack] + self.result.add_detail(SupportedScopes.TIMELINE_FUSION_OPS, detail=detail) + + def make_render(self): + format_result_for_html = format_timeline_result(dict(self.matched_op_stacks), dump_html=True) + + self.html_render.render_template(key="schedule", + template_dir="templates", + template_name="affinity_api.html", + cann_version=self.cann_version, + torch_version=self.torch_version, + empty_stacks=self.empty_stacks, + with_stack_doc_url=const.TIMELINE_WITH_STACK_DOC_URL, + api_doc_url=const.TIMELINE_API_DOC_URL, + result=format_result_for_html) + + def query_stack(self, event_dataset): + if all([len(matched_index) == 0 for matched_index in self._matched_op_index.values()]): + return + + op_stack_list = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index) + for op_stack in op_stack_list: + for op_rule, stack in 
op_stack.items(): + if op_rule not in self.matched_op_stacks: + self.matched_op_stacks[op_rule] = {} + if stack == const.TIMELINE_FUSION_OPS_NO_STACK_FLAG: + continue + if stack not in self.matched_op_stacks[op_rule]: + self.matched_op_stacks[op_rule][stack] = 0 + self.matched_op_stacks[op_rule][stack] += 1 + + def _query_stack_by_matched_index(self, index, event): + stack_record = {} + event = TimelineEvent(event) + + matched_op_rules = [] + for op_rule, matched_index in self._matched_op_index.items(): + if index not in matched_index: + continue + + matched_op_rules.append(op_rule) + stack = event.args.get(const.CALL_STACKS) + + if not stack: + logger.debug("Got empty '%s' for event %s", const.CALL_STACKS, event) + continue + + if self.empty_stacks and stack: + self.empty_stacks = False + + stack_record[op_rule] = stack + + if matched_op_rules and not stack_record: + for op_rule in matched_op_rules: + stack_record[op_rule] = const.TIMELINE_FUSION_OPS_NO_STACK_FLAG + + return stack_record + + def _replace_op_name_prefix(self, event_name, mode): + if mode == const.DEQUEUE.lower(): + op_name_prefix = f"{const.DEQUEUE}{const.DEQUEUE_SEP}" + elif mode == const.ATEN: + op_name_prefix = f"{const.ATEN}{const.ATEN_SEP}" + else: + op_name_prefix = f"{const.OPTIMIZER}.{const.OPTIMIZER_STEP}{const.OPTIMIZER_SEP}" + + return event_name.replace(op_name_prefix, "") + + def _format_rule_to_pattern(self, op_rule): + """ + Args: + op_rule: like (mul){0,1}-(add|neg){0,2}-dropout-(softmax)* + + Returns: op_pattern like (-mul-){0,1}(-add-|-neg-){0,2}(-dropout-)(-softmax-)* + """ + enable_regex = False + if "(" not in op_rule and ")" not in op_rule: + # op_rule which requires fuzzy matching mush consist of "()" + return op_rule, enable_regex + + enable_regex = True + op_pattern_list = op_rule.split(const.OP_SEP) + format_op_pattern = "" + for op_pattern in op_pattern_list: + matched_res = re.search(r'\((.*?)\)', op_pattern) + + ops_index_range = (matched_res.start() + 1, matched_res.end() - 1) if matched_res else ( + 0, len(op_pattern)) + + op_names = op_pattern[ops_index_range[0]: ops_index_range[1]] + tmp_op_names_record = [] + for op_name in op_names.split("|"): + tmp_op_names_record.append(f"{const.OP_SEP}{op_name.strip(' ')}{const.OP_SEP}") + op_suffix = op_pattern[ops_index_range[1] + 1:] + op_names_format = f"({'|'.join(tmp_op_names_record)}){op_suffix}" + + format_op_pattern += op_names_format + return format_op_pattern, enable_regex diff --git a/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py b/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..f684a4892111f113f6c502a010c9e14ccd43768a --- /dev/null +++ b/profiler/advisor/analyzer/schedule/fusion_ops/timeline_api_stack_checker.py @@ -0,0 +1,163 @@ +import logging +from typing import List + +from profiler.advisor.common import constant as const +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.result.result import OptimizeResult +from profiler.advisor.result.item import OptimizeItem, OptimizeRecord +from profiler.advisor.utils.utils import get_analyze_processes, ParallelJob + +logger = logging.getLogger() + + +class OpStackFinder: + + def __init__(self): + self.n_processes = get_analyze_processes() + self._stack_record = [] + self._task_id_record = {} + self.op_name = None + self.task_type = None + 
self.matched_index = set() + + def get_api_stack_by_op(self, event_dataset: TimelineEventDataset, op_name: List[str] = None, task_type: str = None, + disable_multiprocess=False): + """ + :Param event_dataset: dataset of timeline event + :Param op_name: operator name, e.g. IndexPutV2 + :Param task_type: operator task type, optionals are AI_CPU and AI_CORE + :Param disable_multiprocess: disable multiprocessing, avoid cost time of enable new process for light task + """ + if not op_name: + op_name = [] + if not isinstance(op_name, list): + op_name = [op_name] + + self.op_name = ",".join(op_name) + self.task_type = task_type + op_name_list = event_dataset.task_op_names if not op_name else op_name + + if self.n_processes <= 1 or disable_multiprocess: + self._query_stacks_multiprocess(event_dataset, op_name_list, task_type) + else: + event_num_per_process = int(len(op_name_list) / self.n_processes) + 1 + parallel_analyzer = ParallelJob( + self._query_stacks_multiprocess, + [[event_dataset, op_name_list[i:i + event_num_per_process], task_type] + for i in range(0, len(op_name_list), event_num_per_process)], + job_name="Analyzing operator stacks from timeline" + ) + parallel_analyzer.start(self.n_processes) + self.query_stack(event_dataset) + + def make_record(self, result: OptimizeResult): + """ + make record for what and how to optimize + """ + if not self._stack_record: + return + + desc = f"Found {len(self._stack_record)} called stacks for" + if self.op_name and self.task_type: + desc += f" operators with name '{self.op_name}' with task type '{self.task_type}'" + elif self.op_name and not self.task_type: + desc += f" operators with name '{self.op_name}'" + elif self.task_type and not self.op_name: + desc += f" operators with task type '{self.task_type}'" + else: + desc += " all operators" + + suggestion = f"Please use command 'ma-advisor analyze profiling' to analyze operators" + optimization_item = OptimizeItem( + "Operator stacks", + desc, + [suggestion] + ) + result.add(OptimizeRecord(optimization_item)) + + record_title = ["Task ID", "op name", "op type", "code stacks"] + result.add_detail('operator stacks', headers=record_title) + + for op_info in self._stack_record: + result.add_detail('operator stacks', detail=op_info) + + def _get_api_stack_by_op(self, event_dataset: TimelineEventDataset, op_name: str, task_type: str): + for _, src_op_event in event_dataset.ops_with_task_type.items(): + + op_task_type = src_op_event.get(const.TASK_TYPE) + if not (src_op_event.name == op_name and op_task_type and op_task_type == task_type): + continue + + torch_to_npu_key = f"s-{src_op_event.tid}-{src_op_event.ts}" + torch_to_npu_event = event_dataset.torch_to_npu.get(torch_to_npu_key) or event_dataset.torch_to_npu.get( + f"s-{src_op_event.ts}") or event_dataset.torch_to_npu.get(f"s-{src_op_event.ts.replace('.', '')}") + + acl_to_npu_event = src_op_event.ts in event_dataset.acl_to_npu + + if not torch_to_npu_event and not acl_to_npu_event: + continue + + # query stack by torch_to_npu first, due to each operator had acl_to_npu incoming flow in cann6.3 + if torch_to_npu_event: + dst_op_index = self._query_index_by_torch_to_npu(event_dataset, torch_to_npu_event) + else: + dst_op_index = self._query_index_by_acl_to_npu(acl_to_npu_event) + + if not dst_op_index: + continue + + task_id = src_op_event.task_id + if not task_id: + continue + self.matched_index.add(dst_op_index) + if dst_op_index not in self._task_id_record: + self._task_id_record[dst_op_index] = [] + 
self._task_id_record[dst_op_index].append([task_id, op_name, task_type]) + + def _query_index_by_torch_to_npu(self, event_dataset, torch_to_npu_event): + dst_op_event_key = torch_to_npu_event.ts + dst_op_event = event_dataset.ops_with_stack.get(dst_op_event_key) + + if not dst_op_event: + return const.TIMELINE_BACKWARD_NO_STACK_CODE + + return dst_op_event.get("dataset_index") + + def _query_index_by_acl_to_npu(self, acl_to_npu_event): + if acl_to_npu_event: + return const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE + + def _query_stacks_multiprocess(self, event_dataset, op_name_list, task_type): + + for op_name in op_name_list: + if task_type is not None: + self._get_api_stack_by_op(event_dataset, op_name, task_type) + else: + self._get_api_stack_by_op(event_dataset, op_name, const.AI_CORE) + self._get_api_stack_by_op(event_dataset, op_name, const.AI_CPU) + + def _format_stack_record(self): + stack_list = [] + for task_id, stack_info in self._task_id_record.items(): + stack_list.append([task_id, *stack_info]) + return stack_list + + def _query_stack_by_matched_index(self, index, event): + if index not in self.matched_index: + return None + event = TimelineEvent(event) + stack = event.args.get(const.CALL_STACKS) + stack = stack if stack else const.NO_STACK_REASON_MAP.get(const.TIMELINE_BACKWARD_NO_STACK_CODE) + for matched_op_info in self._task_id_record.get(index, []): + self._stack_record.append([*matched_op_info, stack]) + + for matched_op_info in self._task_id_record.get(const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE, []): + self._stack_record.append([*matched_op_info, + const.NO_STACK_REASON_MAP.get(const.TIMELINE_ACL_TO_NPU_NO_STACK_CODE)]) + return None + + def query_stack(self, event_dataset: TimelineEventDataset): + if not event_dataset.dataset_len: + return + _ = event_dataset.parse_data_with_generator(self._query_stack_by_matched_index) diff --git a/profiler/advisor/cluster_perf_analysis.ipynb b/profiler/advisor/cluster_perf_analysis.ipynb index 39e389dd3a59c37564e79d10d31413c7acd4464b..7ee0b24e85467fe42205c5986095a7e66bf0a636 100644 --- a/profiler/advisor/cluster_perf_analysis.ipynb +++ b/profiler/advisor/cluster_perf_analysis.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "initial_id", "metadata": { "ExecuteTime": { @@ -12,16 +12,32 @@ }, "outputs": [], "source": [ - "from advisor_backend.interface import Interface\n", + "import sys\n", + "sys.path.append(\"../..\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c552da9d-36f9-43d3-ae1f-c54f78d3ff2d", + "metadata": {}, + "outputs": [], + "source": [ + "from profiler.advisor.interface.interface import Interface\n", "import matplotlib.pyplot as plt\n", - "import numpy as np" + "import numpy as np\n", + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill" ] }, { "cell_type": "markdown", "id": "57d17a21205c3c5e", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "# 集群调优分析\n", @@ -45,27 +61,33 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "36b7a24cc7ca5da2", "metadata": { "ExecuteTime": { "end_time": "2023-11-21T12:53:38.379699800Z", "start_time": "2023-11-21T12:53:38.363755900Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ "# EDIT THE PROFILING DATA PATH\n", - "cluster_path = \"YOUR PATH\"\n", - "interface = Interface(cluster_path)" + "cluster_path = r\"YOUR 
PROFILING PATH\"\n", + "interface = Interface(profiling_path=cluster_path)" ] }, { "cell_type": "markdown", "id": "cf832ac2e0dfa30f", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## 1) 识别慢卡" @@ -73,14 +95,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "id": "40aac93278dd6e34", "metadata": { "ExecuteTime": { "end_time": "2023-11-21T12:53:41.815599700Z", "start_time": "2023-11-21T12:53:41.783393700Z" }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -93,50 +118,212 @@ } ], "source": [ - "dataset = interface.get_data('cluster', 'slow rank')\n" + "slow_rank_result = interface.get_result(\"cluster\", \"slow_rank\")" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "cd3fceda-49f0-439f-9c54-cc31490fc99e", + "execution_count": 6, + "id": "0e943b2a-37a6-4db6-9e70-235d397f1d39", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rank_idcomputecommunicationfree
028976239.079999877586795.4199998116836641.679994211
129012279.1000001026984613.2200000257388343.859991224
229019115.323000517489956.6330000286881360.253991371
329027089.5600000777963312.2399997946389981.899993688
429044786.936999656533618.6390000177780517.1539908135
529178186.2599998537925184.4200000286286867.999995028
629025331.1899999046386639.907999927941798.704992032
729056803.3049995457234444.8260000247094608.035991492
831383314.9800002283973806.61699999968017981.379989724
931360536.362000194757458.8250000027277062.386991671
1031381891.8000004635276870.3599999986731073.659992552
1131387777.380000334727362.30000000457297578.339992355
1231374132.744999775164443.3880000046829798.933991944
1331377800.1789998044360616.2830000017624691.509991412
1431374658.3600003164457099.6200000017542724.319990785
1531387255.5270000065000860.9056975264.115991174
" + ], + "text/plain": [ + "+---------+--------------------+--------------------+--------------------+\n", + "| rank_id | compute | communication | free |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 0 | 28976239.07999987 | 7586795.419999811 | 6836641.679994211 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 1 | 29012279.100000102 | 6984613.220000025 | 7388343.859991224 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 2 | 29019115.32300051 | 7489956.633000028 | 6881360.253991371 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 3 | 29027089.560000077 | 7963312.239999794 | 6389981.899993688 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 4 | 29044786.93699965 | 6533618.639000017 | 7780517.1539908135 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 5 | 29178186.259999853 | 7925184.420000028 | 6286867.999995028 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 6 | 29025331.189999904 | 6386639.90799992 | 7941798.704992032 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 7 | 29056803.304999545 | 7234444.826000024 | 7094608.035991492 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 8 | 31383314.980000228 | 3973806.6169999996 | 8017981.379989724 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 9 | 31360536.36200019 | 4757458.825000002 | 7277062.386991671 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 10 | 31381891.800000463 | 5276870.359999998 | 6731073.659992552 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 11 | 31387777.38000033 | 4727362.3000000045 | 7297578.339992355 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 12 | 31374132.74499977 | 5164443.388000004 | 6829798.933991944 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 13 | 31377800.178999804 | 4360616.283000001 | 7624691.509991412 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 14 | 31374658.360000316 | 4457099.620000001 | 7542724.319990785 |\n", + "+---------+--------------------+--------------------+--------------------+\n", + "| 15 | 31387255.527000006 | 5000860.905 | 6975264.115991174 |\n", + "+---------+--------------------+--------------------+--------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# EDIT THE DATA TO SHOW WHAT YOU WANT\n", - "data = dataset.get('data')\n", - "words = dataset.get('bottleneck')\n", - "rank_ids = list(data.keys())\n", - "# 柱状图显示属性\n", - "compute_time = [data.get(key, {})[0] for key in rank_ids]\n", - "communication_time = [data.get(key, {})[1] for key in rank_ids]\n", - "free_time = [data.get(key, {})[2] for key in rank_ids]\n", - "# 柱宽\n", - "width = 0.2\n", - "\n" + "slow_rank_data = slow_rank_result.get(\"slow_rank_analysis\")\n", + "if slow_rank_data:\n", + " slow_rank_table = PrettyTable(slow_rank_data.get(\"headers\"))\n", + " for row in slow_rank_data.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " slow_rank_table.add_row(row)\n", + " 
slow_rank_table.hrules = ALL\n", + " display(slow_rank_table[:16])" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "6a1d82fb-a31b-49ab-a859-6d4bb898c512", - "metadata": { - "scrolled": true - }, + "execution_count": 10, + "id": "57a9b1c6-4127-47a2-8699-3c983950bd84", + "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Communication has some issues in the cluster, because the max difference of Communication time has reached 88.476ms. \n", - "Free has some issues in the cluster, because the max difference of Free time has reached 29.224ms. \n" - ] - }, { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA2wAAAK9CAYAAABYee9vAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABVtElEQVR4nO3deVhU9eLH8c+AsogCubApKqa5hVYuhG2aJJrXtNyzwqXsFphLLtcy18ykMjVN65ZbN8usNK+VRqRW7oqUmrmlaQYuEaCYQsz5/dF1fk6KgCLzRd6v55nn8ZzznXM+MxHjx+85Z2yWZVkCAAAAABjHzdUBAAAAAAAXR2EDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAwHC9e/dWzZo1XR0DAOACFDYAwDVn+/bt6tKli2rUqCEvLy9VrVpV99xzj1577TWncS+88IKWLl3qkow2m61Aj9WrV7skHwDADDbLsixXhwAAoKisW7dOrVq1UvXq1RUTE6OgoCAdPnxYGzZs0P79+7Vv3z7H2PLly6tLly6aN29esef8z3/+47S8YMECJSQk6J133nFaf88996hixYqy2+3y9PQszogAAAOUcXUAAACK0sSJE+Xn56fNmzfL39/faduxY8dcE+oiHnroIaflDRs2KCEh4YL1AIDSjVMiAQDXlP3796thw4YXlDVJCggIcPzZZrMpKytL8+fPd5x+2Lt3b8f2I0eOqG/fvgoMDJSnp6caNmyoOXPmOO1v9erVstlsWrRokZ555hkFBQXJx8dH9913nw4fPlxkr+nv17AdPHhQNptNL7/8smbOnKlatWqpXLlyatOmjQ4fPizLsjRhwgRVq1ZN3t7e6tixo9LS0i7Y7+eff6477rhDPj4+qlChgtq3b6+dO3cWWW4AwJVjhg0AcE2pUaOG1q9frx07dujGG2/Mc9w777yjRx99VM2bN1f//v0lSddff70k6ejRo7r11ltls9kUFxenKlWq6PPPP1e/fv2UmZmpQYMGOe1r4sSJstlsGjFihI4dO6apU6cqKipKycnJ8vb2vmqv9d1331V2drYGDBigtLQ0xcfHq1u3brr77ru1evVqjRgxQvv27dNrr72moUOHOhXOd955RzExMYqOjtbkyZN1+vRpzZo1S7fffru2bdvGTU4AwBQWAADXkC+++MJyd3e33N3drcjISGv48OHWypUrrezs7AvG+vj4WDExMRes79evnxUcHGydOHHCaX2PHj0sPz8/6/Tp05ZlWdaqVassSVbVqlWtzMxMx7gPPvjAkmRNmzatwLljY2OtvD6WY2JirBo1ajiWDxw4YEmyqlSpYqWnpzvWjxw50pJkNW7c2MrJyXGs79mzp+Xh4WGdOXPGsizLOnnypOXv72899thjTsdJTU21/Pz8LlgPAHAdTokEAFxT7rnnHq1fv1733XefvvvuO8XHxys6OlpVq1bVsmXL8n2+ZVn66KOP1KFDB1mWpRMnTjge0dHRysjIUFJSktNzHnnkEVWoUMGx3KVLFwUHB+uzzz4r8td3vq5du8rPz8+xHBERIemv6+PKlCnjtD47O1tHjhyRJCUkJCg9PV09e/Z0en3u7u6KiIjQqlWrrmpuAEDBcUokAOCa06xZM3388cfKzs7Wd999pyVLlujVV19Vly5dlJycrAYNGuT53OPHjys9PV1vvvmm3nzzzYuO+fvNS+rUqeO0bLPZVLt2bR08ePCKX8ulVK9e3Wn5XHkLDQ296Prff/9dkrR3715J0t13333R/fr6+hZpTgDA5aOwAQCuWR4eHmrWrJmaNWumG264QX369NHixYs1ZsyYPJ9jt9sl/TVLFRMTc9ExjRo1uip5C8vd3b1Q663/fZPPudf4zjvvKCgo6IJx58/OAQBci9/IAIBSoWnTppKklJQUxzqbzXbBuCpVqqhChQrKzc1VVFRUgfZ9bsbqHMuytG/fPmOK3d+du7lKQEBAgV8jAMA1uIYNAHBNWbVqlWMm6XznrierW7euY52Pj4/S09Odxrm7u6tz58766KOPtGPHjgv2c/z48QvWLViwQCdPnnQsf/jhh0pJSVG7du0u92VcVdHR0fL19dULL7ygnJycC7Zf7DUCAFyDGTYAwDVlwIABOn36tO6//37Vq1dP2dnZWrdunRYtWqSaNWuqT58+jrFNmjTRl19+qSlTpigkJERhYWGKiIjQiy++qFWrVikiIkKPPfaYGjRooLS0NCUlJenLL7+84DvNKlasqNtvv119+vTR0aNHNXXqVNWuXVuPPfZYcb/8AvH19dWsWbP08MMP65ZbblGPHj1UpUoVHTp0SJ9++qluu+02zZgxw9UxAQCisAEArjEvv/yyFi9erM8++0xvvvmmsrOzVb16dT355JMaNWqU0xdqT5kyRf3799eoUaP0xx9/KCYmRhEREQoMDNSmTZs0fvx4ffzxx3r99ddVqVIlNWzYUJMnT77gmM8884y+//57TZo0SSdPnlTr1q31+uuvq1y5csX4ygvnwQcfVEhIiF588UW99NJLOnv2rKpWrao77rjDqdQCAFzLZl3svBEAAJCv1atXq1WrVlq8eLG6dOni6jgAgGsQ17ABAAAAgKEobAAAAABgKAobAAAAABiKa9gAAAAAwFDMsAEAAACAoShsAAAAAGAovoetGNntdv3666+qUKGCbDabq+MAAAAAcBHLsnTy5EmFhITIzS3veTQKWzH69ddfFRoa6uoYAAAAAAxx+PBhVatWLc/tFLZiVKFCBUl//Ufx9fV1cRoAAAAArpKZmanQ0FBHR8gLha0YnTsN0tfXl8IGAAAAIN9LpbjpCAAAAAAYisIGAAAAAIaisAEAAAC
AobiGzTCWZenPP/9Ubm6uq6OglChbtqzc3d1dHQMAAAAXQWEzSHZ2tlJSUnT69GlXR0EpYrPZVK1aNZUvX97VUQAAAPA3FDZD2O12HThwQO7u7goJCZGHhwdfro2rzrIsHT9+XL/88ovq1KnDTBsAAIBhKGyGyM7Olt1uV2hoqMqVK+fqOChFqlSpooMHDyonJ4fCBgAAYBhuOmIYNzf+k6B4MZMLAABgLtoBAAAAABiKwgYAAAAAhqKwAXmoWbOmpk6d6uoYAAAAKMW46Yjhav7r02I93sEX21/W81JTUzVx4kR9+umnOnLkiAICAnTTTTdp0KBBat26dRGnLFrz5s3ToEGDlJ6e7rR+8+bN8vHxcU0oAAAAQBQ2FIGDBw/qtttuk7+/v1566SWFh4crJydHK1euVGxsrH788UdXR7wsVapUcXUEAAAAlHKcEokr9uSTT8pms2nTpk3q3LmzbrjhBjVs2FBDhgzRhg0bJEmHDh1Sx44dVb58efn6+qpbt246evSoYx9jx47VTTfdpDlz5qh69eoqX768nnzySeXm5io+Pl5BQUEKCAjQxIkTnY5ts9k0a9YstWvXTt7e3qpVq5Y+/PBDx/bVq1fLZrM5zZ4lJyfLZrPp4MGDWr16tfr06aOMjAzZbDbZbDaNHTtW0oWnRNpsNr311lu6//77Va5cOdWpU0fLli1zyrNs2TLVqVNHXl5eatWqlebPn3/B8QEAAICCorDhiqSlpWnFihWKjY296OmD/v7+stvt6tixo9LS0rRmzRolJCTop59+Uvfu3Z3G7t+/X59//rlWrFih9957T2+//bbat2+vX375RWvWrNHkyZM1atQobdy40el5zz33nDp37qzvvvtOvXr1Uo8ePbRr164C5W/RooWmTp0qX19fpaSkKCUlRUOHDs1z/Lhx49StWzd9//33uvfee9WrVy+lpaVJkg4cOKAuXbqoU6dO+u677/T444/r2WefLVAOAAAA4GIobLgi+/btk2VZqlevXp5jEhMTtX37di1cuFBNmjRRRESEFixYoDVr1mjz5s2OcXa7XXPmzFGDBg3UoUMHtWrVSrt379bUqVNVt25d9enTR3Xr1tWqVauc9t+1a1c9+uijuuGGGzRhwgQ1bdpUr732WoHye3h4yM/PTzabTUFBQQoKClL58uXzHN+7d2/17NlTtWvX1gsvvKBTp05p06ZNkqQ33nhDdevW1UsvvaS6deuqR48e6t27d4FyAAAAABdDYcMVsSwr3zG7du1SaGioQkNDHesaNGggf39/p5mwmjVrqkKFCo7lwMBANWjQwOnLxAMDA3Xs2DGn/UdGRl6wXNAZtsJq1KiR488+Pj7y9fV15Nm9e7eaNWvmNL558+ZXJQcAAABKBwobrkidOnVks9mK5MYiZcuWdVq22WwXXWe32wu8z3Nl7/ximZOTU6QZC5MHAAAAKAwKG65IxYoVFR0drZkzZyorK+uC7enp6apfv74OHz6sw4cPO9b/8MMPSk9PV4MGDa44w7kbm5y/XL9+fUn/f6fHlJQUx/bk5GSn8R4eHsrNzb3iHHXr1tWWLVuc1p1/yicAAABQWBQ2XLGZM2cqNzdXzZs310cffaS9e/dq165dmj59uiIjIxUVFaXw8HD16tVLSUlJ2rRpkx555BHdddddatq06RUff/HixZozZ4727NmjMWPGaNOmTYqLi5Mk1a5dW6GhoRo7dqz27t2rTz/9VK+88orT82vWrKlTp04pMTFRJ06c0OnTpy8rx+OPP64ff/xRI0aM0J49e/TBBx9o3rx5kv6aiQMAAAAKi+9hM9zlfpF1capVq5aSkpI0ceJEPf3000pJSVGVKlXUpEkTzZo1SzabTZ988okGDBigO++8U25ubmrbtm2BbwySn3Hjxun999/Xk08+qeDgYL333nuOmbuyZcvqvffe0xNPPKFGjRqpWbNmev7559W1a1fH81u0aKF//vOf6t69u3777TeNGTPGcWv/wggLC9OHH36op59+WtOmTVNkZKSeffZZPfHEE/L09CyS1woAAIDSxWYV5K4RKBKZmZny8/NTRkaGfH19nbadOXNGBw4cUFhYmLy8vFyUsOSx2WxasmSJOnXq5OooFzVx4kTNnj3b6XRQ0/CzBwAAUPwu1Q3OxwwbUIRef/11NWvWTJUqVdLatWv10ksvOU7PBAAAAAqLwgYUob179+r5559XWlqaqlevrqefflojR450dSwAAACUUBQ2lGimndH76quv6tVXX3V1DAAAAFwjuEskAAAAABiKGTYAAIBr0Vi/AozJuPo5AFwRZtgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ3HTEdMV5ILhIj0eFx8XlM1m05IlS9SpU6erdox58+Zp0KBBSk9Pv2rHAAAAgLmYYUORSE1N1YABA1SrVi15enoqNDRUHTp0UGJioqujXTUpKSlq165dke2vZs2amjp1qtO67t27a8+ePUV2DAAAAJQszLDhih08eFC33Xab/P399dJLLyk8PFw5OTlauXKlYmNj9eOPP7o64lURFBR01Y/h7e0tb2/vq34cAAAAmIkZNlyxJ598UjabTZs2bVLnzp11ww03qGHDhhoyZIg2bNggSTp06JA6duyo8uXLy9fXV926ddPRo0cd+xg7dqxuuukmzZkzR9WrV1f58uX15JNPKjc3V/Hx8QoKClJAQIAmTpzodGybzaY33nhD//jHP1SuXDnVr19f69ev1759+9SyZUv5+PioRYsW2r9/v+M5vXv3vuA0xkGDBqlly5aO5ZYtW+qpp57S8OHDVbFiRQUFBWns2LEXHHvp0qWO5V9++UU9e/ZUxYoV5ePjo6ZNm2rjxo2SpP3796tjx44KDAxU+fLl1axZM3355ZdOx/v55581ePBg2Ww22Ww2SX+dEunv7+903FmzZun666+Xh4eH6tatq3feeeeCXG+99Zbuv/9+lStXTnXq1NGyZcvy/g8IAAAAY1HYcEXS0tK0YsUKxcbGysfH54Lt/v7+stvt6tixo9LS0rRmzRolJCTop59+Uvfu3Z3G7t+/X59//rlWrFih9957T2+//bbat2+vX375RWvWrNHkyZM1atQoRwk6Z8KECXrkkUeUnJysevXq6cEHH9Tjjz+ukSNHasuWLbIsS3FxcYV+bfPnz5ePj482btyo+Ph4jR8/XgkJCRcde+rUKd111106cuSIli1bpu+++07Dhw+X3W53bL/33nuVmJiobdu2qW3bturQoYMOHTokSfr4449VrVo1jR8/XikpKUpJSbnocZYsWaKBAwfq6aef1o4dO/T444+rT58+WrVqldO4cePGqVu3bvr+++917733qlevXkpLSyv0ewAAAADX4pRIXJF9+/bJsizVq1cvzzGJiYnavn27Dhw4oNDQUEnSggUL1LBhQ23evFnNmjWTJNntds2ZM0cVKlRQgwYN1KpVK+3evVufffaZ3NzcVLduXU2ePFmrVq1SRESEY/99+vRRt27dJEkjRoxQZGSknnvuOUVHR0uSBg4cqD
59+hT6tTVq1EhjxoyRJNWpU0czZsxQYmKi7rnnngvGLly4UMePH9fmzZtVsWJFSVLt2rUd2xs3bqzGjRs7lidMmKAlS5Zo2bJliouLU8WKFeXu7q4KFSpc8lTLl19+Wb1799aTTz4pSY5ZzJdfflmtWrVyjOvdu7d69uwpSXrhhRc0ffp0bdq0SW3bti30+wAAAADXYYYNV8SyrHzH7Nq1S6GhoY6yJkkNGjSQv7+/du3a5VhXs2ZNVahQwbEcGBioBg0ayM3NzWndsWPHnPbfqFEjp+2SFB4e7rTuzJkzyszMLMQrc96vJAUHB19w7HOSk5N18803O8ra3506dUpDhw5V/fr15e/vr/Lly2vXrl2OGbaC2rVrl2677TandbfddpvT+/j37D4+PvL19c0zOwAAAMzFDBuuSJ06dWSz2YrkxiJly5Z1WrbZbBddd+40w4s979y1Xxdbd+55bm5uFxTNnJycAuX5+7HPye/GIEOHDlVCQoJefvll1a5dW97e3urSpYuys7Mv+bzLVZjsAAAAMBczbLgiFStWVHR0tGbOnKmsrKwLtqenp6t+/fo6fPiwDh8+7Fj/ww8/KD09XQ0aNCjOuJKkKlWqXHCNWHJy8hXts1GjRkpOTs7zOrG1a9eqd+/euv/++xUeHq6goCAdPHjQaYyHh4dyc3MveZz69etr7dq1F+zbFe8jAAAArj4KG67YzJkzlZubq+bNm+ujjz7S3r17tWvXLk2fPl2RkZGKiopSeHi4evXqpaSkJG3atEmPPPKI7rrrLjVt2rTY8959993asmWLFixYoL1792rMmDHasWPHFe2zZ8+eCgoKUqdOnbR27Vr99NNP+uijj7R+/XpJf81Efvzxx0pOTtZ3332nBx988IIZr5o1a+rrr7/WkSNHdOLEiYseZ9iwYZo3b55mzZqlvXv3asqUKfr44481dOjQK8oPAAAAM3FKpOnGZrg6Qb5q1aqlpKQkTZw4UU8//bRSUlJUpUoVNWnSRLNmzZLNZtMnn3yiAQMG6M4775Sbm5vatm2r1157zSV5o6Oj9dxzz2n48OE6c+aM+vbtq0ceeUTbt2+/7H16eHjoiy++0NNPP617771Xf/75pxo0aKCZM2dKkqZMmaK+ffuqRYsWqly5skaMGHHBNXXjx4/X448/ruuvv15nz5696PWBnTp10rRp0/Tyyy9r4MCBCgsL09y5c52+kgAAAADXDptVkLtGoEhkZmbKz89PGRkZ8vX1ddp25swZHThwQGFhYfLy8nJRQpRG/OwBwDVqrF8Bxpj/D8PAtepS3eB8nBIJAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhirj6gC4tPD54cV6vO0x24v1eAAAAADyxgwbrljv3r1ls9kueOzbt8/V0QAAAIASjRk2FIm2bdtq7ty5TuuqVKnitJydnS0PD4/ijAUAAACUaMywoUh4enoqKCjI6dG6dWvFxcVp0KBBqly5sqKjoyVJO3bsULt27VS+fHkFBgbq4Ycf1okTJxz7stvtmjRpksLCwuTt7a3GjRvrww8/dNVLAwAAAFyGwoarav78+fLw8NDatWs1e/Zspaen6+6779bNN9+sLVu2aMWKFTp69Ki6devmeM6kSZO0YMECzZ49Wzt37tTgwYP10EMPac2aNS58JQAAAEDx45RIFInly5erfPnyjuV27dpJkurUqaP4+HjH+ueff14333yzXnjhBce6OXPmKDQ0VHv27FGNGjX0wgsv6Msvv1RkZKQkqVatWvr222/1xhtv6K677iqmVwQAAAC4HoUNRaJVq1aaNWuWY9nHx0c9e/ZUkyZNnMZ99913WrVqlVO5O2f//v3KycnR6dOndc899zhty87O1s0333x1wgMAAACGorChSPj4+Kh27doXXX++U6dOqUOHDpo8efIFY4ODg7Vjxw5J0qeffqqqVas6bff09CzCxAAAAID5KGwoVrfccos++ugj1axZU2XKXPjj16BBA3l6eurQoUOc/ggAAIBSj5uOoFjFxsYqLS1NPXv21ObNm7V//36tXLlSffr0UW5uripUqKChQ4dq8ODBmj9/vvbv36+kpCS99tprmj9/vqvjAwAAAMWKGTbDbY/Z7uoIRSokJERr167ViBEj1KZNG509e1Y1atRQ27Zt5eb2178fTJgwQVWqVNGkSZP0008/yd/fX7fccoueeeYZF6cHAAAAipfNsizL1SFKi8zMTPn5+SkjI0O+vr5O286cOaMDBw4oLCxMXl5eLkqI0oifPQAogLF+BRiTcfVzFEZJzAyUIpfqBudjhg0AgJKqIH8hl/hLOYBrQyn9RwiuYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFzTDcAwbFjZ85AAAAc3HTEUOULVtWknT69Gl5e3u7OA1Kk+zsbEmSu7u7i5MAAIBiUUpv3lFSUdgM4e7uLn9/fx07dkySVK5cOdlsNhenwrXObrfr+PHjKleunMqU4dcBAACAafgbmkGCgoIkyVHagOLg5uam6tWr8w8EAAAABqKwGcRmsyk4OFgBAQHKyclxdRyUEh4eHnJz43JWAAAAE1HYDOTu7s71RAAAAAC4SyQAAAAAmIrCBgAAAACG4pRIAABQvLilOAAUGDNsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKG46QgAAABwubiJDq4yChsAAADMQPkBLsApkQAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIZyaWGbNGmSmjVrpgoVKiggIECdOnXS7t27ncacOXNGsbGxqlSpksqXL6/OnTvr6NGjTmMOHTqk9u3bq1y5cgoICNCwYcP0559/Oo1ZvXq1brnlFnl6eqp27dqaN2/eBXlmzpypmjVrysvLSxEREdq0aVOhswAAAABAUXFpYVuzZo1iY2O1YcMGJSQkKCcnR23atFFWVpZjzODBg/Xf//5Xixcv1po1a/Trr7/qgQcecGzPzc1V+/btlZ2drXXr1mn+/PmaN2+eRo8e7Rhz4MABtW/fXq1atVJycrIGDRqkRx99VCtXrnSMWbRokYYMGaIxY8YoKSlJjRs3VnR0tI4dO1bgLAAAAABQlGyWZVmuDnHO8ePHFRAQoDVr1ujOO+9URkaGqlSpooULF6pLly6SpB9//FH169fX+vXrdeutt+rzzz/XP/7xD/36668KDAyUJM2ePVsjRozQ8ePH5eHhoREjRujTTz/Vjh07HMfq0aOH0tPTtWLFCklSRESEmjVrphkzZkiS7Ha7QkNDNWDAAP3rX/8qUJb8ZGZmys/PTxkZGfL19S3S9w4AUAqN9SvguIyrm6OwCpKbz
FeOzMWDzMWnpObOQ0G7gVHXsGVk/PUGV6xYUZK0detW5eTkKCoqyjGmXr16ql69utavXy9JWr9+vcLDwx1lTZKio6OVmZmpnTt3Osacv49zY87tIzs7W1u3bnUa4+bmpqioKMeYgmT5u7NnzyozM9PpAQAAAAAFZUxhs9vtGjRokG677TbdeOONkqTU1FR5eHjI39/faWxgYKBSU1MdY84va+e2n9t2qTGZmZn6448/dOLECeXm5l50zPn7yC/L302aNEl+fn6OR2hoaAHfDQAAAAAwqLDFxsZqx44dev/9910dpciMHDlSGRkZjsfhw4ddHQkAAABACVLG1QEkKS4uTsuXL9fXX3+tatWqOdYHBQUpOztb6enpTjNbR48eVVBQkGPM3+/meO7OjeeP+fvdHI8ePSpfX195e3vL3d1d7u7uFx1z/j7yy/J3np6e8vT0LMQ7AQAAAAD/z6UzbJZlKS4uTkuWLNFXX32lsLAwp+1NmjRR2bJllZiY6Fi3e/duHTp0SJGRkZKkyMhIbd++3elujgkJCfL19VWDBg0cY87fx7kx5/bh4eGhJk2aOI2x2+1KTEx0jClIFgAAAAAoSi6dYYuNjdXChQv1ySefqEKFCo5rwfz8/OTt7S0/Pz/169dPQ4YMUcWKFeXr66sBAwYoMjLScVfGNm3aqEGDBnr44YcVHx+v1NRUjRo1SrGxsY7ZrX/+85+aMWOGhg8frr59++qrr77SBx98oE8//dSRZciQIYqJiVHTpk3VvHlzTZ06VVlZWerTp48jU35ZAAAAAKAoubSwzZo1S5LUsmVLp/Vz585V7969JUmvvvqq3Nzc1LlzZ509e1bR0dF6/fXXHWPd3d21fPlyPfHEE4qMjJSPj49iYmI0fvx4x5iwsDB9+umnGjx4sKZNm6Zq1arprbfeUnR0tGNM9+7ddfz4cY0ePVqpqam66aabtGLFCqcbkeSXBQAAAACKkksLW0G+As7Ly0szZ87UzJkz8xxTo0YNffbZZ5fcT8uWLbVt27ZLjomLi1NcXNwVZQEAAACAomLMXSIBAAAAAM4obAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAocq4OgAAAEYY61eAMRlXPwcAAOdhhg0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQfA8bYIKCfP+TxHdAAQAAlDLMsAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACG4qYjAC5fQW6Wwo1SAAAALhszbAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAolxa2r7/+Wh06dFBISIhsNpuWLl3qtL13796y2WxOj7Zt2zqNSUtLU69eveTr6yt/f3/169dPp06dchrz/fff64477pCXl5dCQ0MVHx9/QZbFixerXr168vLyUnh4uD777DOn7ZZlafTo0QoODpa3t7eioqK0d+/eonkjAAAAAOAiyrjy4FlZWWrcuLH69u2rBx544KJj2rZtq7lz5zqWPT09nbb36tVLKSkpSkhIUE5Ojvr06aP+/ftr4cKFkqTMzEy1adNGUVFRmj17trZv366+ffvK399f/fv3lyStW7dOPXv21KRJk/SPf/xDCxcuVKdOnZSUlKQbb7xRkhQfH6/p06dr/vz5CgsL03PPPafo6Gj98MMP8vLyuhpvDy7XWL8CjMm4+jkAAACAK+TSwtauXTu1a9fukmM8PT0VFBR00W27du3SihUrtHnzZjVt2lSS9Nprr+nee+/Vyy+/rJCQEL377rvKzs7WnDlz5OHhoYYNGyo5OVlTpkxxFLZp06apbdu2GjZsmCRpwoQJSkhI0IwZMzR79mxZlqWpU6dq1KhR6tixoyRpwYIFCgwM1NKlS9WjR4+ieksAAAAAwMH4a9hWr16tgIAA1a1bV0888YR+++03x7b169fL39/fUdYkKSoqSm5ubtq4caNjzJ133ikPDw/HmOjoaO3evVu///67Y0xUVJTTcaOjo7V+/XpJ0oEDB5Samuo0xs/PTxEREY4xF3P27FllZmY6PQAAAACgoIwubG3bttWCBQuUmJioyZMna82aNWrXrp1yc3MlSampqQoICHB6TpkyZVSxYkWlpqY6xgQGBjqNObec35jzt5//vIuNuZhJkybJz8/P8QgNDS3U6wcAAABQurn0lMj8nH+qYXh4uBo1aqTrr79eq1evVuvWrV2YrGBGjhypIUOGOJYzMzMpbQAAAAAKzOgZtr+rVauWKleurH379kmSgoKCdOzYMacxf/75p9LS0hzXvQUFBeno0aNOY84t5zfm/O3nP+9iYy7G09NTvr6+Tg8AAAAAKKgSVdh++eUX/fbbbwoODpYkRUZGKj09XVu3bnWM+eqrr2S32xUREeEY8/XXXysnJ8cxJiEhQXXr1tV1113nGJOYmOh0rISEBEVGRkqSwsLCFBQU5DQmMzNTGzdudIwBAAAAgKLm0sJ26tQpJScnKzk5WdJfN/dITk7WoUOHdOrUKQ0bNkwbNmzQwYMHlZiYqI4dO6p27dqKjo6WJNWvX19t27bVY489
pk2bNmnt2rWKi4tTjx49FBISIkl68MEH5eHhoX79+mnnzp1atGiRpk2b5nSq4sCBA7VixQq98sor+vHHHzV27Fht2bJFcXFxkiSbzaZBgwbp+eef17Jly7R9+3Y98sgjCgkJUadOnYr1PQMAAABQerj0GrYtW7aoVatWjuVzJSomJkazZs3S999/r/nz5ys9PV0hISFq06aNJkyY4PRdbO+++67i4uLUunVrubm5qXPnzpo+fbpju5+fn7744gvFxsaqSZMmqly5skaPHu24pb8ktWjRQgsXLtSoUaP0zDPPqE6dOlq6dKnjO9gkafjw4crKylL//v2Vnp6u22+/XStWrOA72AAAAABcNS4tbC1btpRlWXluX7lyZb77qFixouNLsvPSqFEjffPNN5cc07VrV3Xt2jXP7TabTePHj9f48ePzzQQAAAAARaFEXcMGAAAAAKWJ0bf1hwHG+hVgTMbVzwEAAACUQsywAQAAAIChmGEDULowawwAAEoQZtgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ3HTEQBA0ePmLgAAFAlm2AAAAADAUMywAYDpmK0CAKDUYoYNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ5Up7BMOHDigb775Rj///LNOnz6tKlWq6Oabb1ZkZKS8vLyuRkYAAAAAKJUKXNjeffddTZs2TVu2bFFgYKBCQkLk7e2ttLQ07d+/X15eXurVq5dGjBihGjVqXM3MAAAAAFAqFKiw3XzzzfLw8FDv3r310UcfKTQ01Gn72bNntX79er3//vtq2rSpXn/9dXXt2vWqBAYAAACA0qJAhe3FF19UdHR0nts9PT3VsmVLtWzZUhMnTtTBgweLKh8AAAAAlFoFKmyXKmt/V6lSJVWqVOmyAwEAAAAA/lLou0QmJSVp+/btjuVPPvlEnTp10jPPPKPs7OwiDQcAAAAApVmhC9vjjz+uPXv2SJJ++ukn9ejRQ+XKldPixYs1fPjwIg8IAAAAAKVVoQvbnj17dNNNN0mSFi9erDvvvFMLFy7UvHnz9NFHHxV1PgAAAAAotQpd2CzLkt1ulyR9+eWXuvfeeyVJoaGhOnHiRNGmAwAAAIBSrNCFrWnTpnr++ef1zjvvaM2aNWrfvr2kv75QOzAwsMgDAgAAAEBpVejCNnXqVCUlJSkuLk7PPvusateuLUn68MMP1aJFiyIPCAAAAAClVYFu63++Ro0aOd0l8pyXXnpJ7u7uRRIKAAAAAHAZhS0vXl5eRbUrAAAAAIAuo7C5ubnJZrPluT03N/eKAgEAAAAA/lLowrZkyRKn5ZycHG3btk3z58/XuHHjiiwYAAAAAJR2hS5sHTt2vGBdly5d1LBhQy1atEj9+vUrkmAAAAAAUNoV+i6Rebn11luVmJhYVLsDAAAAgFKvSArbH3/8oenTp6tq1apFsTsAAAAAgC7jlMjrrrvO6aYjlmXp5MmTKleunP7zn/8UaTgAAAAAKM0KXdimTp3qtOzm5qYqVaooIiJC1113XVHlAgAAAIBSr9CFLSYm5mrkAAAAAAD8TYGuYTt06FChdnrkyJHLCgMAAAAA+H8FKmzNmjXT448/rs2bN+c5JiMjQ//+979144036qOPPiqygAAAAABQWhXolMgffvhBEydO1D333CMvLy81adJEISEh8vLy0u+//64ffvhBO3fu1C233KL4+Hjde++9Vzs3AAAAAFzzCjTDVqlSJU2ZMkUpKSmaMWOG6tSpoxMnTmjv3r2SpF69emnr1q1av349ZQ0AAAAAikihbjri7e2tLl26qEuXLlcrDwAAAADgfy77i7P37dunlStX6o8//pD01/exAQAAAACKTqEL22+//abWrVvrhhtu0L333quUlBRJUr9+/fT0008XeUAAAAAAKK0KXdgGDx6ssmXL6tChQypXrpxjfffu3bVixYoiDQcAAAAApVmhvzj7iy++0MqVK1WtWjWn9XXq1NHPP/9cZMEAAAAAoLQr9AxbVlaW08zaOWlpafL09CySUAAAAACAyyhsd9xxhxYsWOBYttlsstvtio+PV6tWrYo0HAAAAACUZoU+JTI+Pl6tW7fWli1blJ2dreHDh2vnzp1KS0vT2rVrr0ZGAAAAACiVCj3DduONN2rPnj26/fbb1bFjR2VlZemBBx7Qtm3bdP3111+NjAAAAABQKhV6hk2S/Pz89OyzzxZ1FgAAAADAeS6rsJ05c0bff/+9jh07Jrvd7rTtvvvuK5JgAAAAAFDaFbqwrVixQo888ohOnDhxwTabzabc3NwiCQYAAAAApV2hr2EbMGCAunbtqpSUFNntdqcHZQ0AAAAAik6hC9vRo0c1ZMgQBQYGXo08AAAAAID/KXRh69Kli1avXn0VogAAAAAAzlfoa9hmzJihrl276ptvvlF4eLjKli3rtP2pp54qsnAAAAAAUJoVurC99957+uKLL+Tl5aXVq1fLZrM5ttlsNgobAAAAABSRQhe2Z599VuPGjdO//vUvubkV+oxKAAAAAEABFbpxZWdnq3v37pQ1AAAAALjKCt26YmJitGjRoquRBQAAAABwnkKfEpmbm6v4+HitXLlSjRo1uuCmI1OmTCmycAAAAABQmhW6sG3fvl0333yzJGnHjh1O286/AQkAAAAA4MoUurCtWrXqauQAAAAAAPwNdw4BAAAAAEMVaIbtgQce0Lx58+Tr66sHHnjgkmM//vjjIgkGAAAAAKVdgQqbn5+f4/o0Pz+/qxoIAAAAAPCXAhW2uXPnavz48Ro6dKjmzp17tTMBAAAAAFSIa9jGjRunU6dOXc0sAAAAAIDzFLiwWZZ1NXMAAAAAAP6mUHeJ5HvWAAAAAKD4FOp72G644YZ8S1taWtoVBQIAAAAA/KVQhW3cuHHcJRIAAAAAikmhCluPHj0UEBBwtbIAAAAAAM5T4GvYuH4NAAAAAIoXd4kEAAAAAEMV+JRIu91+NXMAAAAAAP6mULf1BwAAAAAUHwobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYyqWF7euvv1aHDh0UEhIim82mpUuXOm23LEujR49WcHCwvL29FRUVpb179zqNSUtLU69eveTr6yt/f3/169dPp06dchrz/fff64477pCXl5dCQ0MVHx9/QZbFixerXr168vLyUnh4uD777LN
CZwEAAACAouTSwpaVlaXGjRtr5syZF90eHx+v6dOna/bs2dq4caN8fHwUHR2tM2fOOMb06tVLO3fuVEJCgpYvX66vv/5a/fv3d2zPzMxUmzZtVKNGDW3dulUvvfSSxo4dqzfffNMxZt26derZs6f69eunbdu2qVOnTurUqZN27NhRqCwAAAAAUJTKuPLg7dq1U7t27S66zbIsTZ06VaNGjVLHjh0lSQsWLFBgYKCWLl2qHj16aNeuXVqxYoU2b96spk2bSpJee+013XvvvXr55ZcVEhKid999V9nZ2ZozZ448PDzUsGFDJScna8qUKY5iN23aNLVt21bDhg2TJE2YMEEJCQmaMWOGZs+eXaAsAAAAAFDUjL2G7cCBA0pNTVVUVJRjnZ+fnyIiIrR+/XpJ0vr16+Xv7+8oa5IUFRUlNzc3bdy40THmzjvvlIeHh2NMdHS0du/erd9//90x5vzjnBtz7jgFyXIxZ8+eVWZmptMDAAAAAArK2MKWmpoqSQoMDHRaHxgY6NiWmpqqgIAAp+1lypRRxYoVncZcbB/nHyOvMedvzy/LxUyaNEl+fn6OR2hoaD6vGgAAAAD+n7GF7VowcuRIZWRkOB6HDx92dSQAAAAAJYixhS0oKEiSdPToUaf1R48edWwLCgrSsWPHnLb/+eefSktLcxpzsX2cf4y8xpy/Pb8sF+Pp6SlfX1+nBwAAAAAUlLGFLSwsTEFBQUpMTHSsy8zM1MaNGxUZGSlJioyMVHp6urZu3eoY89VXX8lutysiIsIx5uuvv1ZOTo5jTEJCgurWravrrrvOMeb845wbc+44BckCAAAAAEXNpYXt1KlTSk5OVnJysqS/bu6RnJysQ4cOyWazadCgQXr++ee1bNkybd++XY888ohCQkLUqVMnSVL9+vXVtm1bPfbYY9q0aZPWrl2ruLg49ejRQyEhIZKkBx98UB4eHurXr5927typRYsWadq0aRoyZIgjx8CBA7VixQq98sor+vHHHzV27Fht2bJFcXFxklSgLAAAAABQ1Fx6W/8tW7aoVatWjuVzJSomJkbz5s3T8OHDlZWVpf79+ys9PV233367VqxYIS8vL8dz3n33XcXFxal169Zyc3NT586dNX36dMd2Pz8/ffHFF4qNjVWTJk1UuXJljR492um72lq0aKGFCxdq1KhReuaZZ1SnTh0tXbpUN954o2NMQbIAAAAAQFFyaWFr2bKlLMvKc7vNZtP48eM1fvz4PMdUrFhRCxcuvORxGjVqpG+++eaSY7p27aquXbteURYAAAAAKErGXsMGAAAAAKUdhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAEOVcXUAAAAA4FoWPj883zHbY7YXQxKURMywAQAAAIChKGwAAAAAYChOiQQAAECJwemFKG2YYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUNzWHwAAAICTgnx9gsRXKBQHZtgAAAAAwFAUNgAAAAAwFKdEAgAA4xTkdCxOxQJQGjDDBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIYq4+oAAAAA14Lw+eH5jtkes70YkgC4ljDDBgAAAACGYoYNAACglGJWEDAfM2wAAAAAYCgKGwAAAAAYilMiAQAAAFwTrsXTfJlhAwAAAABDUdgAAAAAwFAUNgAAAAAwFNewAQBgoJr/+jTfMQe9iiEIjMTPB/LCz8a1h8IGAACKTEn8y2JJzFwSlcT3uSRmLql4r/PGKZEAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAo7hIJAMA1Lnx+eL5jtsdsL4YkAIDCYoYNAAAAAAxFYQMAAAAAQ3FKJHCV8UWQAAAAuFwUNgAXRdEEAABwPQpbKVYS/0JeEjOj+JTEnw8yF4+SmBkAAIlr2AAAAADAWBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ/HF2QBwDQifH57vmO0x24shCQAAKErMsAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACG4qYjuGLc7AAAAAC4OphhAwAAAABDMcMGAH/DrDEAADAFM2wAAAAAYCgKGwAAAAAYilMiUSpxyhuAy8HvDgBAcWOGDQAAAAAMRWEDAAAAAENR2AAAAADAUFzDBuCq4pofAACAy8cMGwAAAAAYihk2AIBLMPsKAED+mGEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADMVNR4AShJs0AAAAlC7MsAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwAQAAAIChKGwAAAAAYCgKGwAAAAAYisIGAAAAAIaisAEAAACAoShsAAAAAGAoChsAAAAAGIrCBgAAAACGorABAAAAgKGMLmxjx46VzWZzetSrV8+x/cyZM4qNjVWlSpVUvnx5de7cWUePHnXax6FDh9S+fXuVK1dOAQEBGjZsmP7880+nMatXr9Ytt9wiT09P1a5dW/Pmzbsgy8yZM1WzZk15eXkpIiJCmzZtuiqvGQAAAADOMbqwSVLDhg2VkpLieHz77beObYMHD9Z///tfLV68WGvWrNGvv/6qBx54wLE9NzdX7du3V3Z2ttatW6f58+dr3rx5Gj16tGPMgQMH1L59e7Vq1UrJyckaNGiQHn30Ua1cudIxZtGiRRoyZIjGjBmjpKQkNW7cWNHR0Tp27FjxvAkAAAAASiXjC1uZMmUUFBTkeFSuXFmSlJGRobfffltTpkzR3XffrSZNmmju3Llat26dNmzYIEn64osv9MMPP+g///mPbrrpJrVr104TJkzQzJkzlZ2dLUmaPXu2wsLC9Morr6h+/fqKi4tTly5d9OqrrzoyTJkyRY899pj69OmjBg0aaPbs2SpXrpzmzJlT/G8IAAAAgFLD+MK2d+9ehYSEqFatWurVq5cOHTokSdq6datycnIUFRXlGFuvXj1Vr15d69evlyStX79e4eHhCgwMdIyJjo5WZmamdu7c6Rhz/j7OjTm3j+zsbG3dutVpjJubm6Kiohxj8nL27FllZmY6PQAAAACgoIwubBEREZo3b55WrFihWbNm6cCBA7rjjjt08uRJpaamysPDQ/7+/k7PCQwMVGpqqiQpNTXVqayd235u26XGZGZm6o8//tCJEyeUm5t70THn9pGXSZMmyc/Pz/EIDQ0t9HsAAAAAoPQq4+oAl9KuXT
vHnxs1aqSIiAjVqFFDH3zwgby9vV2YrGBGjhypIUOGOJYzMzMpbQAAAAAKzOgZtr/z9/fXDTfcoH379ikoKEjZ2dlKT093GnP06FEFBQVJkoKCgi64a+S55fzG+Pr6ytvbW5UrV5a7u/tFx5zbR148PT3l6+vr9AAAAACAgipRhe3UqVPav3+/goOD1aRJE5UtW1aJiYmO7bt379ahQ4cUGRkpSYqMjNT27dud7uaYkJAgX19fNWjQwDHm/H2cG3NuHx4eHmrSpInTGLvdrsTERMcYAAAAALgajC5sQ4cO1Zo1a3Tw4EGtW7dO999/v9zd3dWzZ0/5+fmpX79+GjJkiFatWqWtW7eqT58+ioyM1K233ipJatOmjRo0aKCHH35Y3333nVauXKlRo0YpNjZWnp6ekqR//vOf+umnnzR8+HD9+OOPev311/XBBx9o8ODBjhxDhgzRv//9b82fP1+7du3SE088oaysLPXp08cl7wsAAACA0sHoa9h++eUX9ezZU7/99puqVKmi22+/XRs2bFCVKlUkSa+++qrc3NzUuXNnnT17VtHR0Xr99dcdz3d3d9fy5cv1xBNPKDIyUj4+PoqJidH48eMdY8LCwvTpp59q8ODBmjZtmqpVq6a33npL0dHRjjHdu3fX8ePHNXr0aKWmpuqmm27SihUrLrgRCQAAAAAUJaML2/vvv3/J7V5eXpo5c6ZmzpyZ55gaNWros88+u+R+WrZsqW3btl1yTFxcnOLi4i45BgAAAACKktGnRAIAAABAaUZhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFrZBmzpypmjVrysvLSxEREdq0aZOrIwEAAAC4RlHYCmHRokUaMmSIxowZo6SkJDVu3FjR0dE6duyYq6MBAAAAuAZR2AphypQpeuyxx9SnTx81aNBAs2fPVrly5TRnzhxXRwMAAABwDSrj6gAlRXZ2trZu3aqRI0c61rm5uSkqKkrr16+/6HPOnj2rs2fPOpYzMjIkSZmZmVc3bAHZz57Od0ymzcp3TO4fufnvp4hec2nOLJXM3GS+NDLnsx8yX/pY/O4oUKb8kDmf/ZD50scqxZmlkpnblL+Ln8thWZd+XTYrvxGQJP3666+qWrWq1q1bp8jISMf64cOHa82aNdq4ceMFzxk7dqzGjRtXnDEBAAAAlCCHDx9WtWrV8tzODNtVNHLkSA0ZMsSxbLfblZaWpkqVKslms7kw2eXLzMxUaGioDh8+LF9fX1fHKRAyFw8yF4+SmFkqmbnJXDzIXDxKYmapZOYmc/EoiZn/zrIsnTx5UiEhIZccR2EroMqVK8vd3V1Hjx51Wn/06FEFBQVd9Dmenp7y9PR0Wufv73+1IhYrX1/fEvc/B5mLB5mLR0nMLJXM3GQuHmQuHiUxs1Qyc5O5eJTEzOfz8/PLdww3HSkgDw8PNWnSRImJiY51drtdiYmJTqdIAgAAAEBRYYatEIYMGaKYmBg1bdpUzZs319SpU5WVlaU+ffq4OhoAAACAaxCFrRC6d++u48ePa/To0UpNTdVNN92kFStWKDAw0NXRio2np6fGjBlzwameJiNz8SBz8SiJmaWSmZvMxYPMxaMkZpZKZm4yF4+SmPlycZdIAAAAADAU17ABAAAAgKEobAAAAABgKAobAAAAABiKwgYAAAAAhqKwocBmzpypmjVrysvLSxEREdq0aZOrI13S119/rQ4dOigkJEQ2m01Lly51daR8TZo0Sc2aNVOFChUUEBCgTp06affu3a6OdUmzZs1So0aNHF9cGRkZqc8//9zVsQrlxRdflM1m06BBg1wdJU9jx46VzWZzetSrV8/VsfJ15MgRPfTQQ6pUqZK8vb0VHh6uLVu2uDrWJdWsWfOC99pmsyk2NtbV0fKUm5ur5557TmFhYfL29tb111+vCRMmyPT7ip08eVKDBg1SjRo15O3trRYtWmjz5s2ujuWQ3+eIZVkaPXq0goOD5e3traioKO3du9c1Yf8nv8wff/yx2rRpo0qVKslmsyk5OdklOc93qcw5OTkaMWKEwsPD5ePjo5CQED3yyCP69ddfXRf4f/J7r8eOHat69erJx8dH1113naKiorRx40bXhP2fwvzd6J///KdsNpumTp1abPkuJr/MvXv3vuD3ddu2bV0T9iqhsKFAFi1apCFDhmjMmDFKSkpS48aNFR0drWPHjrk6Wp6ysrLUuHFjzZw509VRCmzNmjWKjY3Vhg0blJCQoJycHLVp00ZZWVmujpanatWq6cUXX9TWrVu1ZcsW3X333erYsaN27tzp6mgFsnnzZr3xxhtq1KiRq6Pkq2HDhkpJSXE8vv32W1dHuqTff/9dt912m8qWLavPP/9cP/zwg1555RVdd911ro52SZs3b3Z6nxMSE
iRJXbt2dXGyvE2ePFmzZs3SjBkztGvXLk2ePFnx8fF67bXXXB3tkh599FElJCTonXfe0fbt29WmTRtFRUXpyJEjro4mKf/Pkfj4eE2fPl2zZ8/Wxo0b5ePjo+joaJ05c6aYk/6//DJnZWXp9ttv1+TJk4s5Wd4ulfn06dNKSkrSc889p6SkJH388cfavXu37rvvPhckdZbfe33DDTdoxowZ2r59u7799lvVrFlTbdq00fHjx4s56f8r6N+NlixZog0bNigkJKSYkuWtIJnbtm3r9Hv7vffeK8aExcACCqB58+ZWbGysYzk3N9cKCQmxJk2a5MJUBSfJWrJkiatjFNqxY8csSdaaNWtcHaVQrrvuOuutt95ydYx8nTx50qpTp46VkJBg3XXXXdbAgQNdHSlPY8aMsRo3buzqGIUyYsQI6/bbb3d1jCs2cOBA6/rrr7fsdruro+Spffv2Vt++fZ3WPfDAA1avXr1clCh/p0+fttzd3a3ly5c7rb/lllusZ5991kWp8vb3zxG73W4FBQVZL730kmNdenq65enpab333nsuSHihS332HThwwJJkbdu2rVgz5acgn9ebNm2yJFk///xz8YQqgILkzsjIsCRZX375ZfGEykdemX/55ReratWq1o4dO6waNWpYr776arFny8vFMsfExFgdO3Z0SZ7iwgwb8pWdna2tW7cqKirKsc7NzU1RUVFav369C5Nd+zIyMiRJFStWdHGSgsnNzdX777+vrKwsRUZGujpOvmJjY9W+fXunn22T7d27VyEhIapVq5Z69eqlQ4cOuTrSJS1btkxNmzZV165dFRAQoJtvvln//ve/XR2rULKzs/Wf//xHffv2lc1mc3WcPLVo0UKJiYnas2ePJOm7777Tt99+q3bt2rk4Wd7+/PNP5ebmysvLy2m9t7e38bPHknTgwAGlpqY6/f7w8/NTREQEn41XWUZGhmw2m/z9/V0dpcCys7P15ptvys/PT40bN3Z1nDzZ7XY9/PDDGjZsmBo2bOjqOAW2evVqBQQEqG7dunriiSf022+/uTpSkSrj6gAw34kTJ5Sbm6vAwECn9YGBgfrxxx9dlOraZ7fbNWjQIN1222268cYbXR3nkrZv367IyEidOXNG5cuX15IlS9SgQQNXx7qk999/X0lJSUZdL3MpERERmjdvnurWrauUlBSNGzdOd9xxh3bs2KEKFSq4Ot5F/fTTT5o1a5aGDBmiZ555Rps3b9ZTTz0lDw8PxcTEuDpegSxdulTp6enq3bu3q6Nc0r/+9S9lZmaqXr16cnd3V25uriZOnKhevXq5OlqeKlSooMjISE2YMEH169dXYGCg3nvvPa1fv161a9d2dbx8paamStJFPxvPbUPRO3PmjEaMGKGePXvK19fX1XHytXz5cvXo0UOnT59WcHCwEhISVLlyZVfHytPkyZNVpkwZPfXUU66OUmBt27bVAw88oLCwMO3fv1/PPPOM2rVrp/Xr18vd3d3V8YoEhQ0wVGxsrHbs2FEi/qW5bt26Sk5OVkZGhj788EPFxMRozZo1xpa2w4cPa+DAgUpISLjgX/dNdf5MSaNGjRQREaEaNWrogw8+UL9+/VyYLG92u11NmzbVCy+8IEm6+eabtWPHDs2ePbvEFLa3335b7dq1M+I6jkv54IMP9O6772rhwoVq2LChkpOTNWjQIIWEhBj9Xr/zzjvq27evqlatKnd3d91yyy3q2bOntm7d6upoMFBOTo66desmy7I0a9YsV8cpkFatWik5OVknTpzQv//9b3Xr1k0bN25UQECAq6NdYOvWrZo2bZqSkpKMPqPg73r06OH4c3h4uBo1aqTrr79eq1evVuvWrV2YrOhwSiTyVblyZbm7u+vo0aNO648ePaqgoCAXpbq2xcXFafny5Vq1apWqVavm6jj58vDwUO3atdWkSRNNmjRJjRs31rRp01wdK09bt27VsWPHdMstt6hMmTIqU6aM1qxZo+nTp6tMmTLKzc11dcR8+fv764YbbtC+fftcHSVPwcHBF5T2+vXrG38q5zk///yzvvzySz366KOujpKvYcOG6V//+pd69Oih8PBwPfzwwxo8eLAmTZrk6miXdP3112vNmjU6deqUDh8+rE2bNiknJ0e1atVydbR8nfv847OxeJwraz///LMSEhJKxOyaJPn4+Kh27dq69dZb9fbbb6tMmTJ6++23XR3ror755hsdO3ZM1atXd3w2/vzzz3r66adVs2ZNV8crsFq1aqly5cpGfz4WFoUN+fLw8FCTJk2UmJjoWGe325WYmFgirlMqSSzLUlxcnJYsWaKvvvpKYWFhro50Wex2u86ePevqGHlq3bq1tm/fruTkZMejadOm6tWrl5KTk0vEKRSnTp3S/v37FRwc7Oooebrtttsu+FqKPXv2qEaNGi5KVDhz585VQECA2rdv7+oo+Tp9+rTc3Jw/0t3d3WW3212UqHB8fHwUHBys33//XStXrlTHjh1dHSlfYWFhCgoKcvpszMzM1MaNG/lsLGLnytrevXv15ZdfqlKlSq6OdNlM/nx8+OGH9f333zt9NoaEhGjYsGFauXKlq+MV2C+//KLffvvN6M/HwuKUSBTIkCFDFBMTo6ZNm6p58+aaOnWqsrKy1KdPH1dHy9OpU6ec/nXlwIEDSk5OVsWKFVW9enUXJstbbGysFi5cqE8++UQVKlRwXAfh5+cnb29vF6e7uJEjR6pdu3aqXr26Tp48qYULF2r16tVG/3KvUKHCBdcF+vj4qFKlSsZeLzh06FB16NBBNWrU0K+//qoxY8bI3d1dPXv2dHW0PA0ePFgtWrTQCy+8oG7dumnTpk1688039eabb7o6Wr7sdrvmzp2rmJgYlSlj/kdlhw4dNHHiRFWvXl0NGzbUtm3bNGXKFPXt29fV0S5p5cqVsixLdevW1b59+zRs2DDVq1fPmM+W/D5HBg0apOeff1516tRRWFiYnnvuOYWEhKhTp07GZk5LS9OhQ4cc32N27h9VgoKCXDYzeKnMwcHB6tKli5KSkrR8+XLl5uY6PhsrVqwoDw8Pl2SWLp27UqVKmjhxou677z4FBwfrxIkTmjlzpo4cOeLSrwjJ7+fj72W4bNmyCgoKUt26dYs7qsOlMlesWFHjxo1T586dFRQUpP3792v48OGqXbu2oqOjXZa5yLn4LpUoQV577TWrevXqloeHh9W8eXNrw4YNro50SatWrbIkXfCIiYlxdbQ8XSyvJGvu3Lmujpanvn37WjVq1LA8PDysKlWqWK1bt7a++OILV8cqNNNv69+9e3crODjY8vDwsKpWrWp1797d2rdvn6tj5eu///2vdeONN1qenp5WvXr1rDfffNPVkQpk5cqVliRr9+7dro5SIJmZmdbAgQOt6tWrW15eXlatWrWsZ5991jp79qyro13SokWLrFq1alkeHh5WUFCQFRsba6Wnp7s6lkN+nyN2u9167rnnrMDAQMvT09Nq3bq1y39m8ss8d+7ci24fM2aMkZnPff3AxR6rVq1yWeb8cv/xxx/W/fffb4WEhFgeHh5W
cHCwdd9991mbNm0yNvPFmHBb/0tlPn36tNWmTRurSpUqVtmyZa0aNWpYjz32mJWamurSzEXNZlmWVaQNEAAAAABQJLiGDQAAAAAMRWEDAAAAAENR2AAAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAXsNlsWrp0qatjAAAMR2EDAOB/evfuLZvNJpvNprJlyyosLEzDhw/XmTNnXB1N8+bNk7+/v9Pyuazu7u667rrrFBERofHjxysjI8N1QQEARYrCBgDAedq2bauUlBT99NNPevXVV/XGG29ozJgxro51Ub6+vkpJSdEvv/yidevWqX///lqwYIFuuukm/frrr66OBwAoAhQ2AADO4+npqaCgIIWGhqpTp06KiopSQkKCY/tvv/2mnj17qmrVqipXrpzCw8P13nvvOe2jZcuWeuqppzR8+HBVrFhRQUFBGjt27CWPO2bMGAUHB+v7778vcFabzaagoCAFBwerfv366tevn9atW6dTp05p+PDhhXrdAAAzUdgAAMjDjh07tG7dOnl4eDjWnTlzRk2aNNGnn36qHTt2qH///nr44Ye1adMmp+fOnz9fPj4+2rhxo+Lj4zV+/Hin4neOZVkaMGCAFixYoG+++UaNGjW6oswBAQHq1auXli1bptzc3CvaFwDA9cq4OgAAACZZvny5ypcvrz///FNnz56Vm5ubZsyY4dhetWpVDR061LE8YMAArVy5Uh988IGaN2/uWN+oUSPHqZR16tTRjBkzlJiYqHvuuccx5s8//9RDDz2kbdu26dtvv1XVqlWL5DXUq1dPJ0+e1G+//aaAgIAi2ScAwDUobAAAnKdVq1aaNWuWsrKy9Oqrr6pMmTLq3LmzY3tubq5eeOEFffDBBzpy5Iiys7N19uxZlStXzmk/f58pCw4O1rFjx5zWDR48WJ6entqwYYMqV65cZK/BsixJf50yCQAo2TglEgCA8/j4+Kh27dpq3Lix5syZo40bN+rtt992bH/ppZc0bdo0jRgxQqtWrVJycrKio6OVnZ3ttJ+yZcs6LdtsNtntdqd199xzj44cOaKVK1cW6WvYtWuXfH19ValSpSLdLwCg+FHYAADIg5ubm5555hmNGjVKf/zxhyRp7dq16tixox566CE1btxYtWrV0p49ey5r//fdd58WLlyoRx99VO+//36RZD527JgWLlyoTp06yc2Nj3kAKOn4TQ4AwCV07dpV7u7umjlzpqS/rkdLSEjQunXrtGvXLj3++OM6evToZe///vvv1zvvvKM+ffroww8/LNRzLctSamqqUlJStGvXLs2ZM0ctWrSQn5+fXnzxxcvOBAAwB9ewAQBwCWXKlFFcXJzi4+P1xBNPaNSoUfrpp58UHR2tcuXKqX///urUqdMVfVl1ly5dZLfb9fDDD8vNzU0PPPBAgZ6XmZmp4OBg2Ww2+fr6qm7duoqJidHAgQPl6+t72XkAAOawWeeuTAYAAAAAGIVTIgEAAADAUBQ2AAAAADAUhQ0AAAAADEVhAwAAAABDUdgAAAAAwFAUNgAAAAAwFIUNAAAAAAxFYQMAAAAAQ1HYAAAAAMBQFDYAAAAAMBSFDQAAAAAM9X9lkhU2o6/UQwAAAABJRU5ErkJggg==", + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescription
slow_rank_analysisComputing has some issues in the cluster, because the max difference of Computing time
has reached 2411.538ms. Communication has some issues in the cluster, because the max
difference of Communication time has reached 3989.506ms.
" + ], "text/plain": [ - "
" + "+--------------------+--------------------------------------------------------------------------------------------------+\n", + "| problem | description |\n", + "+--------------------+--------------------------------------------------------------------------------------------------+\n", + "| slow_rank_analysis | Computing has some issues in the cluster, because the max difference of Computing time |\n", + "| | has reached 2411.538ms. Communication has some issues in the cluster, because the max |\n", + "| | difference of Communication time has reached 3989.506ms. |\n", + "+--------------------+--------------------------------------------------------------------------------------------------+" ] }, "metadata": {}, @@ -144,31 +331,26 @@ } ], "source": [ - "# 设置展示图大小\n", - "fig, ax = plt.subplots(figsize=(10,8))\n", - "\n", - "x = np.arange(len(rank_ids)) # the label locations\n", - "\n", - "rects1 = ax.bar(x - width, compute_time, width, label='Computing')\n", - "rects2 = ax.bar(x, communication_time, width, label='Communication')\n", - "rects3 = ax.bar(x + width, free_time, width, label='Free')\n", - "\n", - "\n", - "# Add some text for labels, title and custom x-axis tick labels, etc.\n", - "ax.set_ylabel('Time(us)')\n", - "ax.set_xlabel('Rank ID')\n", - "ax.set_title('Step Time')\n", - "ax.set_xticks(x)\n", - "ax.set_xticklabels(rank_ids)\n", - "ax.legend()\n", - "print(words)" + "problems = slow_rank_result.get(\"problems\")\n", + "headers = problems.get('headers')[:2]\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " problem_table = PrettyTable(headers)\n", + " for row in problems.get(\"data\"):\n", + " row = [fill(str(element), width=100) for element in row]\n", + " problem_table.add_row(row[:2])\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to slow rank analysis.\")" ] }, { "cell_type": "markdown", "id": "3511befaff513e8e", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## 2)识别通信链路慢" @@ -176,10 +358,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "id": "2a1e617d2a117125", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [ { @@ -192,48 +377,214 @@ } ], "source": [ - "dataset = interface.get_data('cluster', 'slow link')" + "slow_link_result = interface.get_result(\"cluster\", \"slow_link\")" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "c8bca314-a8da-4a5b-985a-c36f00154552", "metadata": {}, - "outputs": [], - "source": [ - "# EDIT THE DATA TO SHOW WHAT YOU WANT\n", - "data = dataset.get('data')\n", - "words = dataset.get('bottleneck')\n", - "rank_ids = list(data.keys())\n", - "# 柱状图显示属性\n", - "sdma_bw = [data.get(key, {}).get(\"SDMA bandwidth(GB/s)\") for key in rank_ids]\n", - "rdma_bw = [data.get(key, {}).get(\"RDMA bandwidth(GB/s)\") for key in rank_ids]\n", - "# 柱宽\n", - "width = 0.4" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "99ef04c9-ec07-4790-bbb6-0de9bf6c99d0", - "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "RDMA bandwidth(GB/s): \n", - "The average is 0.041, while the maximum is 0.041GB/s and the minimum is 0.041GB/s. the difference is 0.0GB/s. \n", - "SDMA bandwidth(GB/s): \n", - "The average is 0.054, while the maximum is 0.056GB/s and the minimum is 0.052GB/s. the difference is 0.003GB/s. 
\n" - ] - }, { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAHHCAYAAABXx+fLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABMFklEQVR4nO3df3zO9f7H8ee1zTYbmzG2EebXMT9mQkR+1LGMo1gyWmHmR6ei1ApxMHJqpS9RnKRjqJNIySmJM8s6ZMhmRSGVzK8xi43Nz+3z/aOb63S1YT+uaxufx/12u27t+nze1/v1+lzs8uzz67IYhmEIAADARJwqugEAAIDyRgACAACmQwACAACmQwACAACmQwACAACmQwACAACmQwACAACmQwACAACmQwACAACmQwACgAqWlJQki8WipKSkMs9lsVg0ffr0G46bPn26LBZLsea8OvbUqVNl7A6oPAhAQCVmsViK9bDHP5w3q5deeklr1qwp1thffvml0Hvn5eWltm3bav78+crPz3dss5VcSd5L4GbnUtENALi2d9991+b5O++8o4SEhELLW7RoUZ5tVSovvfSSBg4cqPDw8GK/JjIyUn/5y18kSdnZ2Vq3bp2efPJJHTp0SK+++qqDOi0f58+fl4tL6T7aS/NeAjcrAhBQiQ0ZMsTm+bZt25SQkFBo+R/l5eXJw8PDka1VKMMwdOHCBVWtWrVUr2/Xrp3Ne/jEE0+oU6dOWr58+U0fgNzd3Su6BeCmwCEw4CZ39913q3Xr1kpJSVH37t3l4eGhyZMnS5L+/e9/q2/fvqpbt67c3NzUpEkTzZw5s9ChnqtzfP/997rnnnvk4eGhevXqadasWYXqvfHGG2rVqpU8PDzk4+OjDh06aPny5db1V88X2bdvnwYNGiQvLy/VqlVL48aN04ULF2zmunLlimbOnKkmTZrIzc1NgYGBmjx5si5evGgzLjAwUPfdd582bNigDh06qGrVqnrrrbdksViUm5urZcuWWQ9pDR8+vMTvocVikZ+fX6E9J454/44cOaLw8HB5enqqTp06euaZZwpt7+uvvy5nZ2edOXPGumz27NmyWCyKiYmxLsvPz1f16tU1ceJEm2354zlAW7Zs0R133CF3d3c1adJEb731VpHvwY3eyzNnzmj48OGqUaOGvL29FR0drby8vCLfU6CyYw8QcAvIyspSnz599NBDD2nIkCHy8/OTJC1dulTVqlVTTEyMqlWrpi+++ELTpk1TTk5OoT0dp0+fVu/evTVgwAANGjRIH374oSZOnKjg4GD16dNHkvT222/rqaee0sCBA62B5ttvv9X27dv18MMP28w3aNAgBQYGKi4uTtu2bdPrr7+u06dP65133rGOGTVqlJYtW6aBAwfq2Wef1fbt2xUXF6e9e/fq448/tplv//79ioyM1F//+leNHj1azZs317vvvqtRo0apY8eOevTRRyVJTZo0ueH7lZeXZz2hNycnR59//rnWr1+vSZMm2Yyz9/t3/vx59ezZU+np6XrqqadUt25dvfvuu/riiy9s5urWrZsKCgq0ZcsW3XfffZKkzZs3y8nJSZs3b7aO27Vrl86dO6fu3btfc1t3796tXr16qXbt2po+fbquXLmi2NhY69+Rq4rzXg4aNEiNGjVSXFycUlNT9c9//lN16tTRK6+8csP3HKh0DAA3jTFjxhh//LXt0aOHIclYuHBhofF5eXmFlv31r381PDw8jAsXLhSa45133rEuu3jxouHv7288+OCD1mX9+/c3WrVqdd0eY2NjDUlGv379bJY/8cQThiTjm2++MQzDMNLS0gxJxqhRo2zGPffcc4Yk44svvrAua9iwoSHJWL9+faF6np6eRlRU1HV7uurgwYOGpCIfjz/+uFFQUGAz3t7v39y5cw1JxgcffGBdlpubazRt2tSQZGzatMkwDMPIz883vLy8jAkTJhiGYRgFBQVGrVq1jIiICMPZ2dk4e/asYRiGMWfOHMPJyck4ffq0dT5JRmxsrPV5eHi44e7ubhw6dMi67PvvvzecnZ0L/V261nt59c90xIgRNssfeOABo1atWoXGAzcDDoEBtwA3NzdFR0cXWv77c2TOnj2rU6dOqVu3bsrLy9O+fftsxlarVs3mvBhXV1d17NhRP//8s3VZjRo1dOTIEX399dc37GnMmDE2z5988klJ0rp162z++/tDOpL07LPPSpI+++wzm+WNGjVSWFjYDesWx6OPPqqEhAQlJCToo48+0pgxY/TWW28V6sXe79+6desUEBCggQMHWpd5eHhY97hc5eTkpC5duui///2vJGnv3r3KysrS888/L8MwlJycLOm3vUKtW7dWjRo1itzO/Px8bdiwQeHh4WrQoIF1eYsWLUr1Xj722GM2z7t166asrCzl5OSUeC6gohGAgFtAvXr15OrqWmj5d999pwceeEDe3t7y8vJS7dq1rf9IZ2dn24y97bbbCt0XxsfHR6dPn7Y+nzhxoqpVq6aOHTuqWbNmGjNmjL766qsie2rWrJnN8yZNmsjJyUm//PKLJOnQoUNycnJS06ZNbcb5+/urRo0aOnTokM3yRo0aXecdKJlmzZopNDRUoaGhGjBggObPn68nnnhCc+fO1e7du63j7P3+HTp0SE2bNi00rnnz5oV67Natm1JSUnT+/Hlt3rxZAQEBateunUJCQqyHwbZs2aJu3bpdczszMzN1/vz5Qn8W16p5I78PUdJv2yfJZhuBmwUBCLgFFHU11JkzZ9SjRw998803euGFF/Tpp58qISHBer5GQUGBzXhnZ+ci5zYMw/pzixYttH//fq1YsUJdu3bVRx99pK5duyo2NvaGPV7rpnvFvRlfaa/4Kq6ePXtKknWviyPev5Lo2rWrLl++rOTkZG3evNkadLp166bNmzdr3759yszMvG4Asjd7byNQkTgJGrhFJSUlKSsrS6tXr7Y5SfbgwYNlmtfT01ODBw/W4MGDdenSJQ0YMEAvvviiJk2aZHMJ9oEDB2z22vz4448qKChQYGCgJKlhw4YqKCjQgQMHbO5jdOLECZ05c0YNGzYsVj/FDVA3cuXKFUnSuXPnJDnm/WvYsKH27NkjwzBs+t6/f3+hsR07dpSrq6s2b96szZs3a/z48ZKk7t276+2331ZiYqL1+bXUrl1bVatW1YEDBwqtK6qmvd5L4GbAHiDgFnX1/9Z//3/nly5d0j/+8Y9Sz5mVlWXz3NXVVS1btpRhGLp8+bLNugULFtg8f+ONNyTJekXU1RsRzp0712bcnDlzJEl9+/YtVk+enp42l4uX1qeffipJCgkJkeSY9+8vf/mLjh07pg8//NC6LC8vT4sWLSo01t3dXXfccYfef/99paen2+wBOn/+vF5//XU1adJEAQEB16zn7OyssLAwrVmzRunp6dble/fu1YYNGwqNt9d7CdwM2AME3KK6dOkiHx8fRUVF6amnnpLFYtG7775bpsMVvXr1kr+/v+666y75+flp7969mj9/vvr27a
vq1avbjD148KD69eun3r17Kzk5Wf/617/08MMPWwNGSEiIoqKitGjRIuvhph07dmjZsmUKDw/XPffcU6ye2rdvr40bN2rOnDmqW7euGjVqpE6dOl33NampqfrXv/4l6beTmxMTE/XRRx+pS5cu6tWrlyTHvH+jR4/W/PnzNWzYMKWkpCggIEDvvvvuNW9a2a1bN7388svy9vZWcHCwJKlOnTpq3ry59u/fX6x7Hs2YMUPr169Xt27d9MQTT+jKlSvWezl9++23NmNL814CN62KuvwMQMld6zL4a12a/tVXXxl33nmnUbVqVaNu3brGhAkTjA0bNthccn29OaKiooyGDRtan7/11ltG9+7djVq1ahlubm5GkyZNjPHjxxvZ2dnWMVcvmf7++++NgQMHGtWrVzd8fHyMsWPHGufPn7eZ//Lly8aMGTOMRo0aGVWqVDHq169vTJo0yeYSc8P47TL4vn37FrmN+/btM7p3725UrVrVkHTdS+KLugzexcXFaNy4sTF+/Hjr5eWOev8MwzAOHTpk9OvXz/Dw8DB8fX2NcePGGevXry80p2EYxmeffWZIMvr06WOzfNSoUYYkY/HixYVq6g+XwRuGYXz55ZdG+/btDVdXV6Nx48bGwoULrX9Ov3et9/Lq2MzMTJvxS5YsMSQZBw8eLNQHUNlZDIOz1wDYz/Tp0zVjxgxlZmbK19e3otsBgCJxDhAAADAdAhAAADAdAhAAADAdzgECAACmwx4gAABgOgQgAABgOtwIsQgFBQU6duyYqlevzq3hAQC4SRiGobNnz6pu3bpycrr+Ph4CUBGOHTum+vXrV3QbAACgFA4fPqzbbrvtumMIQEW4ekv/w4cPy8vLq4K7AQAAxZGTk6P69esX+mqeohCAinD1sJeXlxcBCACAm0xxTl/hJGgAAGA6BCAAAGA6BCAAAGA6nAMEAICd5Ofn6/LlyxXdxi2rSpUqcnZ2tstcBCAAAMrIMAxlZGTozJkzFd3KLa9GjRry9/cv8336CEAAAJTR1fBTp04deXh4cBNdBzAMQ3l5eTp58qQkKSAgoEzzEYAAACiD/Px8a/ipVatWRbdzS6tataok6eTJk6pTp06ZDodxEjQAAGVw9ZwfDw+PCu7EHK6+z2U914oABACAHXDYq3zY630mAAEAANMhAAEAANPhJGgAABwk8PnPyrXeLy/3LfFrMjMzNW3aNH322Wc6ceKEfHx8FBISomnTpumuu+5SYGCgDh06JElyd3eXn5+fOnbsqMcee0x//vOf/1f7l1/UqFEjOTk5KT09XfXq1bOuO378uOrXr6/8/HwdPHhQgYGBNj2EhYVp48aN2rZtm+64447SbXwJsQcIAAATe/DBB7Vr1y4tW7ZMP/zwgz755BPdfffdysrKso554YUXdPz4ce3fv1/vvPOOatSoodDQUL344ouF5qtXr57eeecdm2XLli2zCUS/l56erq1bt2rs2LGKj4+378ZdB3uAAAAwqTNnzmjz5s1KSkpSjx49JEkNGzZUx44dbcZVr15d/v7+kqQGDRqoe/fuCggI0LRp0zRw4EA1b97cOjYqKkpLlizRpEmTrMuWLFmiqKgozZw5s1APS5Ys0X333afHH39cd955p+bMmWO93N2R2AMEAIBJVatWTdWqVdOaNWt08eLFEr123LhxMgxD//73v22W9+vXT6dPn9aWLVskSVu2bNHp06d1//33F5rDMAwtWbJEQ4YMUVBQkJo2baoPP/yw9BtUAuwBAoByVNJzQkpzTgfM69sjZ6w/t7mtxg3Hu7i4aOnSpRo9erQWLlyodu3aqUePHnrooYfUpk2b6762Zs2aqlOnjn755Reb5VWqVNGQIUMUHx+vrl27Kj4+XkOGDFGVKlUKzbFx40bl5eUpLCxMkjRkyBAtXrxYQ4cOvWHvZcUeIAAATOzBBx/UsWPH9Mknn6h3795KSkpSu3bttHTp0hu+1jCMIu/LM2LECK1atUoZGRlatWqVRowYUeTr4+PjNXjwYLm4/LY/JjIyUl999ZV++umnMm1TcbAHCABucSXZ68QeJ3Nyd3fXvffeq3vvvVdTp07VqFGjFBsbq3ah4bqcX6BjZ87b7F2SpDOnf1VmZqbcffz17ZEz8vrduuDgYAUFBSkyMlItWrRQ69atlZaWZvP6X3/9VR9//LEuX76sN99807o8Pz9f8fHxRZ5gbU/sAQIAADZatmyp3Nzc6455b/FCOTk56Z6wokPziBEjlJSUdM29P++9955uu+02ffPNN0pLS7M+Zs+eraVLlyo/P7/M23E97AECYGrsHbG/ijjPqSL+HK/WrFfdWdPvqaNLVXNkcblgl7nLS1ZWliIiIjRixAi1adNG1atX186dOzVr1iz179/fOi733DmdOnlCV65c1tH0Q/rs41Va/f47eur5aWrQqHGRc48ePVoRERGqUaNGkesXL16sgQMHqnXr1jbL69evr0mTJmn9+vXq29dxv3MEIAAATKpatWrq1KmTXnvtNf3000+6fPmy6tevr9GjR2vy5Mk6kPXblWH/mP2S/jH7JVVxdZVv7ToKvv0OLVrxb3Xs0u2ac7u4uMjX17fIdSkpKfrmm2/09ttvF1rn7e2tnj17avHixQQgAABuRp+MvUtS8a7IupE/noNjD25uboqLi1NcXNw1RlzU58nfFmuuwMBAGYZxzfVt27a1rr/R2HXr1hWrZllwDhAAADAdAhAAADAdAhAAADAdAhAAADAdToIGSonLp+2Pr4kAUF7YAwQAAEyHAAQAAEyHQ2AmYIbDCmbYRgCA/RCAKgDnjgAAULE4BAYAAEyHPUAArom9lUDZtPlnQ/vNVYwx3446VKI5hw8frmXLlkn67bu7brvtNkVEROiFF16Qu7u7JCmkvo91vHtVD9Xx81fbDp0UGf2oWrZpa12XlJSke+65RzVq1NDx48etr5ekr7/+Wh07dpSkIr8CIygoSAcPHtShQ4fk7+9fom0oLfYAAQBgYr1799bx48f1888/67XXXtNbb72l2NhYmzEvzF6gxJR9Wp2YrEl/f1V5ebka0i9Un364otB81atX18cff2yzbPHixWrQoEGR9bds2aLz589r4MCB1jBWHtgDBIdgzwEA3Bzc3Nyse13q16+v0NBQJSQk6JVXXrGOqe7lLd86fpKkevUbqEuPP2vKM48rbuoE9QjtLa8aNaxjo6KiFB8fr8jISEnS+fPntWLFCj311FOaOXNmofqLFy/Www8/rB49emjcuHGaOHGiA7f2fwhAwE2CK90AONqePXu0detWNWx440N3Q0Y9oU8/XKHkzZsUdv8D1uVDhw7Vq6++qvT0dDVo0EAfffSRAgMD1a5du0JznD17VqtWrdL27dsVFBSk7Oxsbd68Wd26dbPrdhWFQ2AAAJjY2rVrVa1aNbm7uys4OFgnT57U+PHjb/i6Rk2aSZKOHUm3WV6nTh316dNHS5culSTFx8drxIgRRc6xYsUKNWvWTK1atZKzs7MeeughLV68uGwbVEwEIAAATOyee+5RWlqatm/frqioKEVHR
+vBBx+84euunsxssVgKrRsxYoSWLl2qn3/+WcnJyXrkkUeKnCM+Pl5DhgyxPh8yZIhWrVqls2fPlnJrio8ABACAiXl6eqpp06YKCQlRfHy8tm/fXqy9MAd//EGSVK9+4cNlffr00fnz5zVy5Ejdf//9qlWrVqEx33//vbZt26YJEybIxcVFLi4uuvPOO5WXl6cVKwqfXG1vBCAAACBJcnJy0uTJkzVlyhSdP3/+umP/tfhNVateXZ263l1onYuLi4YNG6akpKRrHv5avHixunfvrm+++UZpaWnWR0xMTLkcBiMAAQAAq4iICDk7O2vBggXWZWdzsnXq5AkdO5Ku5P9u0rN/jdLnaz7U316cLS9v7yLnmTlzpjIzMxUWFlZo3eXLl/Xuu+8qMjJSrVu3tnmMGjVK27dv13fffeewbZS4CgwAAPyOi4uLxo4dq1mzZqlrv98uZZ/27BhJkpubu+r4B+j2O+7Ue58mqkVwyDXncXV1la+vb5HrPvnkE2VlZemBBx4otK5FixZq0aKFFi9erDlz5thhi4pGAAIAwEGu3pm5zW01yj7XkTNlnuOPrl6p9UfPP/+8nn/+eX175Iy+OXy6WHPdfffdRd7l+arw8HDr+gcffFD5+fnXHPv9998Xq2ZZcAgMAACYDgEIAACYTqUIQAsWLFBgYKDc3d3VqVMn7dix47rjV61apaCgIOtNm9atW2ezfvjw4bJYLDaP3r17O3ITAADATaTCA9DKlSsVExOj2NhYpaamKiQkRGFhYTp58mSR47du3arIyEiNHDlSu3btUnh4uMLDw7Vnzx6bcVe/3O3q4/333y+PzQEAADeBCg9Ac+bM0ejRoxUdHa2WLVtq4cKF8vDwUHx8fJHj582bp969e2v8+PFq0aKFZs6cqXbt2mn+/Pk2465+udvVh4+PT3lsDgDAZAoMSTKk65wADPu53onWJVGhAejSpUtKSUlRaGiodZmTk5NCQ0OVnJxc5GuSk5NtxktSWFhYofFJSUmqU6eOmjdvrscff1xZWVn23wAAgOmduVCgy/mGjCuXKroVU8jLy5MkValSpUzzVOhl8KdOnVJ+fr78/Pxslvv5+Wnfvn1FviYjI6PI8RkZGdbnvXv31oABA9SoUSP99NNPmjx5svr06aPk5GQ5OzsXmvPixYu6ePGi9XlOTk5ZNgsAYCLnrxhK/Pmc7nN1lk9NyeLiKv3h+7EuXLhQ5jolDVjlXdMe9a7HMAzl5eXp5MmTqlGjRpH/npfELXkfoIceesj6c3BwsNq0aaMmTZooKSlJPXv2LDQ+Li5OM2bMKM8WAQC3kNV7cyVJPRvnq4qzRZJtAHI9X7XMNU6evv5XU/xRede0R73iqFGjhvz9/cs8T4UGIF9fXzk7O+vEiRM2y0+cOHHNjfP39y/ReElq3LixfH199eOPPxYZgCZNmqSYmBjr85ycHNWvX78kmwIAMDFD0kd7c/XZgTz5uDvJ6Q9fkJ747N1lrjFqdVKJxpd3TXvUu5EqVaqUec/PVRUagFxdXdW+fXslJiYqPDxcklRQUKDExESNHTu2yNd07txZiYmJevrpp63LEhIS1Llz52vWOXLkiLKyshQQEFDkejc3N7m5uZV6OwAAkKQLVwwdP1f4Dsfu7u5lnvvo2WvfObko5V3THvXKU4VfBRYTE6O3335by5Yt0969e/X4448rNzdX0dHRkqRhw4Zp0qRJ1vHjxo3T+vXrNXv2bO3bt0/Tp0/Xzp07rYHp3LlzGj9+vLZt26ZffvlFiYmJ6t+/v5o2bVrkF7IBAADzqfBzgAYPHqzMzExNmzZNGRkZatu2rdavX2890Tk9PV1OTv/LaV26dNHy5cs1ZcoUTZ48Wc2aNdOaNWvUunVrSZKzs7O+/fZbLVu2TGfOnFHdunXVq1cvzZw5k708AABAUiUIQJI0duzYax7ySkpKKrQsIiJCERERRY6vWrWqNmzYYM/2AADALabCD4EBAACUNwIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHQIQAAAwHZeKbgA3MN27BGOzHdcHgJL9Pkr8TgKVGAEIhZV36KqIkFeZt7EiarKNlRd/jhVbryJqmuFzrhL8PnIIDAAAmA4BCAAAmE6lCEALFixQYGCg3N3d1alTJ+3YseO641etWqWgoCC5u7srODhY69atu+bYxx57TBaLRXPnzrVz1wAA4GZV4QFo5cqViomJUWxsrFJTUxUSEqKwsDCdPHmyyPFbt25VZGSkRo4cqV27dik8PFzh4eHas2dPobEff/yxtm3bprp16zp6MwAAwE2kwgPQnDlzNHr0aEVHR6tly5ZauHChPDw8FB8fX+T4efPmqXfv3ho/frxatGihmTNnql27dpo/f77NuKNHj+rJJ5/Ue++9pypVqpTHpgAAgJtEhQagS5cuKSUlRaGhodZlTk5OCg0NVXJycpGvSU5OthkvSWFhYTbjCwoKNHToUI0fP16tWrW6YR8XL15UTk6OzQMAANy6KjQAnTp1Svn5+fLz87NZ7ufnp4yMjCJfk5GRccPxr7zyilxcXPTUU08Vq4+4uDh5e3tbH/Xr1y/hlgAAgJtJhR8Cs7eUlBTNmzdPS5culcViKdZrJk2apOzsbOvj8OHDDu4SAABUpAoNQL6+vnJ2dtaJEydslp84cUL+/v5Fvsbf3/+64zdv3qyTJ0+qQYMGcnFxkYuLiw4dOqRnn31WgYGBRc7p5uYmLy8vmwcAALh1VWgAcnV1Vfv27ZWYmGhdVlBQoMTERHXu3LnI13Tu3NlmvCQlJCRYxw8dOlTffvut0tLSrI+6detq/Pjx2rBhg+M2BgAA3DQq/KswYmJiFBUVpQ4dOqhjx46aO3eucnNzFR0dLUkaNmyY6tWrp7i4OEnSuHHj1KNHD82ePVt9+/bVihUrtHPnTi1atEiSVKtWLdWqVcumRpUqVeTv76/mzZuX78YBAIBKqcID0ODBg5WZmalp06YpIyNDbdu21fr1660nOqenp8vJ6X87qrp06aLly5drypQpmjx5spo1a6Y1a9aodevWFbUJAADgJlPhAUiSxo4dq7Fjxxa5LikpqdCyiIgIRUREFHv+X375pZSdAQCAW9EtdxUYAADAjRCAAACA6RCAAACA6RCAAACA6RCAAACA6RCAAACA6RCAAACA6dglAF28eNEe0wAAAJSLUgWgzz//XFFRUWrcuLGq
VKkiDw8PeXl5qUePHnrxxRd17Ngxe/cJAABgNyUKQB9//LH+9Kc/acSIEXJxcdHEiRO1evVqbdiwQf/85z/Vo0cPbdy4UY0bN9Zjjz2mzMxMR/UNAABQaiX6KoxZs2bptddeU58+fWy+n+uqQYMGSZKOHj2qN954Q//617/0zDPP2KdTAAAAOylRAEpOTi7WuHr16unll18uVUMAAACOZrerwPLz85WWlqbTp0/ba0oAAACHKHUAevrpp7V48WJJv4WfHj16qF27dqpfv36R3+AOAABQWZQ6AH344YcKCQmRJH366ac6ePCg9u3bp2eeeUZ/+9vf7NYgAACAvZU6AJ06dUr+/v6SpHXr1ikiIsJ6hdju3bvt1iAAAIC9lToA+fn56fvvv1d+fr7Wr1+ve++9V5KUl5cnZ2dnuzUIAABgbyW6Cuz3oqOjNWjQIAUEBMhisSg0NFSStH37dgUFBdmtQQAAAHsrdQCaPn26WrdurcOHDysiIkJubm6SJGdnZz3//PN2axAAAMDeShyAhg0bpv79+yssLEwDBw4stD4qKsoujQEAADhKic8Batq0qV566SXVrl1bffr00ZtvvqmjR486ojcAAACHKHEAmjZtmlJSUnTgwAHdf//9WrNmjZo0aaL27dvrhRdeUFpamgPaBAAAsJ9SXwV222236YknntCGDRuUmZmpiRMnav/+/frzn/+shg0bauzYsfruu+/s2SsAAIBd2OWrMKpXr65BgwbpvffeU2ZmpuLj4+Xs7Fzs7w4DAAAoT6W+CuyPLl26pEuXLqlatWrq2bOnevbsaa+pAQAA7KpUe4CWLFmiJ598Uu+9954kadKkSapevbq8vb117733Kisry65NAgAA2FOJA9CLL76oMWPGaN++fXrqqaf0+OOPa+nSpXrhhRf08ssva9++fZoyZYojegUAALCLEh8CW7p0qRYvXqzIyEjt3LlTnTp10gcffKAHH3xQktS6dWs99thjdm8UAADAXkq8Byg9PV1du3aVJHXo0EEuLi5q3bq1dX2bNm10/Phx+3UIAABgZyUOQJcvX7Z+7YUkubq6qkqVKtbnLi4uys/Pt093AAAADlCqq8C+//57ZWRkSJIMw9C+fft07tw5SdKpU6fs1x0AAIADlCoA9ezZU4ZhWJ/fd999kiSLxSLDMGSxWOzTHQAAgAOUOAAdPHjQEX0AAACUmxIHoIYNGzqiDwAAgHJTqkNgOTk58vLykiStW7dOV65csa5zdnZW37597dMdAACAA5Q4AK1du1ZTp07Vrl27JEmDBw9Wbm6udb3FYtHKlSs1cOBA+3UJAABgRyW+DH7RokV68sknbZb9+OOPKigoUEFBgeLi4hQfH2+3BgEAAOytxAFo9+7duuuuu665vk+fPtq5c2eZmgIAAHCkEgeg48eP29wIcdOmTapfv771ebVq1ZSdnW2f7gAAABygxAGoZs2a+vHHH63PO3ToYHMn6AMHDqhmzZr26Q4AAMABShyAunfvrtdff/2a619//XV17969TE0BAAA4UokD0MSJE/Wf//xHERER+vrrr5Wdna3s7Gzt2LFDDz74oDZu3KiJEyc6olcAAAC7KPFl8LfffrtWrlypUaNGafXq1TbrfHx8tGLFCrVr185uDQIAANhbqW6E2L9/f917773asGGDDhw4IElq1qyZevXqJU9PT7s2CAAAYG+lCkCS5OHhoQceeMCevQAAAJSLEp0DtGLFimKPPXz4sL766qsSNwQAAOBoJQpAb775plq0aKFZs2Zp7969hdZnZ2dr3bp1evjhh9WuXTtlZWXZrVEAAAB7KdEhsC+//FKffPKJ3njjDU2aNEmenp7y8/OTu7u7Tp8+rYyMDPn6+mr48OHas2eP/Pz8HNU3AABAqZX4HKB+/fqpX79+OnXqlLZs2aJDhw7p/Pnz8vX11e23367bb79dTk4lvroeAACg3JT6JGhfX1+Fh4fbsRUAAIDyUeoAdNWlS5d08uRJFRQU2Cxv0KBBWacGAABwiFIHoAMHDmjEiBHaunWrzXLDMGSxWJSfn1/m5gAAAByh1CfrDB8+XE5OTlq7dq1SUlKUmpqq1NRU7dq1S6mpqSWaa8GCBQoMDJS7u7s6deqkHTt2XHf8qlWrFBQUJHd3dwUHB2vdunU266dPn66goCB5enrKx8dHoaGh2r59e4m3EQAA3JpKvQcoLS1NKSkpCgoKKlMDK1euVExMjBYuXKhOnTpp7ty5CgsL0/79+1WnTp1C47du3arIyEjFxcXpvvvu0/LlyxUeHq7U1FS1bt1akvSnP/1J8+fPV+PGjXX+/Hm99tpr6tWrl3788UfVrl27TP0CAICbX6n3ALVs2VKnTp0qcwNz5szR6NGjFR0drZYtW2rhwoXy8PBQfHx8kePnzZun3r17a/z48WrRooVmzpypdu3aaf78+dYxDz/8sEJDQ9W4cWO1atVKc+bMUU5Ojr799tsy9wsAAG5+JQpAOTk51scrr7yiCRMmKCkpSVlZWTbrcnJyijXfpUuXlJKSotDQ0P815OSk0NBQJScnF/ma5ORkm/GSFBYWds3xly5d0qJFi+Tt7a2QkJAix1y8eLFU/QMAgJtTiQ6B1ahRQxaLxfrcMAz17NnTZkxJToI+deqU8vPzC90w0c/PT/v27SvyNRkZGUWOz8jIsFm2du1aPfTQQ8rLy1NAQIASEhLk6+tb5JxxcXGaMWPGDfsFAAC3hhIFoE2bNjmqD7u75557lJaWplOnTuntt9/WoEGDtH379iLPK5o0aZJiYmKsz3NyclS/fv3ybBcAAJSjEgWgHj16WH9OT09X/fr1bfYISb/tATp8+HCx5vP19ZWzs7NOnDhhs/zEiRPy9/cv8jX+/v7FGu/p6ammTZuqadOmuvPOO9WsWTMtXrxYkyZNKjSnm5ub3NzcitUzAAC4+ZX6JOhGjRopMzOz0PJff/1VjRo1KtYcrq6uat++vRITE63LCgoKlJiYqM6dOxf5ms6dO9uMl6SEhIRrjv/9vBcvXixWXwAA4NZW6svgr57r80fnzp2Tu7t7seeJiYlRVFSUOnTooI4dO2ru3LnKzc1VdHS0JGnYsGGqV6+e4uLiJEnjxo1Tjx49NHv2bPXt21crVqzQzp07tWjRIklSbm6uXnzxRfXr108BAQE6deqUFixYoKNHjyoiIqK0mwsAAG4hJQ5AV8+VsVgsmjp1qjw8PKzr8vPztX37drVt27bY8w0ePFiZmZmaNm2aMjIy1LZtW61fv956onN6errNl6t26dJFy5cv15QpUzR58mQ1a9ZMa9assd4DyNnZWfv27dOyZct06tQp1apVS3fccYc2b96sVq1alXRzAQDALajEAWjXrl2SftsDtHv3brm6ulrXubq6KiQkRM8991yJ5hw7dqzGjh1b5LqkpKRCyyIiIq65N8fd3V2rV68uUX0AAGAuJQ5AV68Ei46O1rx58+Tl5WX3pgAAAByp1OcALVmyxJ59AAAAlJsSBaABAwYUeyyHoQAAQGVVosvgvb29rQ8vLy8lJiZq586
d1vUpKSlKTEyUt7e33RsFAACwlxLtAfr9Ya+JEydq0KBBWrhwoZydnSX9dhXYE088wXlBAACgUiv1jRDj4+P13HPPWcOP9Nsl6DExMdf8JncAAIDKoNQB6MqVK0V+Yem+fftUUFBQpqYAAAAcqdRXgUVHR2vkyJH66aef1LFjR0nS9u3b9fLLL1vv4gwAAFAZlToA/d///Z/8/f01e/ZsHT9+XJIUEBCg8ePH69lnn7VbgwAAAPZW6gDk5OSkCRMmaMKECcrJyZEkTn4GAAA3hVIHoN8j+AAAgJtJiQJQu3btlJiYKB8fH91+++1Ffhv8VampqWVuDgAAwBFKFID69+8vNzc368/XC0AAAACVVYkCUGxsrPXn6dOn27sXAACAclHq+wBNmzZNmzZt0oULF+zZDwAAgMOVOgAlJyfr/vvvV40aNdStWzdNmTJFGzdu1Pnz5+3ZHwAAgN2VOgAlJCTozJkzSkxM1F/+8hft3LlTAwYMUI0aNdS1a1d79ggAAGBXZboM3sXFRXfddZdq166tmjVrqnr16lqzZk2RX5EBAABQWZR6D9CiRYv08MMPq169eurSpYvWr1+vrl27aufOncrMzLRnjwAAAHZV6j1Ajz32mGrXrq1nn31WTzzxhKpVq2bPvgAAABym1HuAVq9erUceeUQrVqxQ7dq11aVLF02ePFn/+c9/lJeXZ88eAQAA7KrUe4DCw8MVHh4uScrOztbmzZu1atUq3XfffXJycuLyeAAAUGmV6STorKwsffnll0pKSlJSUpK+++47+fj4qFu3bvbqDwAAwO5KHYCCg4O1d+9e+fj4qHv37ho9erR69OihNm3a2LM/AAAAuyvTSdA9evRQ69at7dkPAACAw5U6AI0ZM8aefQAAAJSbEgWgmJiYYo+dM2dOiZsBAAAoDyUKQLt27bJ5npqaqitXrqh58+aSpB9++EHOzs5q3769/ToEAACwsxIFoE2bNll/njNnjqpXr65ly5bJx8dHknT69GlFR0dzFRgAAKjUSn0jxNmzZysuLs4afiTJx8dHf//73zV79my7NAcAAOAIpQ5AOTk5RX7nV2Zmps6ePVumpgAAAByp1AHogQceUHR0tFavXq0jR47oyJEj+uijjzRy5EgNGDDAnj0CAADYVakvg1+4cKGee+45Pfzww7p8+fJvk7m4aOTIkXr11Vft1iAAAIC9lToAeXh46B//+IdeffVV/fTTT5KkJk2ayNPT027NAQAAOEKZvgtMkjw9Pfn6CwAAcFMpdQDKzc3Vyy+/rMTERJ08eVIFBQU263/++ecyNwcAAOAIpQ5Ao0aN0pdffqmhQ4cqICBAFovFnn0BAAA4TKkD0Oeff67PPvtMd911lz37AQAAcLhSXwbv4+OjmjVr2rMXAACAclHqADRz5kxNmzZNeXl59uwHAADA4Up9CGz27Nn66aef5Ofnp8DAQFWpUsVmfWpqapmbAwAAcIRSB6Dw8HA7tgEAAFB+Sh2AYmNj7dkHAABAuSn1OUAAAAA3q1LvAcrPz9drr72mDz74QOnp6bp06ZLN+l9//bXMzQEAADhCqfcAzZgxQ3PmzNHgwYOVnZ2tmJgYDRgwQE5OTpo+fbodWwQAALCvUgeg9957T2+//baeffZZubi4KDIyUv/85z81bdo0bdu2zZ49AgAA2FWpA1BGRoaCg4MlSdWqVVN2drYk6b777tNnn31mn+4AAAAcoNQB6LbbbtPx48clSU2aNNF//vMfSdLXX38tNzc3+3QHAADgAKUOQA888IASExMlSU8++aSmTp2qZs2aadiwYRoxYoTdGgQAALC3Ul8F9vLLL1t/Hjx4sBo2bKitW7eqWbNmuv/+++3SHAAAgCOUeg9QVlaW9efDhw9r3bp1On78uLy9ve3SGAAAgKOUOADt3r1bgYGBqlOnjoKCgpSWlqY77rhDr732mhYtWqQ///nPWrNmjQNaBQAAsI8SB6AJEyYoODhY//3vf3X33XfrvvvuU9++fZWdna3Tp0/rr3/9q83hseJYsGCBAgMD5e7urk6dOmnHjh3XHb9q1SoFBQXJ3d1dwcHBWrdunXXd5cuXNXHiRAUHB8vT01N169bVsGHDdOzYsZJuKgAAuEWVOAB9/fXXevHFF3XXXXfp//7v/3Ts2DE98cQTcnJykpOTk5588knt27ev2POtXLlSMTExio2NVWpqqkJCQhQWFqaTJ08WOX7r1q2KjIzUyJEjtWvXLoWHhys8PFx79uyRJOXl5Sk1NVVTp05VamqqVq9erf3796tfv34l3VQAAHCLKnEA+vXXX+Xv7y/pt/v/eHp6ysfHx7rex8dHZ8+eLfZ8c+bM0ejRoxUdHa2WLVtq4cKF8vDwUHx8fJHj582bp969e2v8+PFq0aKFZs6cqXbt2mn+/PmSJG9vbyUkJGjQoEFq3ry57rzzTs2fP18pKSlKT08v6eYCAIBbUKlOgrZYLNd9XlyXLl1SSkqKQkND/9eQk5NCQ0OVnJxc5GuSk5NtxktSWFjYNcdLUnZ2tiwWi2rUqFHk+osXLyonJ8fmAQAAbl2lugx++PDh1psdXrhwQY899pg8PT0l/RYmiuvUqVPKz8+Xn5+fzXI/P79rHkbLyMgocnxGRkaR4y9cuKCJEycqMjJSXl5eRY6Ji4vTjBkzit03AAC4uZU4AEVFRdk8HzJkSKExw4YNK31HdnT58mUNGjRIhmHozTffvOa4SZMmKSYmxvo8JydH9evXL48WAQBABShxAFqyZIndivv6+srZ2VknTpywWX7ixAnreUZ/5O/vX6zxV8PPoUOH9MUXX1xz748kubm58fUdAACYSKlvhGgPrq6uat++vfUrNSSpoKBAiYmJ6ty5c5Gv6dy5s814SUpISLAZfzX8HDhwQBs3blStWrUcswEAAOCmVOqvwrCXmJgYRUVFqUOHDurYsaPmzp2r3NxcRUdHS/rtcFq9evUUFxcnSRo3bpx69Oih2bNnq2/fvlqxYoV27typRYsWSfot/AwcOFCpqalau3at8vPzrecH1axZU66urhWzoQAAoNKo8AA0ePBgZWZmatq0acrIyFDbtm21fv1664nO6enpcnL6346qLl26aPny5ZoyZYomT56sZs2aac2aNWrdurUk6ejRo/rkk08kSW3btrWptWnTJt19993lsl0AAKDyqvAAJEljx47V2LFji1yXlJRUaFlERIQiIiKKHB8YGCjDMOzZHgAAuMVU6DlAAAAAFYEABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATI
cABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATIcABAAATKfCA9CCBQsUGBgod3d3derUSTt27Lju+FWrVikoKEju7u4KDg7WunXrbNavXr1avXr1Uq1atWSxWJSWlubA7gEAwM2oQgPQypUrFRMTo9jYWKWmpiokJERhYWE6efJkkeO3bt2qyMhIjRw5Urt27VJ4eLjCw8O1Z88e65jc3Fx17dpVr7zySnltBgAAuMlUaACaM2eORo8erejoaLVs2VILFy6Uh4eH4uPjixw/b9489e7dW+PHj1eLFi00c+ZMtWvXTvPnz7eOGTp0qKZNm6bQ0NDy2gwAAHCTqbAAdOnSJaWkpNgEFScnJ4WGhio5ObnI1yQnJxcKNmFhYdccX1wXL15UTk6OzQMAANy6KiwAnTp1Svn5+fLz87NZ7ufnp4yMjCJfk5GRUaLxxRUXFydvb2/ro379+mWaDwAAVG4VfhJ0ZTBp0iRlZ2dbH4cPH67olgAAgAO5VFRhX19fOTs768SJEzbLT5w4IX9//yJf4+/vX6LxxeXm5iY3N7cyzQEAAG4eFbYHyNXVVe3bt1diYqJ1WUFBgRITE9W5c+ciX9O5c2eb8ZKUkJBwzfEAAABFqbA9QJIUExOjqKgodejQQR07dtTcuXOVm5ur6OhoSdKwYcNUr149xcXFSZLGjRunHj16aPbs2erbt69WrFihnTt3atGiRdY5f/31V6Wnp+vYsWOSpP3790v6be9RWfcUAQCAW0OFBqDBgwcrMzNT06ZNU0ZGhtq2bav169dbT3ROT0+Xk9P/dlJ16dJFy5cv15QpUzR58mQ1a9ZMa9asUevWra1jPvnkE2uAkqSHHnpIkhQbG6vp06eXz4YBAIBKrUIDkCSNHTtWY8eOLXJdUlJSoWURERGKiIi45nzDhw/X8OHD7dQdAAC4FXEVGAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMB0CEAAAMJ1KEYAWLFigwMBAubu7q1OnTtqxY8d1x69atUpBQUFyd3dXcHCw1q1bZ7PeMAxNmzZNAQEBqlq1qkJDQ3XgwAFHbgIAALiJVHgAWrlypWJiYhQbG6vU1FSFhIQoLCxMJ0+eLHL81q1bFRkZqZEjR2rXrl0KDw9XeHi49uzZYx0za9Ysvf7661q4cKG2b98uT09PhYWF6cKFC+W1WQAAoBKr8AA0Z84cjR49WtHR0WrZsqUWLlwoDw8PxcfHFzl+3rx56t27t8aPH68WLVpo5syZateunebPny/pt70/c+fO1ZQpU9S/f3+1adNG77zzjo4dO6Y1a9aU45YBAIDKqkID0KVLl5SSkqLQ0FDrMicnJ4WGhio5ObnI1yQnJ9uMl6SwsDDr+IMHDyojI8NmjLe3tzp16nTNOQEAgLm4VGTxU6dOKT8/X35+fjbL/fz8tG/fviJfk5GRUeT4jIwM6/qry6415o8uXryoixcvWp9nZ2dLknJyckqwNcVXcDGv2GNzLEbxJ75GvyWpVxE12cbi1WQby1iTbbR/vWvUNMM2OrSmGT7nHPTv69V/tw3jxr1UaACqLOLi4jRjxoxCy+vXr18B3djyLsngl0s0utLUZBsrQU220f71KqIm22j/ehVR0wyfc3baxms5e/asvL2vX6NCA5Cvr6+cnZ114sQJm+UnTpyQv79/ka/x9/e/7vir/z1x4oQCAgJsxrRt27bIOSdNmqSYmBjr84KCAv3666+qVauWLBZLibfLXnJyclS/fn0dPnxYXl5et2RNtvHWqMk23ho12UZq3iz1rsUwDJ09e1Z169a94dgKDUCurq5q3769EhMTFR4eLum38JGYmKixY8cW+ZrOnTsrMTFRTz/9tHVZQkKCOnfuLElq1KiR/P39lZiYaA08OTk52r59ux5//PEi53Rzc5Obm5vNsho1apRp2+zJy8ur3P9ClXdNtvHWqMk23ho12UZq3iz1inKjPT9XVfghsJiYGEVFRalDhw7q2LGj5s6dq9zcXEVHR0uShg0bpnr16ikuLk6SNG7cOPXo0UOzZ89W3759tWLFCu3cuVOLFi2SJFksFj399NP6+9//rmbNmqlRo0aaOnWq6tataw1ZAADA3Co8AA0ePFiZmZmaNm2aMjIy1LZtW61fv956EnN6erqcnP53sVqXLl20fPlyTZkyRZMnT1azZs20Zs0atW7d2jpmwoQJys3N1aOPPqozZ86oa9euWr9+vdzd3ct9+wAAQOVT4QFIksaOHXvNQ15JSUmFlkVERCgiIuKa81ksFr3wwgt64YUX7NVihXBzc1NsbGyhw3O3Uk228daoyTbeGjXZRmreLPXswWIU51oxAACAW0iF3wkaAACgvBGAAACA6RCAAACA6RCAAACA6RCAKqkFCxYoMDBQ7u7u6tSpk3bs2OHQev/97391//33q27durJYLFqzZo1D68XFxemOO+5Q9erVVadOHYWHh2v//v0Oq/fmm2+qTZs21pt0de7cWZ9//rnD6v3Ryy+/bL1HlaNMnz5dFovF5hEUFOSwelcdPXpUQ4YMUa1atVS1alUFBwdr586dDqsXGBhYaDstFovGjBnjkHr5+fmaOnWqGjVqpKpVq6pJkyaaOXNmsb5rqLTOnj2rp59+Wg0bNlTVqlXVpUsXff3113ab/0a/74ZhaNq0aQoICFDVqlUVGhqqAwcOOLTm6tWr1atXL+sd+NPS0hxW7/Lly5o4caKCg4Pl6empunXratiwYTp27JjDakq//Y4GBQXJ09NTPj4+Cg0N1fbt2x1W7/cee+wxWSwWzZ07t9T1ilNz+PDhhX43e/fuXaaajkIAqoRWrlypmJgYxcbGKjU1VSEhIQoLC9PJkycdVjM3N1chISFasGCBw2r83pdffqkxY8Zo27ZtSkhI0OXLl9WrVy/l5uY6pN5tt92ml19+WSkpKdq5c6f+/Oc/q3///vruu+8cUu/3vv76a7311ltq06aNw2u1atVKx48ftz62bNni0HqnT5/WXXfdpSpVqujzzz/X999/r9mzZ8vHx8dhNb/++mubbUxISJCk694aoyxeeeUVvfnmm5o/f7727t2rV155RbNmzdIbb7zhkHqSNGrUKCUkJOjdd9/V7t271atXL
4WGhuro0aN2mf9Gv++zZs3S66+/roULF2r79u3y9PRUWFiYLly44LCaubm56tq1q1555ZVS1yhuvby8PKWmpmrq1KlKTU3V6tWrtX//fvXr189hNSXpT3/6k+bPn6/du3dry5YtCgwMVK9evZSZmemQeld9/PHH2rZtW7G+HsIeNXv37m3zO/r++++Xua5DGKh0OnbsaIwZM8b6PD8/36hbt64RFxdXLvUlGR9//HG51Lrq5MmThiTjyy+/LLeaPj4+xj//+U+H1jh79qzRrFkzIyEhwejRo4cxbtw4h9WKjY01QkJCHDZ/USZOnGh07dq1XGv+0bhx44wmTZoYBQUFDpm/b9++xogRI2yWDRgwwHjkkUccUi8vL89wdnY21q5da7O8Xbt2xt/+9je71/vj73tBQYHh7+9vvPrqq9ZlZ86cMdzc3Iz333/fITV/7+DBg4YkY9euXXapdaN6V+3YscOQZBw6dKjcamZnZxuSjI0bNzqs3pEjR4x69eoZe/bsMRo2bGi89tprZa51vZpRUVFG//797VbDkdgDVMlcunRJKSkpCg0NtS5zcnJSaGiokpOTK7Azx8rOzpYk1axZ0+G18vPztWLFCuXm5lq/Q85RxowZo759+9r8eTrSgQMHVLduXTVu3FiPPPKI0tPTHVrvk08+UYcOHRQREaE6dero9ttv19tvv+3Qmr936dIl/etf/9KIESMc9sXFXbp0UWJion744QdJ0jfffKMtW7aoT58+Dql35coV5efnF7pzfdWqVR2+R0+SDh48qIyMDJu/s97e3urUqdMt/xlksVjK7XsgL126pEWLFsnb21shISEOqVFQUKChQ4dq/PjxatWqlUNqFCUpKUl16tRR8+bN9fjjjysrK6vcapdEpbgTNP7n1KlTys/Pt34VyFV+fn7at29fBXXlWAUFBXr66ad111132Xylib3t3r1bnTt31oULF1StWjV9/PHHatmypcPqrVixQqmpqXY9d+N6OnXqpKVLl6p58+Y6fvy4ZsyYoW7dumnPnj2qXr26Q2r+/PPPevPNNxUTE6PJkyfr66+/1lNPPSVXV1dFRUU5pObvrVmzRmfOnNHw4cMdVuP5559XTk6OgoKC5OzsrPz8fL344ot65JFHHFKvevXq6ty5s2bOnKkWLVrIz89P77//vpKTk9W0aVOH1Py9jIwMSSryM+jqulvNhQsXNHHiREVGRjr8izzXrl2rhx56SHl5eQoICFBCQoJ8fX0dUuuVV16Ri4uLnnrqKYfMX5TevXtrwIABatSokX766SdNnjxZffr0UXJyspydncutj+IgAKHCjRkzRnv27HH4/902b95caWlpys7O1ocffqioqCh9+eWXDglBhw8f1rhx45SQkFBu30H3+z0Sbdq0UadOndSwYUN98MEHGjlypENqFhQUqEOHDnrppZckSbfffrv27NmjhQsXlksAWrx4sfr06WOXcxuu5YMPPtB7772n5cuXq1WrVkpLS9PTTz+tunXrOmwb3333XY0YMUL16tWTs7Oz2rVrp8jISKWkpDiknpldvnxZgwYNkmEYevPNNx1e75577lFaWppOnTqlt99+W4MGDdL27dtVp04du9ZJSUnRvHnzlJqa6rC9o0V56KGHrD8HBwerTZs2atKkiZKSktSzZ89y66M4OARWyfj6+srZ2VknTpywWX7ixAn5+/tXUFeOM3bsWK1du1abNm3Sbbfd5tBarq6uatq0qdq3b6+4uDiFhIRo3rx5DqmVkpKikydPql27dnJxcZGLi4u+/PJLvf7663JxcVF+fr5D6v5ejRo19Kc//Uk//vijw2oEBAQUCpAtWrRw+KE3STp06JA2btyoUaNGObTO+PHj9fzzz+uhhx5ScHCwhg4dqmeeeUZxcXEOq9mkSRN9+eWXOnfunA4fPqwdO3bo8uXLaty4scNqXnX1c8YMn0FXw8+hQ4eUkJDg8L0/kuTp6ammTZvqzjvv1OLFi+Xi4qLFixfbvc7mzZt18uRJNWjQwPoZdOjQIT377LMKDAy0e71rady4sXx9fR36OVRaBKBKxtXVVe3bt1diYqJ1WUFBgRITEx1+vkp5MgxDY8eO1ccff6wvvvhCjRo1KvceCgoKdPHiRYfM3bNnT+3evVtpaWnWR4cOHfTII48oLS2tXHYFnzt3Tj/99JMCAgIcVuOuu+4qdPuCH374QQ0bNnRYzauWLFmiOnXqqG/fvg6tk5eXJycn249KZ2dnFRQUOLSu9Ns/lgEBATp9+rQ2bNig/v37O7xmo0aN5O/vb/MZlJOTo+3bt99Sn0FXw8+BAwe0ceNG1apVq0L6cNTn0NChQ/Xtt9/afAbVrVtX48eP14YNG+xe71qOHDmirKwsh34OlRaHwCqhmJgYRUVFqUOHDurYsaPmzp2r3NxcRUdHO6zmuXPnbBL6wYMHlZaWppo1a6pBgwZ2rzdmzBgtX75c//73v1W9enXruQXe3t6qWrWq3etNmjRJffr0UYMGDXT27FktX75cSUlJDvsgqF69eqHzmTw9PVWrVi2Hnef03HPP6f7771fDhg117NgxxcbGytnZWZGRkQ6pJ0nPPPOMunTpopdeekmDBg3Sjh07tGjRIi1atMhhNaXf/tFYsmSJoqKi5OLi2I+x+++/Xy+++KIaNGigVq1aadeuXZozZ45GjBjhsJobNmyQYRhq3ry5fvzxR40fP15BQUF2+wy40e/7008/rb///e9q1qyZGjVqpKlTp6pu3boKDw93WM1ff/1V6enp1nvxXA3W/v7+pdrzdL16AQEBGjhwoFJTU7V27Vrl5+dbP4Nq1qwpV1dXu29jrVq19OKLL6pfv34KCAjQqVOntGDBAh09erTUt3C40Xv6x1BXpUoV+fv7q3nz5qWqd6OaNWvW1IwZM/Tggw/K399fP/30kyZMmKCmTZsqLCys1DUdpoKvQsM1vPHGG0aDBg0MV1dXo2PHjsa2bdscWm/Tpk2GpEKPqKgoh9QrqpYkY8mSJQ6pN2LECKNhw4aGq6urUbt2baNnz57Gf/7zH4fUuhZHXwY/ePBgIyAgwHB1dTXq1atnDB482Pjxxx8dVu+qTz/91GjdurXh5uZmBAUFGYsWLXJ4zQ0bNhiSjP379zu8Vk5OjjFu3DijQYMGhru7u9G4cWPjb3/7m3Hx4kWH1Vy5cqXRuHFjw9XV1fD39zfGjBljnDlzxm7z3+j3vaCgwJg6darh5+dnuLm5GT179izze32jmkuWLClyfWxsrN3rXb3UvqjHpk2bHLKN58+fNx544AGjbt26hqurqxEQEGD069fP2LFjh0PqFcUel8Ffr2ZeXp7Rq1cvo3bt2kaVKlWMhg0bGqNHjzYyMjLKVNNRLIbhwNuZAgAAVEKcAwQAAEyHAAQAAEyHAAQAAEyHAAQAAEyHAAQAAEyHAAQAAEyHAAQAAEyHAATAlCwWi9asWVPRbQCoIAQgAJXG8OHDZbFYZLFYVKVKFTVq1EgTJkzQhQsXKro1LV26VDVq1LB5frVXZ2dn+fj4qFOnTnrhhReUnZ1d
cY0CKBYCEIBKpXfv3jp+/Lh+/vlnvfbaa3rrrbcUGxtb0W0VycvLS8ePH9eRI0e0detWPfroo3rnnXfUtm1b63daAaicCEAAKhU3Nzf5+/urfv36Cg8PV2hoqBISEqzrs7KyFBkZqXr16snDw0PBwcF6//33bea4++679dRTT2nChAmqWbOm/P39NX369OvWjY2NVUBAgL799tti92qxWOTv76+AgAC1aNFCI0eO1NatW3Xu3DlNmDChRNsNoHwRgABUWnv27NHWrVttvp37woULat++vT777DPt2bNHjz76qIYOHaodO3bYvHbZsmXy9PTU9u3bNWvWLL3wwgs2QeoqwzD05JNP6p133tHmzZvVpk2bMvVcp04dPfLII/rkk0+Un59fprkAOI5LRTcAAL+3du1aVatWTVeuXNHFixfl5OSk+fPnW9fXq1dPzz33nPX5k08+qQ0bNuiDDz5Qx44drcvbtGljPXTWrFkzzZ8/X4mJibr33nutY65cuaIhQ4Zo165d2rJli+rVq2eXbQgKCtLZs2eVlZWlOnXq2GVOAPZFAAJQqdxzzz168803lZubq9dee00uLi568MEHrevz8/P10ksv6YMPPtDRo0d16dIlXbx4UR4eHjbz/HFPTkBAgE6ePGmz7JlnnpGbm5u2bdsmX19fu22DYRiSfjtEBqBy4hAYgErF09NTTZs2VUhIiOLj47V9+3YtXrzYuv7VV1/VvHnzNHHiRG3atElpaWkKCwvTpUuXbOapUqWKzXOLxaKCggKbZffee6+OHj2qDRs22HUb9u7dKy8vL9WqVcuu8wKwHwIQgErLyclJkydP1pQpU3T+/HlJ0ldffaX+/ftryJAhCgkJUePGjfXDDz+Uav5+/fpp+fLlGjVqlFasWGGXnk+ePKnly5crPDxcTk58xAKVFb+dACq1iIgIOTs7a8GCBZJ+O58nISFBW7du1d69e/XXv/5VJ06cKPX8DzzwgN59911FR0frww8/LNFrDcNQRkaGjh8/rr179yo+Pl5dunSRt7e3Xn755VL3BMDxOAcIQKXm4uKisWPHatasWXr88cc1ZcoU/fzzzwoLC5OHh4ceffRRhYeHl+nmgwMHDlRBQYGGDh0qJycnDRgwoFivy8nJUUBAgCwWi7y8vNS8eXNFRUVp3Lhx8vLyKnU/ABzPYlw9Ww8AAMAkOAQGAABMhwAEAABMhwAEAABMhwAEAABMhwAEAABMhwAEAABMhwAEAABMhwAEAABMhwAEAABMhwAEAABMhwAEAABMhwAEAABM5/8BtX0lvKMuBToAAAAASUVORK5CYII=", + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rank_idRDMA bandwidth(GB/s)RDMA size(mb)RDMA time(ms)SDMA bandwidth(GB/s)SDMA size(mb)SDMA time(ms)
00009.766842507.34694399984352.225880000002
100010.165342507.3467759997954181.611080000001
200010.47142507.3467759997954059.527798999999
30009.969142507.3467759997954263.9230400000015
40009.146942507.3467759997954647.202435000001
50009.466342507.3467759997954490.373999999999
60009.569242507.3467759997954442.106745000001
70009.844442507.3467759997954317.931616999999
800018.89542507.3899522249.662369
900018.911242507.390808000062247.7420159999997
1000018.771342507.390808000062264.48576
1100018.838942507.390808000062256.3606000000004
1200018.768742507.390808000062264.8021099999996
1300018.971742507.390808000062240.5713950000004
1400018.922642507.390808000062246.381839999999
1500018.834642507.390808000062256.8781
" + ], "text/plain": [ - "
" + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| rank_id | RDMA bandwidth(GB/s) | RDMA size(mb) | RDMA time(ms) | SDMA bandwidth(GB/s) | SDMA size(mb) | SDMA time(ms) |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 0 | 0 | 0 | 0 | 9.7668 | 42507.3469439998 | 4352.225880000002 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 1 | 0 | 0 | 0 | 10.1653 | 42507.346775999795 | 4181.611080000001 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 2 | 0 | 0 | 0 | 10.471 | 42507.346775999795 | 4059.527798999999 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 3 | 0 | 0 | 0 | 9.9691 | 42507.346775999795 | 4263.9230400000015 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 4 | 0 | 0 | 0 | 9.1469 | 42507.346775999795 | 4647.202435000001 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 5 | 0 | 0 | 0 | 9.4663 | 42507.346775999795 | 4490.373999999999 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 6 | 0 | 0 | 0 | 9.5692 | 42507.346775999795 | 4442.106745000001 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 7 | 0 | 0 | 0 | 9.8444 | 42507.346775999795 | 4317.931616999999 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 8 | 0 | 0 | 0 | 18.895 | 42507.389952 | 2249.662369 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 9 | 0 | 0 | 0 | 18.9112 | 42507.39080800006 | 2247.7420159999997 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 10 | 0 | 0 | 0 | 18.7713 | 42507.39080800006 | 2264.48576 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 11 | 0 | 0 | 0 | 18.8389 | 42507.39080800006 | 2256.3606000000004 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 12 | 0 | 0 | 0 | 18.7687 | 42507.39080800006 | 2264.8021099999996 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 13 | 0 | 0 | 0 | 18.9717 | 42507.39080800006 | 2240.5713950000004 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 14 | 0 | 0 | 0 | 18.9226 | 42507.39080800006 | 2246.381839999999 |\n", + 
"+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+\n", + "| 15 | 0 | 0 | 0 | 18.8346 | 42507.39080800006 | 2256.8781 |\n", + "+---------+----------------------+---------------+---------------+----------------------+--------------------+--------------------+" ] }, "metadata": {}, @@ -241,43 +592,72 @@ } ], "source": [ - "# 设置展示图大小\n", - "fig, ax = plt.subplots(figsize=(10,8))\n", - "\n", - "x = np.arange(len(rank_ids)) # the label locations\n", - "\n", - "rects1 = ax.bar(x - width/2, sdma_bw, width, label='SDMA')\n", - "rects2 = ax.bar(x + width/2, rdma_bw, width, label='RDMA')\n", - "\n", - "# Add some text for labels, title and custom x-axis tick labels, etc.\n", - "ax.set_ylabel('Bandwidth(GB/s)')\n", - "ax.set_xlabel('Rank ID')\n", - "ax.set_title('Transport Bandwidth')\n", - "ax.set_xticks(x)\n", - "ax.set_xticklabels(rank_ids)\n", - "ax.legend()\n", - "print(words)" + "slow_link_data = slow_link_result.get(\"slow_link_analysis\")\n", + "if slow_link_data:\n", + " slow_link_table = PrettyTable(slow_link_data.get(\"headers\"))\n", + " for row in slow_link_data.get(\"data\"):\n", + " for i in range(len(row)):\n", + " row[i] = fill(str(row[i]), width=60)\n", + " slow_link_table.add_row(row)\n", + " slow_link_table.hrules = ALL\n", + " display(slow_link_table[:16])" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "77d6efa1-48e3-409f-82c4-3e2b3d868898", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "RDMA bandwidth(GB/s): \n", - "The average is 0.041, while the maximum is 0.041GB/s and the minimum is 0.041GB/s. the difference is 0.0GB/s. \n", - "SDMA bandwidth(GB/s): \n", - "The average is 0.054, while the maximum is 0.056GB/s and the minimum is 0.052GB/s. the difference is 0.003GB/s. \n" - ] + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problem description
slow_rank_analysis Computing has some issues in the cluster, because the max difference of Computing time
has reached 2411.538ms. Communication has some issues in the cluster, because the max
difference of Communication time has reached 3989.506ms.
slow_link_analysis SDMA bandwidth(GB/s): The average is 14.332, while the maximum is 18.972GB/s and the
minimum is 9.147GB/s. the difference is 9.825GB/s.
" + ], + "text/plain": [ + "+--------------------+------------------------------------------------------------------------------------------------------+\n", + "| problem | description |\n", + "+--------------------+------------------------------------------------------------------------------------------------------+\n", + "| slow_rank_analysis | Computing has some issues in the cluster, because the max difference of Computing time |\n", + "| | has reached 2411.538ms. Communication has some issues in the cluster, because the max |\n", + "| | difference of Communication time has reached 3989.506ms. |\n", + "| slow_link_analysis | SDMA bandwidth(GB/s): The average is 14.332, while the maximum is 18.972GB/s and the |\n", + "| | minimum is 9.147GB/s. the difference is 9.825GB/s. |\n", + "+--------------------+------------------------------------------------------------------------------------------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "print(dataset.get('bottleneck'))" + "problems = slow_link_result.get(\"problems\")\n", + "headers = problems.get('headers')[:2]\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " problem_table = PrettyTable(headers)\n", + " for row in problems.get(\"data\"):\n", + " row = [fill(str(element), width=100) for element in row]\n", + " problem_table.add_row(row[:2])\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to slow link analysis.\")" ] }, { @@ -290,17 +670,30 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 66, + "id": "466a0f30-042c-492a-bbf2-a5a85b649f95", + "metadata": {}, + "outputs": [], + "source": [ + "from advisor_backend.interface import Interface\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 68, "id": "e05774e9-c47e-400f-8421-b4b71bcdcbc4", "metadata": {}, "outputs": [], "source": [ + "interface = Interface(cluster_path)\n", "dataset = interface.get_data('cluster', 'kernel')" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 69, "id": "e95b6849-1738-4975-929f-734edff5d1c1", "metadata": {}, "outputs": [ @@ -342,72 +735,72 @@ " \n", " 0\n", " 0\n", - " Add\n", - " \"1024,2,5120;1024,2,5120\"\n", - " DT_BF16;DT_BF16\n", - " \"1024,2,5120\"\n", - " 45.012050\n", - " 82.952748\n", - " 55.9255\n", - " 35.3108\n", - " 16\n", - " 720.1928\n", + " Add100\n", + " \"4096,10880;4096,10880\"\n", + " FLOAT;FLOAT\n", + " \"4096,10880\"\n", + " 478.210918\n", + " 237.729252\n", + " 721.420\n", + " 449.80\n", + " 1024\n", + " 489687.980\n", " \n", " \n", " 1\n", " 0\n", - " Add\n", - " \"2,8192,5120;2,8192,5120\"\n", - " DT_BF16;DT_BF16\n", - " \"2,8192,5120\"\n", - " 447.183700\n", - " NaN\n", - " 447.1837\n", - " 447.1837\n", - " 1\n", - " 447.1837\n", + " Add102\n", + " \"21760;21760\"\n", + " FLOAT;FLOAT\n", + " \"21760\"\n", + " 4.390391\n", + " 0.011915\n", + " 4.820\n", + " 3.98\n", + " 1024\n", + " 4495.760\n", " \n", " \n", " 2\n", " 0\n", - " Add\n", - " \"8192,2,1920;1920\"\n", - " DT_BF16;DT_BF16\n", - " \"8192,2,1920\"\n", - " 54.330850\n", - " 1.342846\n", - " 55.2456\n", - " 52.6463\n", - " 4\n", - " 217.3234\n", + " Add106\n", + " \"21760,4096;21760,4096\"\n", + " FLOAT;FLOAT\n", + " \"21760,4096\"\n", + " 933.504395\n", + " 462.979321\n", + " 1257.140\n", + " 927.38\n", + " 1024\n", + " 955908.500\n", " \n", " \n", " 3\n", " 0\n", - " Add\n", - " \"8192,2,2560;2560\"\n", - " DT_BF16;DT_BF16\n", - " \"8192,2,2560\"\n", - " 
75.485375\n", - " 0.761315\n", - " 76.2802\n", - " 74.2407\n", - " 4\n", - " 301.9415\n", + " Add111\n", + " \"4096,4096;4096,4096\"\n", + " FLOAT;FLOAT\n", + " \"4096,4096\"\n", + " 91.267363\n", + " 2.158275\n", + " 97.120\n", + " 85.12\n", + " 1024\n", + " 93457.780\n", " \n", " \n", " 4\n", " 0\n", - " Add\n", - " \";\"\n", + " Add118\n", + " \"12288,4096;12288,4096\"\n", " FLOAT;FLOAT\n", - " \"\"\n", - " 1.200884\n", - " 0.017257\n", - " 1.4996\n", - " 0.9597\n", - " 50\n", - " 60.0442\n", + " \"12288,4096\"\n", + " 526.312012\n", + " 1462.617511\n", + " 787.780\n", + " 424.24\n", + " 1024\n", + " 538943.500\n", " \n", " \n", " ...\n", @@ -424,124 +817,124 @@ " ...\n", " \n", " \n", - " 1441\n", + " 2513\n", " 15\n", - " atomic_memset-1_67_1998432_1_0\n", - " \"\"\n", - " UNDEFINED\n", - " \"\"\n", - " 3.160000\n", - " NaN\n", - " 3.1600\n", - " 3.1600\n", - " 1\n", - " 3.1600\n", + " trans_Cast_12\n", + " \"4096,1,1,128\"\n", + " FLOAT\n", + " \"4096,1,1,128\"\n", + " 8.486495\n", + " 0.060174\n", + " 9.820\n", + " 8.20\n", + " 2048\n", + " 17380.342\n", " \n", " \n", - " 1442\n", + " 2514\n", " 15\n", - " trans_Cast_14\n", - " \"1\"\n", + " trans_Cast_13\n", + " \"4096,1,1,128\"\n", " FLOAT\n", - " \"1\"\n", - " 1.390000\n", - " 0.023067\n", - " 1.6000\n", - " 1.2600\n", - " 4\n", - " 5.5600\n", + " \"4096,1,1,128\"\n", + " 10.534564\n", + " 0.166380\n", + " 12.900\n", + " 9.48\n", + " 2048\n", + " 21574.787\n", " \n", " \n", - " 1443\n", + " 2515\n", " 15\n", - " trans_Cast_15\n", - " \"\"\n", - " INT32\n", - " \"\"\n", - " 64.445000\n", - " 36.276100\n", - " 70.3000\n", - " 59.2000\n", - " 4\n", - " 257.7800\n", + " trans_Cast_14\n", + " \"4096,1,1,128\"\n", + " FLOAT\n", + " \"4096,1,1,128\"\n", + " 9.784551\n", + " 0.295368\n", + " 13.021\n", + " 8.56\n", + " 2048\n", + " 20038.761\n", " \n", " \n", - " 1444\n", + " 2516\n", " 15\n", - " trans_Cast_4\n", - " \"1\"\n", - " FLOAT\n", - " \"1\"\n", - " 1.555000\n", - " 0.035857\n", - " 1.9400\n", - " 1.3200\n", - " 8\n", - " 12.4400\n", + " trans_Cast_15\n", + " \"4096,1,1,128\"\n", + " DT_BF16\n", + " \"4096,1,1,128\"\n", + " 8.342211\n", + " 0.120471\n", + " 10.220\n", + " 7.86\n", + " 2048\n", + " 17084.848\n", " \n", " \n", - " 1445\n", + " 2517\n", " 15\n", - " trans_Cast_5\n", - " \"\"\n", - " INT32\n", - " \"\"\n", - " 62.895000\n", - " 15.584200\n", - " 69.8600\n", - " 56.7600\n", - " 8\n", - " 503.1600\n", + " trans_Cast_16\n", + " \"4096,1,1,128\"\n", + " DT_BF16\n", + " \"4096,1,1,128\"\n", + " 9.507589\n", + " 0.117111\n", + " 11.681\n", + " 9.18\n", + " 2048\n", + " 19471.543\n", " \n", " \n", "\n", - "

1446 rows × 11 columns

\n", + "

2518 rows × 11 columns

\n", "" ], "text/plain": [ - " rank id Name Input Shapes \\\n", - "0 0 Add \"1024,2,5120;1024,2,5120\" \n", - "1 0 Add \"2,8192,5120;2,8192,5120\" \n", - "2 0 Add \"8192,2,1920;1920\" \n", - "3 0 Add \"8192,2,2560;2560\" \n", - "4 0 Add \";\" \n", - "... ... ... ... \n", - "1441 15 atomic_memset-1_67_1998432_1_0 \"\" \n", - "1442 15 trans_Cast_14 \"1\" \n", - "1443 15 trans_Cast_15 \"\" \n", - "1444 15 trans_Cast_4 \"1\" \n", - "1445 15 trans_Cast_5 \"\" \n", + " rank id Name Input Shapes Input Data Types \\\n", + "0 0 Add100 \"4096,10880;4096,10880\" FLOAT;FLOAT \n", + "1 0 Add102 \"21760;21760\" FLOAT;FLOAT \n", + "2 0 Add106 \"21760,4096;21760,4096\" FLOAT;FLOAT \n", + "3 0 Add111 \"4096,4096;4096,4096\" FLOAT;FLOAT \n", + "4 0 Add118 \"12288,4096;12288,4096\" FLOAT;FLOAT \n", + "... ... ... ... ... \n", + "2513 15 trans_Cast_12 \"4096,1,1,128\" FLOAT \n", + "2514 15 trans_Cast_13 \"4096,1,1,128\" FLOAT \n", + "2515 15 trans_Cast_14 \"4096,1,1,128\" FLOAT \n", + "2516 15 trans_Cast_15 \"4096,1,1,128\" DT_BF16 \n", + "2517 15 trans_Cast_16 \"4096,1,1,128\" DT_BF16 \n", "\n", - " Input Data Types Output Shapes Duration(us)_mean Duration(us)_var \\\n", - "0 DT_BF16;DT_BF16 \"1024,2,5120\" 45.012050 82.952748 \n", - "1 DT_BF16;DT_BF16 \"2,8192,5120\" 447.183700 NaN \n", - "2 DT_BF16;DT_BF16 \"8192,2,1920\" 54.330850 1.342846 \n", - "3 DT_BF16;DT_BF16 \"8192,2,2560\" 75.485375 0.761315 \n", - "4 FLOAT;FLOAT \"\" 1.200884 0.017257 \n", - "... ... ... ... ... \n", - "1441 UNDEFINED \"\" 3.160000 NaN \n", - "1442 FLOAT \"1\" 1.390000 0.023067 \n", - "1443 INT32 \"\" 64.445000 36.276100 \n", - "1444 FLOAT \"1\" 1.555000 0.035857 \n", - "1445 INT32 \"\" 62.895000 15.584200 \n", + " Output Shapes Duration(us)_mean Duration(us)_var Duration(us)_max \\\n", + "0 \"4096,10880\" 478.210918 237.729252 721.420 \n", + "1 \"21760\" 4.390391 0.011915 4.820 \n", + "2 \"21760,4096\" 933.504395 462.979321 1257.140 \n", + "3 \"4096,4096\" 91.267363 2.158275 97.120 \n", + "4 \"12288,4096\" 526.312012 1462.617511 787.780 \n", + "... ... ... ... ... \n", + "2513 \"4096,1,1,128\" 8.486495 0.060174 9.820 \n", + "2514 \"4096,1,1,128\" 10.534564 0.166380 12.900 \n", + "2515 \"4096,1,1,128\" 9.784551 0.295368 13.021 \n", + "2516 \"4096,1,1,128\" 8.342211 0.120471 10.220 \n", + "2517 \"4096,1,1,128\" 9.507589 0.117111 11.681 \n", "\n", - " Duration(us)_max Duration(us)_min Duration(us)_count Duration(us)_sum \n", - "0 55.9255 35.3108 16 720.1928 \n", - "1 447.1837 447.1837 1 447.1837 \n", - "2 55.2456 52.6463 4 217.3234 \n", - "3 76.2802 74.2407 4 301.9415 \n", - "4 1.4996 0.9597 50 60.0442 \n", - "... ... ... ... ... \n", - "1441 3.1600 3.1600 1 3.1600 \n", - "1442 1.6000 1.2600 4 5.5600 \n", - "1443 70.3000 59.2000 4 257.7800 \n", - "1444 1.9400 1.3200 8 12.4400 \n", - "1445 69.8600 56.7600 8 503.1600 \n", + " Duration(us)_min Duration(us)_count Duration(us)_sum \n", + "0 449.80 1024 489687.980 \n", + "1 3.98 1024 4495.760 \n", + "2 927.38 1024 955908.500 \n", + "3 85.12 1024 93457.780 \n", + "4 424.24 1024 538943.500 \n", + "... ... ... ... 
\n", + "2513 8.20 2048 17380.342 \n", + "2514 9.48 2048 21574.787 \n", + "2515 8.56 2048 20038.761 \n", + "2516 7.86 2048 17084.848 \n", + "2517 9.18 2048 19471.543 \n", "\n", - "[1446 rows x 11 columns]" + "[2518 rows x 11 columns]" ] }, - "execution_count": 12, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } @@ -563,6 +956,13 @@ }, { "cell_type": "markdown", + "id": "ae45826394463cc4", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "## 4) 展示集群流水并行图\n", "使用说明: \n", @@ -574,15 +974,28 @@ "\n", "示例图:\n", "![pipeline_view](../../profiler/test/resource/pipeline_view.png)" - ], - "metadata": { - "collapsed": false - }, - "id": "ae45826394463cc4" + ] }, { "cell_type": "code", - "outputs": [], + "execution_count": 70, + "id": "baf66781eccfbca1", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] Start to process 8 rank profiling data with 8 workers.\n", + "[INFO] Pipline view data process finished, cost 98.48s.\n" + ] + } + ], "source": [ "import json\n", "\n", @@ -594,11 +1007,15 @@ "# 保存json数据,在chrome trace中查看\n", "with open(\"./pipeline_view.json\", \"w\") as f:\n", " json.dump(dataset.get(\"data\", []), f)" - ], - "metadata": { - "collapsed": false - }, - "id": "baf66781eccfbca1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f34ecf5-5c4a-4bc0-a761-e6338e534bac", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -617,7 +1034,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.7" + "version": "3.9.10" } }, "nbformat": 4, diff --git a/profiler/advisor/common/__init__.py b/profiler/advisor/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/common/analyzer_scopes.py b/profiler/advisor/common/analyzer_scopes.py new file mode 100644 index 0000000000000000000000000000000000000000..592f9d421e2bfad53a9ea621d951ae0166221623 --- /dev/null +++ b/profiler/advisor/common/analyzer_scopes.py @@ -0,0 +1,14 @@ +class SupportedScopes: + + # used for specify fourth-level commands and define the key of the result dict + # the key defined bellow must be the same as value + TIMELINE_FUSION_OPS = "timeline_fusion_ops" + GRAPH = "graph" + SLOW_RANK = "slow_rank" + SLOW_LINK = "slow_link" + OVER_ALL = "over_all" + DYNAMIC_SHAPE_ANALYSIS = "dynamic_shape_analysis" + AICPU_ANALYSIS = "aicpu_analysis" + BLOCK_DIM_ANALYSIS = "block_dim_analysis" + OPERATOR_NO_BOUND_ANALYSIS = "operator_no_bound_analysis" + TIMELINE_OP_DISPATCH = "timeline_op_dispatch" diff --git a/profiler/advisor/common/constant.py b/profiler/advisor/common/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..40aaac94b1c1e7f88a56c8a5b0d15e8814b9f61d --- /dev/null +++ b/profiler/advisor/common/constant.py @@ -0,0 +1,140 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# timeline +DEQUEUE = "Dequeue" +DEQUEUE_SEP = "@" +ATEN = "aten" +NPU = "npu" +ATEN_SEP = "::" +OPTIMIZER = "Optimizer" +OPTIMIZER_SEP = "#" +OPTIMIZER_STEP = "step" +ENQUEUE = "enqueue" +TORCH_TO_NPU = "torch_to_npu" +OP_COMPILE_NAME = "AscendCL@aclopCompileAndExecute" +OP_COMPILE_ID = "aclopCompileAndExecute" +MAX_OP_COMPILE_NUM = 20 +ACL_TO_NPU = "acl_to_npu" +TASK_TYPE = "Task Type" +CPU_OP = "cpu_op" +AI_CORE = "AI_CORE" +AI_CPU = "AI_CPU" +CALL_STACKS = "Call stack" +INPUT_DIMS = "Input Dims" +OP_SEP = "-" +MA_ADVISOR_MAX_PROCESSES = 16 +MA_ADVISOR_ANALYZE_PROCESSES = "MA_ADVISOR_ANALYZE_PROCESSES" +TIMELINE_OP_STACKS_DATASET = "timeline_op_stacks_dataset" +TIMELINE_BACKWARD_NO_STACK = "Backward broadcast, without call stacks in profiling." +TIMELINE_ACL_TO_NPU_NO_STACK = "Incoming flow is 'acl_to_npu', without call stacks in profiling." +TIMELINE_BACKWARD_NO_STACK_CODE = -1 +TIMELINE_ACL_TO_NPU_NO_STACK_CODE = -2 +TIMELINE_FUSION_OPS_NO_STACK_FLAG = "NO STACK" +NO_STACK_REASON_MAP = { + TIMELINE_BACKWARD_NO_STACK_CODE: "Backward broadcast, without call stacks in profiling.", + TIMELINE_ACL_TO_NPU_NO_STACK_CODE: "Incoming flow is 'acl_to_npu', without call stacks in profiling." +} +TIMELINE_API_DOC_URL = "https://support.huaweicloud.com/bestpractice-modelarts/modelarts_10_2516.html" +AFFINITY_TRAINING_API = "Affinity training api" +TIMELINE_WITH_STACK_DOC_URL = "https://www.hiascend.com/document/detail/zh/canncommercial/" \ + "70RC1/modeldevpt/ptmigr/AImpug_0067.html" +PyTorch_AOE_OPERATOR_TUNE_URL = "https://www.hiascend.com/document/detail/zh/canncommercial/" \ + "70RC1/devtools/auxiliarydevtool/aoe_16_045.html" +MSLite_Infer_AOE_OPEATOR_TUNE_URL = "https://www.mindspore.cn/lite/docs/en/master/use/cloud_infer/converter_tool_ascend.html#aoe-auto-tuning" +ENABLE_COMPILED_TUNE_URL = "https://www.hiascend.com/document/detail/zh/canncommercial/" \ + "70RC1/modeldevpt/ptmigr/AImpug_0059.html" + +ASCEND_PROFILER_URL = "https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/modeldevpt/ptmigr/AImpug_0067.html" +TIMELINE_EMPTY_STACKS_PROMPT = "These APIs have no code stack. If parameter 'with_stack=False' while profiling, " \ + "please refer to {timeline_profiling_doc_url} to set 'with_stack=True'. " \ + "Otherwise, ignore following affinity APIs due to backward broadcast lack of stack." 
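+
+# Hypothetical usage note (illustrative, not part of this module): the prompt above
+# is a str.format template whose only placeholder is `timeline_profiling_doc_url`, e.g.
+#   TIMELINE_EMPTY_STACKS_PROMPT.format(timeline_profiling_doc_url=TIMELINE_WITH_STACK_DOC_URL)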
+ +CLUSTER_ANALYSIS = "Cluster analysis" +SLOW_RANK_TIME_RATIO_THRESHOLD = 0.05 + +# version_control +CANN_VERSION_C30 = '6.3.RC2' +CANN_VERSION_C13 = '7.0.RC1' +CANN_VERSION_C15 = '7.0.0' +CANN_VERSION_C17 = '8.0.0' +SUPPORTED_CANN_VERSION = [CANN_VERSION_C30, CANN_VERSION_C13, CANN_VERSION_C15, CANN_VERSION_C17] +DEFAULT_CANN_VERSION = CANN_VERSION_C17 +ASCEND_PYTORCH_PROFILER = "ascend_pytorch_profiler" +MSLITE = "mslite" +MSPROF = "msprof" +SUPPORTED_PROFILING_TYPE = [ASCEND_PYTORCH_PROFILER, MSLITE, MSPROF] +DEFAULT_PROFILING_TYPE = ASCEND_PYTORCH_PROFILER +TORCH_VERSION_1_11_0 = '1.11.0' +TORCH_VERSION_2_1_0 = '2.1.0' + +SUPPORTED_TORCH_VERSION = [TORCH_VERSION_1_11_0, TORCH_VERSION_2_1_0] +DEFAULT_TORCH_VERSION = TORCH_VERSION_2_1_0 + +TERMINAL_OUTPUT_HEADERS = ["No.", "Problem", "Description", "Suggestion"] +SKIP_ANALYZE_PROMPT = "Finish analysis, no optimization suggestions" +SKIP_QUERY_PROMPT = "Finish query operator stack, no operators" + +# operator output constant +OPERATOR_OUT_TOPK = 10 +OPERATOR_LIST_UNLIMIT = -1 + +DEFAULT_OPERATOR_TYPE = 'None_type' +DEFAULT_DURATION_ZERO = 0.0 + +ADVISOR_LOG_LEVEL = "ADVISOR_LOG_LEVEL" +DEFAULT_LOG_LEVEL = "INFO" +SUPPORTED_LOG_LEVEL = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] + +RULE_BUCKET = "RULE-BUCKET" +CLOUD_RULE_REGION_CN_NORTH_9 = "cn-north-9" +CLOUD_RULE_REGION_CN_NORTH_7 = "cn-north-7" +CLOUD_RULE_REGION_CN_SOUTHWEST_2 = "cn-southwest-2" +CLOUD_RULE_REGION_LIST = [CLOUD_RULE_REGION_CN_NORTH_7, CLOUD_RULE_REGION_CN_NORTH_9, CLOUD_RULE_REGION_CN_SOUTHWEST_2] +INNER_REGION_LIST = [CLOUD_RULE_REGION_CN_NORTH_7] +DEFAULT_CLOUD_RULE_REGION = CLOUD_RULE_REGION_CN_SOUTHWEST_2 + +HTTP_PREFIXES = "http://" +HTTPS_PREFIXES = "https://" +COMMON_YAML_DIR = "modelarts/solution/ma_advisor_rules/" +COMMON_ENDPOINT_SUFFIX = "obs.{}.myhuaweicloud.com" +INNER_ENDPOINT_SUFFIX= "obs.{}.ulanqab.huawei.com" + +AICPU_RULES_YAML_NAME = "aicpu_rules.yaml" +FUSION_PASS_YAML_NAME = "op_fusion_pass.yaml" +TIMELINE_FUSION_OPS_YAML_NAME = "timeline_fusion_ops.yaml" +CLOUD_YAML_NAME_LIST = [AICPU_RULES_YAML_NAME, FUSION_PASS_YAML_NAME, TIMELINE_FUSION_OPS_YAML_NAME] + +MAX_RETRIES = 3 +TIMEOUT = 3 + +ADVISOR_RULE_PATH = "ADVISOR_RULE_PATH" +CLOUD_RULE_PATH = "rules/cloud/" +DEFAULT_RULE_PATH = "./rules/" + +TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID = -1 + +DEFAULT_TEMPLATE_HEADER = "Performance Optimization Suggestions" + +PT_PROF_SUFFIX = "ascend_pt" +ASCEND_PROFILER_OUTPUT = "ASCEND_PROFILER_OUTPUT" +COLLECTION_PATH = "collection_path" +CLUSTER_ANALYSIS_OUTPUT = "cluster_analysis_output" +KERNEL_DETAILS_CSV = "kernel_details.csv" +CLUSTER_STEP_TIME_CSV = "cluster_step_trace_time.csv" +CLUSTER_COMM_JSON = "cluster_communication.json" + +BOTTLENECK = "bottleneck" +DATA = "data" \ No newline at end of file diff --git a/profiler/advisor/common/graph/__init__.py b/profiler/advisor/common/graph/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/common/graph/graph.py b/profiler/advisor/common/graph/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..6bab2042de3a09f9317f71fc6a5c9740743cc790 --- /dev/null +++ b/profiler/advisor/common/graph/graph.py @@ -0,0 +1,135 @@ +import logging +from typing import Dict, List, Tuple, Callable, Any, Optional, Union + +import networkx as nx + +from profiler.advisor.common.graph.graph_parser import HostGraphNode, QueryGraphNode + +logger = logging.getLogger() + + +class Graph: + """ + Graph Struct + """ 
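+
+    # Usage sketch (illustrative, assuming `parser` exposes nodes/edges, e.g. a
+    # HostGraphParser):
+    #   g = Graph(nodes=parser.nodes, edges=parser.edges, name="host_graph")
+    #   nx_graph = g.build()  # returns the underlying networkx.DiGraph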
+
+    # pylint: disable=too-many-instance-attributes
+    def __init__(self,
+                 nodes: Dict[str, Optional[Union[HostGraphNode, QueryGraphNode]]] = None,
+                 edges: List[Tuple[Optional[Union[HostGraphNode, QueryGraphNode]],
+                                   Optional[Union[HostGraphNode, QueryGraphNode]]]] = None,
+                 name: str = None):
+        self.name = name
+        self.graph = nx.DiGraph(name=name)
+        self.nodes = nodes if nodes is not None else {}
+        self.edges = edges if edges is not None else list()
+
+    def build(self):
+        for op_name, node in self.nodes.items():
+            # add the node and record its op_type as a node attribute
+            self.add_node(node,
+                          op_type=node.op_type
+                          )
+        for edge in self.edges:
+            self.add_edge(*edge)
+        return self.graph
+
+    def get_size(self) -> Dict[str, int]:
+        if not hasattr(self.graph, "nodes"):
+            return {"edges": 0, "nodes": 0}
+
+        return {"edges": len(self.graph.edges),
+                "nodes": len(self.graph.nodes)}
+
+    def add_node(self, node: HostGraphNode, **kwargs):
+        if node is None:
+            return
+        self.graph.add_node(node, **kwargs)
+
+    def add_edge(self, pre_node: HostGraphNode, next_node: HostGraphNode):
+        if pre_node is None or next_node is None:
+            return
+
+        if pre_node not in self.graph or \
+                next_node not in self.graph:
+            logger.error("Both nodes of an edge must already exist in the graph.")
+            return
+
+        self.graph.add_edge(pre_node, next_node)
+
+    def add_node_with_edge(self, node, adj_nodes: List[HostGraphNode]):
+        self.add_node(node)
+        for adj in adj_nodes:
+            self.add_edge(node, adj)
+
+    def remove_node(self, node: HostGraphNode = None) -> None:
+        if node is None:
+            return
+
+        self.graph.remove_node(node)
+
+    def remove_edge(self, pre_node: HostGraphNode = None, next_node: HostGraphNode = None) -> None:
+        if pre_node is None or next_node is None:
+            raise ValueError(f"Invalid edge from {pre_node} to {next_node}.")
+
+        # delegate to networkx; calling self.remove_edge here would recurse forever
+        self.graph.remove_edge(pre_node, next_node)
+
+    def get_subgraph(self, nodes: List[HostGraphNode]) -> nx.DiGraph:
+        nodes = list(set(nodes))
+        for node in nodes:
+            if not self.is_node_exists(node):
+                raise ValueError(f"Failed to subtract subgraph because {node.op_name} is not in the graph.")
+
+        return self.graph.subgraph(nodes)
+
+    def highlight_subgraph(self, subgraph: nx.DiGraph = None) -> None:
+        pass
+
+    def get_node(self, node: HostGraphNode):
+        if node not in self.graph:
+            return
+
+        return self.graph[node]
+
+    def get_node_by_name(self, node_name: str):
+        return self.nodes.get(node_name, None)
+
+    def is_node_exists(self, node: HostGraphNode):
+        return node in self.graph
+
+    def draw(self,
+             graph: nx.DiGraph = None,
+             with_labels: bool = False,
+             labels: Dict[HostGraphNode, Any] = None,
+             pos_func: Callable = None,
+             font_weight: str = "bold",
+             savefig: bool = False,
+             node_size: int = 50,
+             **kwargs
+             ):
+        try:
+            import matplotlib.pylab as plt
+        except ImportError:
+            logger.error('Please install matplotlib first by using `pip install matplotlib`.')
+            return
+
+        if graph is None:
+            graph = self.graph
+
+        pos = pos_func(graph) if pos_func is not None else None
+
+        if with_labels:
+            if labels is None:
+                # node attributes only carry op_type (set in build), so take the
+                # human-readable name from the node object itself
+                labels = {k: f"{k}\n({k.op_name})" for k in graph.nodes}
+
+        nx.draw(graph,
+                with_labels=with_labels,
+                pos=pos,
+                node_size=node_size,
+                font_weight=font_weight,
+                labels=labels,
+                **kwargs
+                )
+        if savefig:
+            plt.savefig(self.name + ".png")
+        plt.show()
diff --git a/profiler/advisor/common/graph/graph_match.py b/profiler/advisor/common/graph/graph_match.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0dfc162952b0c52bf9ed73cef2ff18ff5ffda24
--- /dev/null
+++
b/profiler/advisor/common/graph/graph_match.py @@ -0,0 +1,355 @@ +import itertools +import logging +from functools import lru_cache +from collections import deque +from typing import Dict, Generator, List, Callable, Hashable, Tuple + +import networkx as nx + + +@lru_cache() +def match_node_attr_fun(query_node: Hashable, + host_node: Hashable, + query_graph: nx.Graph, + host_graph: nx.Graph + ) -> bool: + """ + Check query node matches the attributes in host graph + + :param query_node: Query graph node + :param host_node: Host graph node + :param query_graph: Query Graph + :param host_graph: Host graph + :return: bool, match or not + """ + # get node attr + if query_node not in query_graph.nodes or host_node not in host_graph.nodes: + return False + + query_node = query_graph.nodes[query_node] + host_node = host_graph.nodes[host_node] + for attr, val in query_node.items(): + if attr not in host_node: + return False + if isinstance(host_node[attr], str) and isinstance(val, str): + if host_node[attr].lower() != val.lower(): + return False + else: + if host_node[attr] != val: + return False + return True + + +@lru_cache() +def match_node_struct_fun(query_node: Hashable, + host_node: Hashable, + query_graph: nx.Graph, + host_graph: nx.Graph + ) -> bool: + """ + Check query node matches the structure in host graph + + :param query_node: Query graph node + :param host_node: Host graph node + :param query_graph: Query Graph + :param host_graph: Host graph + :return: bool, match or not + """ + if query_node not in query_graph.nodes or host_node not in host_graph.nodes: + return False + + return host_graph.degree(host_node) >= query_graph.degree(query_node) + + +@lru_cache() +def match_edge_attr_fun(query_edge: Tuple[Hashable, Hashable], + host_edge: Tuple[Hashable, Hashable], + query_graph: nx.Graph, + host_graph: nx.Graph + ) -> bool: + """ + Check query edge matches the attr in host graph + + :param query_edge: Query graph edge + :param host_edge: Host graph edge + :param query_graph: Query Graph + :param host_graph: Host graph + :return: bool, match or not + """ + # get edge attr + if query_edge not in query_graph.edges or host_edge not in host_graph.edges: + return False + + query_edge = query_graph.edges[query_edge] + host_edge = host_graph.edges[host_edge] + for attr, val in query_edge.items(): + if attr not in host_edge: + return False + if isinstance(host_edge[attr], str) and isinstance(val, str): + if host_edge[attr].lower() != val.lower(): + return False + else: + if host_edge[attr] != val: + return False + return True + + +def find_isomorphisms(query_graph: nx.Graph, + host_graph: nx.Graph, + *args, + _node_attr_fun: Callable = match_node_attr_fun, + _node_struct_fun: Callable = match_node_struct_fun, + _edge_attr_fun: Callable = match_edge_attr_fun, + limit: int = None, + **kwargs) -> List[Dict[Hashable, Hashable]]: + """ + Find all the sub graphs that are isomorphic to query_graph in host_graph . + + :param query_graph: The graph object to query + :param host_graph: The graph object to be queried + :param args: Position args + :param _node_attr_fun: The function to match node attr + :param _node_struct_fun: The function to match node structural + :param _edge_attr_fun: The function to match edge attr + :param limit: The limitation for the number of returned mappings + :param kwargs: Keyword args + :return: Matched node mapping list + ``` + [{query_id: host_id, ...}, ...] 
+ ``` + """ + candidates = [] + for query_result in find_isomorphisms_iter( + query_graph, + host_graph, + *args, + _node_attr_fun=_node_attr_fun, + _node_struct_fun=_node_struct_fun, + _edge_attr_fun=_edge_attr_fun, + **kwargs + ): + candidates.append(query_result) + if limit and len(candidates) >= limit: + return candidates + return candidates + + +def find_isomorphisms_iter(query_graph: nx.Graph, + host_graph: nx.Graph, + directed: bool = None, + _node_attr_fun: Callable = None, + _node_struct_fun: Callable = None, + _edge_attr_fun: Callable = None, + ) -> Generator[Dict[Hashable, Hashable], None, None]: + """ + A generation to find one isomorphic subgraph in host_graph for query_graph. + + :param query_graph: The graph object to query + :param host_graph: The graph object to be queried + :param directed: Whether direction should be considered during search + :param _node_attr_fun: The function to match node attr + :param _node_struct_fun: The function to match node structural + :param _edge_attr_fun: The function to match edge attr + :return: Yield mappings from query node IDs to host graph IDs: {query_id: host_id, ...} + + """ + if directed is None: + # query graph and host graph should consider directions. + if isinstance(query_graph, nx.DiGraph) and \ + isinstance(host_graph, nx.DiGraph): + directed = True + else: + directed = False + + # Initialize queue + dq = deque() + dq.appendleft({}) + + while len(dq) > 0: + backbone = dq.pop() + next_candidate_backbones = get_next_candidates(backbone=backbone, + query_graph=query_graph, + host_graph=host_graph, + directed=directed, + _node_attr_fun=_node_attr_fun, + _node_struct_fun=_node_struct_fun, + _edge_attr_fun=_edge_attr_fun, + ) + for candidate in next_candidate_backbones: + # find a legal isomorphism + if len(candidate) == len(query_graph): + yield candidate + else: + # continue to search + dq.appendleft(candidate) + + +def get_next_candidates( + backbone: Dict, + query_graph: nx.Graph, # noqa + host_graph: nx.Graph, # noqa + next_node: Hashable = None, + directed: bool = True, # noqa + _node_attr_fun: Callable = None, # noqa + _node_struct_fun: Callable = None, # noqa + _edge_attr_fun: Callable = None # noqa +) -> List[Dict[Hashable, Hashable]]: + """ + Get a list of candidate node assignments for the next "step" of this map. 
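+
+    For example (illustrative): given backbone {q0: h3} and a query node q1 adjacent
+    to q0, every returned mapping extends the backbone by exactly one assignment,
+    e.g. [{q0: h3, q1: h7}, {q0: h3, q1: h9}].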
+
+    :param backbone: Mapping of query node IDs to one set of host graph IDs
+    :param query_graph: The graph object to query
+    :param host_graph: The graph object to be queried
+    :param next_node: Optional suggestion for the next node to assign
+    :param directed: Whether direction should be considered during search
+    :param _node_attr_fun: The function to match node attr
+    :param _node_struct_fun: The function to match node structural
+    :param _edge_attr_fun: The function to match edge attr
+    :return: List[Dict[Hashable, Hashable]]: A new list of node mappings with one additional element mapped
+    """
+    node_priority = {n: 1 for n in query_graph.nodes}
+    candidate_nodes = []
+
+    if next_node is None and len(backbone) == 0:
+        # Start case
+        next_node = max(node_priority.keys(),
+                        key=lambda x: node_priority.get(x, 0))
+
+        for node in host_graph.nodes:
+            if _node_attr_fun(next_node, node, query_graph, host_graph) and \
+                    _node_struct_fun(next_node, node, query_graph, host_graph):
+                candidate_nodes.append({next_node: node})
+        return candidate_nodes
+
+    nodes_with_maximum_backbone = []
+    for query_node_id in query_graph.nodes:
+        if query_node_id in backbone:
+            continue
+
+        backbone_neighbors = []
+        if not directed:
+            backbone_neighbors = query_graph.adj[query_node_id]
+        else:
+            # nx.DiGraph.pred: A <- B: find previous node from B to A
+            # nx.DiGraph.adj: A -> B : find next node from A to B
+            backbone_neighbors = list(set(query_graph.adj[query_node_id]).union(set(query_graph.pred[query_node_id])))
+
+        query_backbone_node_count = sum([1 for _node in backbone_neighbors if _node in backbone])
+        if query_backbone_node_count > 0:
+            # Find a longer backbone node
+            nodes_with_maximum_backbone.append(query_node_id)
+
+    # next_node is connected to the current backbone.
+    next_node = max(nodes_with_maximum_backbone, key=lambda x: node_priority.get(x, 0))
+
+    # verify all edges between `next_node` and nodes in the backbone exist in the host graph
+    # Step1: find all edges between `next_node` and nodes in the backbone
+    next_edge_edges = []
+    for _node in query_graph.adj[next_node]:
+        if _node in backbone:
+            # `next_node` -> `_node`
+            next_edge_edges.append((None, next_node, _node))
+
+    if directed:
+        for _node in query_graph.pred[next_node]:
+            if _node in backbone:
+                # `_node` -> `next_node`
+                next_edge_edges.append((_node, next_node, None))
+
+    if len(next_edge_edges) == 0:
+        logging.warning("Found a node without any edge, which is invalid.")
+        return []
+    # Step2: verify candidate nodes that have such edges in the host graph
+    candidate_nodes = []
+    if len(next_edge_edges) == 1:
+        source, _, target = next_edge_edges[0]
+        if not directed:
+            candidate_nodes = list(host_graph.adj[backbone[target]])
+        else:
+            if source is not None:
+                # `source` is already mapped, so follow its outgoing edges
+                candidate_nodes = list(host_graph.adj[backbone[source]])
+            elif target is not None:
+                # `target` is already mapped, so follow its incoming edges
+                candidate_nodes = list(host_graph.pred[backbone[target]])
+
+    elif len(next_edge_edges) > 1:
+        candidate_nodes_set = set()
+        # iterate over the collected edges themselves, not the (still empty)
+        # candidate list, and intersect the candidates implied by each edge
+        for (source, _, target) in next_edge_edges:
+            if not directed:
+                candidate_nodes_from_this_edge = host_graph.adj[backbone[target]]
+            else:
+                if source is not None:
+                    candidate_nodes_from_this_edge = host_graph.adj[backbone[source]]
+                else:  # target is not None
+                    candidate_nodes_from_this_edge = host_graph.pred[backbone[target]]
+
+            if len(candidate_nodes_set) > 0:
+                candidate_nodes_set = candidate_nodes_set.intersection(candidate_nodes_from_this_edge)
+            else:
+                # Initialize candidate_nodes_set
+                candidate_nodes_set.update(candidate_nodes_from_this_edge)
+        candidate_nodes = list(candidate_nodes_set)
+
+    tentative_results = []
+    for _node in candidate_nodes:
+        if all([_node not in backbone.values(),
+                _node_attr_fun(next_node, _node, query_graph, host_graph),
+                _node_struct_fun(next_node, _node, query_graph, host_graph)]
+               ):
+
tentative_results.append({**backbone, + next_node: _node}) + + final_candidates = check_edges_mapping(tentative_results, + query_graph=query_graph, + host_graph=host_graph, + _edge_attr_fun=_edge_attr_fun) + return final_candidates + + +def check_edges_mapping(candidates: List[Dict[Hashable, Hashable]], + query_graph: nx.Graph, + host_graph: nx.Graph, + _edge_attr_fun: Callable = None + ) -> List[Dict[Hashable, Hashable]]: + """ + Check that all edges between the assigned nodes exist in the host graph. + + :param candidates: mapping nodes candidates + :param query_graph: The graph object to query + :param host_graph: The graph object to be queried + :param _edge_attr_fun: The function to match edge attr + :return: + """ + monomorphism_candidates = [] + + for candidate in candidates: + if len(candidate) != len(query_graph): + monomorphism_candidates.append(candidate) + continue + + all_pass_flag = True + for edge_start, edge_end in query_graph.edges: + # check edge in host graph + if not host_graph.has_edge(candidate[edge_start], candidate[edge_end]): + all_pass_flag = False + break + + # check edge attr + if _edge_attr_fun is None or not _edge_attr_fun( + (edge_start, edge_end), + (candidate[edge_start], candidate[edge_end]), + query_graph, + host_graph + ): + all_pass_flag = False + break + + if all_pass_flag: + monomorphism_candidates.append(candidate) + + # Isomorphisms check + final_candidates = [] + for candidate in monomorphism_candidates: + all_product = itertools.product(candidate.keys(), candidate.keys()) + for edge_start, edge_end in all_product: + if not query_graph.has_edge(edge_start, edge_end) and \ + host_graph.has_edge(candidate[edge_start], candidate[edge_end]): + break + else: + final_candidates.append(candidate) + return final_candidates diff --git a/profiler/advisor/common/graph/graph_parser.py b/profiler/advisor/common/graph/graph_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..d4c67fc1918af37a837e016bd9e5b813957b1aef --- /dev/null +++ b/profiler/advisor/common/graph/graph_parser.py @@ -0,0 +1,413 @@ +import os +import logging +import yaml +import itertools +from collections import deque +from dataclasses import dataclass +from typing import List, Tuple, Dict + +logger = logging.getLogger() + + +@dataclass +class Tensor: + def __init__(self): + super().__init__() + self.shape = [] + self.origin_shape = [] + self.shape_range = [] + self.origin_shape_range = [] + self.dtype = "" + self.origin_data_type = "" + self.format = "" + self.origin_format = [] + + +@dataclass +class Attr: + + def __init__(self): + super().__init__() + self.key = str() + self.value = [] + + +class HostGraphNode: + def __init__(self): + super().__init__() + self.graph_name = str() + self.op_name = str() + self.op_type = str() + self.inputs = [] + self.input = [] + self.outputs = [] + self.output = [] + self.strides = [] + self.pads = [] + self.groups = "" + self.dilations = [] + self.kernelname = "" + self._attrs = [] + + def __repr__(self): + return f"" + + +@dataclass +class HostGraph: + def __init__(self): + super().__init__() + self.name = "" + self.nodes = {} + self.inputs = [] + self.edges = [] + self.model_name = None + self.file_path = None + + def build(self): + """build a graph""" + for name, node in self.nodes.items(): + for input_node in node.inputs: + if input_node not in self.nodes: + continue + self.nodes[input_node].outputs.append(name) + + +class HostGraphParser: + """ + Parse graph metadata from text file + """ + def __init__(self, file_path): + 
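+        # Parsing happens eagerly in the constructor: self.nodes and self.edges
+        # are fully populated once __init__ returns.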
self.buffer = deque(maxlen=100) + self.line_no = 0 + self._file_path = file_path + self.edges: List[Tuple[HostGraphNode, HostGraphNode]] = [] + self.nodes: Dict[str, HostGraphNode] = {} + self.graphs = self._parse(self._file_path) + self._get_node_dict() + self._get_edges_list() + del self.graphs[0] + + @staticmethod + def _get_key_value( line): + res = line.split(':', 1) + return res[0].strip(), res[1].strip().strip('"') + + @staticmethod + def _parse_attr(key, value, obj): + if not isinstance(obj, list) and not obj: + return + if key == "dim" and hasattr(obj, "shape"): + obj.shape.append(value) + elif key == "name" and hasattr(obj, "op_name"): + obj.op_name = value + elif key == "name" and hasattr(obj, "name"): + obj.name = value + elif key == "dtype" and hasattr(obj, "dtype"): + obj.dtype = value + elif key == "layout" and hasattr(obj, "format"): + obj.format = value + elif key == "type" and hasattr(obj, "op_type"): + obj.op_type = value + elif key == "input" and hasattr(obj, "input"): + obj.inputs.append(value.strip('"').split(':')[0]) + elif key == "key" and hasattr(obj, "key"): + obj.key = value + elif hasattr(obj, key): + setattr(obj, key, value) + elif isinstance(obj, list) and key != "val_type": + obj.append(value) + + def _parse_struct(self, in_file, key, in_obj): + + def parse_shape(file, obj): + obj = self._parse_line(file, obj) + + def parse_input_desc(file, obj): + tensor = self._parse_line(file, Tensor()) + if obj and hasattr(obj, "input"): + obj.input.append(tensor) + + def parse_out_desc(file, obj): + tensor = self._parse_line(file, Tensor()) + if obj and hasattr(obj, "output"): + obj.output.append(tensor) + + def parse_op(file, obj: HostGraph): + node = self._parse_line(file, HostGraphNode()) + if hasattr(obj, "name"): + node.graph_name = obj.name + if obj and hasattr(obj, "nodes") and node.op_name: + obj.nodes[node.op_name] = node + + def parse_graph(file, obj): + graph = self._parse_line(file, HostGraph()) + obj.append(graph) + + def parse_attr(file, obj): + attr = self._parse_line(file, Attr()) + if hasattr(obj, attr.key): + if attr.key not in ['format']: + setattr(obj, attr.key, attr.value) + elif attr.key.endswith("_kernelname"): + setattr(obj, "kernelname", attr.value) + if obj and hasattr(obj, "get_attrs"): + obj.get_attrs().append(attr) + + def parse_list(file, obj): + value = [] + self._parse_line(file, value) + if isinstance(obj, list): + obj.append(value) + else: + obj = value + + def parse_value(file, obj): + if hasattr(obj, "value"): + obj.value = self._parse_line(file, obj.value) + + def parse_default(file, _obj=None): + """function with unused argument""" + self._parse_line(file, None) + + parse_methods = { + "shape": parse_shape, + "input_desc": parse_input_desc, + "output_desc": parse_out_desc, + "op": parse_op, + "graph": parse_graph, + "attr": parse_attr, + "list_list_int": parse_list, + "list_list_i": parse_list, + "list": parse_list, + "value": parse_value, + } + parse_methods.get(key, parse_default)(in_file, in_obj) + + def _read_line(self, file): + self.line_no += 1 + line = file.readline() + if line.strip().endswith('}'): + end_line = "" + while self.buffer and not end_line.strip().endswith("{"): + end_line = self.buffer.pop() + else: + self.buffer.append(line) + return line.strip() + + def _parse_line(self, file, obj=None): + line = self._read_line(file) + try: + while line and not line.endswith("}"): + if line.endswith('{'): + key = line.rstrip('{').strip() + self._parse_struct(file, key, obj) + else: + key, value = self._get_key_value(line) + 
self._parse_attr(key, value, obj) + line = self._read_line(file) + except Exception as exception: + if self.buffer: + logger.debug("***********************graph content**************************") + while self.buffer: + line = self.buffer.popleft() + logger.debug(line) + logger.debug("***********************graph content**************************") + raise exception + return obj + + def _parse(self, graph_file): + # pylint:disable=broad-except + graph_list = [] + with open(graph_file, "r", encoding="gbk") as file: + try: + graph_list = self._parse_line(file, graph_list) + except Exception: + logger.error( + "Parse line %s of file %s failed, make sure the format is correct.", self.line_no, graph_file + ) + graphs = [] + for graph in graph_list: + if isinstance(graph, HostGraph): + graphs.append(graph) + for graph in graphs: + graph.model_name = graphs[0].name + graph.file_path = self._file_path + graph.build() + return graphs + + def _get_edges_list(self) -> None: + if len(self.graphs) <= 0: + return + + def is_repeat_edge(edge, edge_collector): + for _edge in edge_collector: + if edge[0].op_name == _edge[0].op_name and edge[1].op_name == _edge[1].op_name: + return True + return False + + for node in self.nodes.values(): + for input_node_name in node.inputs: + if input_node_name not in self.nodes: + continue + input_node = self.nodes[input_node_name] + if not is_repeat_edge((input_node, node), self.edges): + self.edges.append((input_node, node)) + for output_node_name in node.outputs: + if output_node_name not in self.nodes: + continue + output_node = self.nodes[output_node_name] + if not is_repeat_edge((node, output_node), self.edges): + self.edges.append((node, output_node)) + + def _get_node_dict(self) -> None: + if not self.graphs: + self.nodes = {} + return + self.nodes = {node.op_name: node for graph in self.graphs for node in graph.nodes.values()} + + +class QueryGraphNode: + """ + Graph Node + """ + _ID = 0 + + def __init__(self, op_type: str, op_pass: str): + self._op_type = op_type + self._id = QueryGraphNode._ID + self._op_pass = op_pass + QueryGraphNode._ID += 1 + + def get_property(self, name): + """ + get property + """ + return getattr(self, name, lambda: None) + + @property + def op_type(self): + return self._op_type + + @property + def op_name(self): + return self._op_type + "_id_" + str(self._id) + + @property + def op_pass(self): + return self._op_pass + + @op_type.setter + def op_type(self, op_type): + self._op_type = op_type + + def __eq__(self, other): + return self._op_type == other._op_type and \ + self._id == other._id + + def __hash__(self): + return hash(self._op_type + str(self._id)) + + @staticmethod + def trim_string(string: str, length: int = -1): + """ + + Trim string to target length + :param string: Original string + :param length: Target length of string, -1 indicates original string. 
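+
+        For example, trim_string("Conv2DBackpropFilter", 6) returns "Conv2D".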
+ :return: Trimmed string + """ + if string is None or not isinstance(string, str): + raise TypeError(f"Param string must be a string type but got {type(string)}.") + + if length <= -1 or len(string) <= length: + return string + + return string[:length] + + +class QueryGraphParser: + def __init__(self, rule_database_path: str): + self._fusion_rules: Dict[str, List[Tuple]] = dict() + self.load_database(rule_database_path) + self.num_rules = sum([len(v) for v in self._fusion_rules.values()]) + + @property + def fusion_rules(self): + return self._fusion_rules + + def load_database(self, rule_database): + if not os.path.isabs(rule_database): + rule_database = os.path.join(os.path.dirname(__file__), + "../", "../", + rule_database) + + if not os.path.exists(rule_database): + raise FileNotFoundError(f"Path {rule_database} does not exist.") + with open(rule_database, 'r') as f: + database = yaml.safe_load(f) + self.parse_yaml(database) + + def parse_yaml(self, yaml_database): + fusion_strategy_list = yaml_database.get("GraphFusion", []) + if yaml_database.get("UBFusion", []): + fusion_strategy_list.extend(yaml_database.get("UBFusion", [])) + for fusion_strategy in fusion_strategy_list: + if not isinstance(fusion_strategy, dict): + continue + (fusion_name, strategy), = fusion_strategy.items() + version = strategy.get("version", 0) + if version == 0 or version == "0": + self._fusion_rules[fusion_name] = self.build_query_graph_v0(fusion_name, + strategy.get('struct', [])) + elif version == 1 or version == "1": + self._fusion_rules[fusion_name] = self.build_query_graph_v1(fusion_name, + strategy.get('nodes', []), + strategy.get('edges', [])) + + @staticmethod + def build_query_graph_v0(graph_name: str, graph_struct: List[str]) -> List[Tuple]: + nodes = dict() + graphs = [] + edges = [] + + pre_node, next_node = None, None + for node in graph_struct: + pre_node = next_node + next_node = QueryGraphNode(node, graph_name) + nodes[next_node.op_name] = next_node + if pre_node is None or next_node is None: + continue + edges.append((pre_node, next_node,)) + graphs.append((nodes, edges, graph_name,)) + return graphs + + @staticmethod + def build_query_graph_v1(graph_name: str, + nodes_list: List[Dict], + edges_list: List[List[str]]) -> List[Tuple]: + graphs = [] + node_index = dict() + multi_node_list = [] + for index, node in enumerate(nodes_list): + (node_name, op_type), = node.items() + if isinstance(op_type, str): + op_type = [op_type] + multi_node_list.append([QueryGraphNode(op, graph_name) for op in op_type]) + node_index[node_name] = index + + multi_node = list(itertools.product(*multi_node_list)) + + for index, sub_nodes in enumerate(multi_node): + sub_graph_name = graph_name if index == 0 else f"{graph_name}#{index}" + sub_edge = [] + sub_node = dict() + for node in sub_nodes: + sub_node[node.op_name] = node + for edge in edges_list: + pre_node, next_node = edge + pre_node_index, next_node_index = node_index.get(pre_node), node_index.get(next_node) + sub_edge.append((sub_nodes[pre_node_index], sub_nodes[next_node_index])) + sub_graph = (sub_node, sub_edge, sub_graph_name,) + graphs.append(sub_graph) + return graphs diff --git a/profiler/advisor/common/profiling/__init__.py b/profiler/advisor/common/profiling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/common/profiling/ge_info.py b/profiler/advisor/common/profiling/ge_info.py new file mode 100644 index 
0000000000000000000000000000000000000000..9996ec611a2a835bd8dffd24c3fbe7d8817ec29a --- /dev/null +++ b/profiler/advisor/common/profiling/ge_info.py @@ -0,0 +1,47 @@ +""" +DB +""" +import logging +import os +from typing import Any, List + +from sqlalchemy import text + +from profiler.advisor.dataset.profiling.db_manager import ConnectionManager +from profiler.advisor.dataset.profiling.profiling_parser import ProfilingParser + +logger = logging.getLogger() + + +class GeInfo(ProfilingParser): + """ + ge info file + """ + FILE_PATTERN = r"ge_info.db" + FILE_PATTERN_MSG = "ge_info.db" + FILE_INFO = "ge info" + STATIC_OP_STATE = "0" + DYNAMIC_OP_STATE = "1" + + def __init__(self, path: str) -> None: + super().__init__(path) + self.op_state_info_list = None + + def parse_from_file(self, profiling_db_file): + """ + ge info + """ + db_path, db_file = os.path.split(profiling_db_file) + if not ConnectionManager.check_db_exists(db_path, [db_file]): + return False + conn = ConnectionManager(db_path, db_file) + if conn.check_table_exists(['TaskInfo']): + with conn().connect() as sql_conn: + self.op_state_info_list = sql_conn.execute(text("select op_name, op_state from TaskInfo")).fetchall() + return True + + def get_static_shape_operators(self) -> List[Any]: + return [op for op, state in self.op_state_info_list if state == self.STATIC_OP_STATE] + + def get_dynamic_shape_operators(self) -> List[Any]: + return [op for op, state in self.op_state_info_list if state == self.DYNAMIC_OP_STATE] diff --git a/profiler/advisor/common/profiling/msprof.py b/profiler/advisor/common/profiling/msprof.py new file mode 100644 index 0000000000000000000000000000000000000000..9453986b8225ccad68f2135d674e3832d987fcf0 --- /dev/null +++ b/profiler/advisor/common/profiling/msprof.py @@ -0,0 +1,144 @@ +""" +msprof +""" +import logging +from typing import Dict, List + +from profiler.advisor.dataset.profiling.info_collection import TaskInfo +from profiler.advisor.dataset.profiling.profiling_parser import ProfilingParser + +logger = logging.getLogger() + + +class TaskChecker: + """ + check task info + """ + + def __init__(self): + self.sqe_keys = set() + + def is_sqe(self, task: TaskInfo) -> bool: + """check sqe""" + key = (task.pid, task.tid) + if task.args.get('name', '').endswith('_SQE'): + self.sqe_keys.add(key) + return False + + return key in self.sqe_keys + + +class Msprof(ProfilingParser): + """ + msprof + + """ + FILE_PATTERN = r"^msprof[_\d]+.json$" + FILE_PATTERN_MSG = "msprof_*.json" + FILE_INFO = "msprof" + + def __init__(self, path: str) -> None: + super().__init__(path) + self._tasks: List[TaskInfo] = [] + self._iteration_time = 0.0 + self._model_id = None + self._iteration_id = None + self._process_pid: Dict[str, str] = {} + self._min_time = 0.0 + self._max_time = 0.0 + self._data_process_time = 0.0 + self._start_point = 0.0 + + def parse_from_file(self, file: str): + if not self._parse_json(file): + return False + min_time = float('inf') + max_time = 0.0 + task_checker = TaskChecker() + is_iter = False + for item in self._raw_data: + task = TaskInfo(item) + if task.cat == "Iteration Time": + self._min_time = task.start_time + self._max_time = task.end_time + self._iteration_time = task.dur + is_iter = True + if task.cat == "Data_aug Bound" and "Data_aug Bound(us)" in task.args: + self._data_process_time = task.args["Data_aug Bound(us)"] + + if self._start_point == 0 and task.start_time > 0: + self._start_point = task.start_time + + if task_checker.is_sqe(task): + continue + + self._tasks.append(task) + 
self._parse_task(task) + + start_time = task.start_time + dur = task.dur + if start_time == -1 or dur == -1 or dur == 0: + continue + if start_time < min_time: + min_time = start_time + end_time = start_time + dur + if end_time > max_time: + max_time = end_time + if not is_iter: + self._iteration_time = dur + self._max_time = max_time + self._min_time = min_time + if self._tasks: + return True + return False + + def _parse_task(self, task): + if "Iteration Refresh" in task.name: + self._iteration_id = task.args.get("Iteration ID") + elif "Model ID" in task.name: + self._model_id = int(task.name.split(":")[1]) + elif "process_name" == task.name: + self._process_pid[task.args.get("name")] = task.pid + + @property + def step_time(self): + return self._iteration_time + self._data_process_time + + @property + def iteration_time(self): + return self._iteration_time + + @property + def iter_max_time(self): + return self._max_time + + @property + def iter_min_time(self): + return self._min_time + + @property + def data_process_time(self): + return self._data_process_time + + @property + def tasks(self): + return self._tasks + + @property + def model_id(self): + return self._model_id + + @property + def iteration_id(self): + return self._iteration_id + + @property + def process_pid(self): + return self._process_pid + + def __len__(self): + return len(self._tasks) + + @property + def start_point(self): + return self._start_point diff --git a/profiler/advisor/common/profiling/op_summary.py b/profiler/advisor/common/profiling/op_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..d79439dbad8e2c105bed737c1a1c3be1a2cecfc1 --- /dev/null +++ b/profiler/advisor/common/profiling/op_summary.py @@ -0,0 +1,76 @@ +""" +summary +""" +import logging +from decimal import Decimal +from typing import List, Any + +from profiler.advisor.dataset.profiling.info_collection import OpInfo +from profiler.advisor.dataset.profiling.profiling_parser import ProfilingParser +from profiler.advisor.utils.utils import format_excel_title, lazy_property + +logger = logging.getLogger() + + +class OpSummary(ProfilingParser): + """ + op summary + """ + + FILE_PATTERN = r"^op_summary_[_\d]+\.csv$" + FILE_PATTERN_MSG = "op_summary_*.csv" + FILE_INFO = "op summary" + STATIC_OP_STATE = "static" + DYNAMIC_OP_STATE = "dynamic" + + def __init__(self, path: str) -> None: + super().__init__(path) + self.op_list: List[OpInfo] = [] + self._total_task_duration = 0.0 + self._total_task_wait_time = 0.0 + self._raw_data: List[List[str]] = [] + + def parse_from_file(self, file: str): + if not self._parse_csv(file): + return False + title_dict = dict(enumerate(self._raw_data[0])) + for op_data in self._raw_data[1:]: + op_info = OpInfo() + for idx, value in enumerate(op_data): + title = title_dict.get(idx, "") + formatted_title = format_excel_title(title) + if formatted_title == 'task_start_time' and 'us' in title and \ + value.replace('.', '').replace("E+", "").isnumeric(): + value = str(Decimal(value) * Decimal(1000)) + op_info.add_attr(formatted_title, value) + self.op_list.append(op_info) + self._total_task_duration += self.get_float(op_info.get_attr("task_duration")) + self._total_task_wait_time += self.get_float(op_info.get_attr("task_wait_time")) + if not self.op_list: + logger.error("No valid op info in %s", file) + return False + return True + + def get_static_shape_operators(self) -> List[Any]: + return [op_info.get_attr("op_name") for op_info in self.op_list if op_info.get_attr("op_state") == self.STATIC_OP_STATE] + 
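+
+    # Illustrative sketch (assumes the ProfilingParser base class locates the
+    # op_summary CSV and drives parse_from_file):
+    #   summary = OpSummary(profiling_dir)
+    #   static_ops = summary.get_static_shape_operators()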
+ def get_total_task_duration(self): + """ + get total task duration of all operators + :return: + """ + return self._total_task_duration + + @lazy_property + def task_dict(self): + """ + task dict + """ + task_dict = {} + for op_info in self.op_list: + if op_info.op_name not in task_dict: + task_dict[op_info.op_name] = [op_info] + else: + task_dict[op_info.op_name].append(op_info) + + return task_dict diff --git a/profiler/advisor/common/profiling/tasktime.py b/profiler/advisor/common/profiling/tasktime.py new file mode 100644 index 0000000000000000000000000000000000000000..3ce09a783851e94163aa72f423788a373da5eb3a --- /dev/null +++ b/profiler/advisor/common/profiling/tasktime.py @@ -0,0 +1,75 @@ +""" +task time +""" +import logging +from typing import Dict, List + +from profiler.advisor.dataset.profiling.info_collection import TaskInfo +from profiler.advisor.dataset.profiling.profiling_parser import ProfilingParser + +logger = logging.getLogger() + +AICPU_TASK_TYPE = "AI_CPU" +AICORE_TASK_TYPE = "AI_CORE" + + +class TaskTime(ProfilingParser): + """ + task time info + """ + + FILE_PATTERN = r"^task_time_[_\d]+\.json$" + FILE_PATTERN_MSG = "task_time*.json" + FILE_INFO = "task time" + + def __init__(self, path: str) -> None: + super().__init__(path) + self._tasks: List[TaskInfo] = [] + self._aicore_tasks: List[TaskInfo] = [] + self._aicpu_tasks: List[TaskInfo] = [] + self._process_map: Dict[str, str] = {} + self._pid_map: Dict[str, str] = {} + + def get_aicpu_tasks(self): + """ + get aicpu tasks + :return: aicpu tasks + """ + return self._aicpu_tasks + + def get_aicore_tasks(self): + """ + get aicore tasks + :return: aicore tasks + """ + return self._aicore_tasks + + def parse_from_file(self, file: str): + if not self._parse_json(file): + return False + for item in self._raw_data: + if item.get("ph") != "M": # header + continue + if item.get("name") != "process_name": + continue + pid = item.get("pid") + pname = item["args"]["name"] + self._process_map[pid] = pname + self._pid_map[pname] = pid + for item in self._raw_data: + if item.get("ph") == "M": # header + continue + task = TaskInfo(item) + self._tasks.append(task) + if task.pid != self._pid_map.get("Task Scheduler"): + continue + if task.task_type == AICORE_TASK_TYPE: + self._aicore_tasks.append(task) + elif task.task_type == AICPU_TASK_TYPE: + self._aicpu_tasks.append(task) + self._aicore_tasks.sort(key=lambda x: x.start_time) + self._aicpu_tasks.sort(key=lambda x: x.start_time) + if not self._tasks: + logger.error("No valid task info in %s", file) + return False + return True diff --git a/profiler/advisor/common/timeline/__init__.py b/profiler/advisor/common/timeline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/common/timeline/event.py b/profiler/advisor/common/timeline/event.py new file mode 100644 index 0000000000000000000000000000000000000000..6001ac88722e5a77daba1c960e8ccfd6894889e6 --- /dev/null +++ b/profiler/advisor/common/timeline/event.py @@ -0,0 +1,23 @@ +class AdvisorDict(dict): + def __getstate__(self): + return self.__dict__ + + def __setstate__(self, d): + self.__dict__.update(d) + + def __getattr__(self, key: str): + if key not in self: + return {} + + value = self[key] + if isinstance(value, dict): + value = AdvisorDict(value) + return value + + +class TimelineEvent(AdvisorDict): + + def ts_include(self, event): + + return float(self.ts) <= float(event.ts) and float(self.ts) + float(self.dur) >= float(event.ts) 
+ float( + event.dur) \ No newline at end of file diff --git a/profiler/advisor/common/timeline/fusion_ops_db.py b/profiler/advisor/common/timeline/fusion_ops_db.py new file mode 100644 index 0000000000000000000000000000000000000000..8637befd1ab108928bdf8f4fdb19d9cab03ff960 --- /dev/null +++ b/profiler/advisor/common/timeline/fusion_ops_db.py @@ -0,0 +1,269 @@ +import logging +import os + +import yaml + +from profiler.advisor.common import constant +from profiler.advisor.common.timeline.fusion_ops_rule import OpRule +from profiler.advisor.common.timeline.fusion_ops_rule_handler import TimelineOpRuleHandler +from profiler.advisor.utils.log import get_log_level +from profiler.advisor.utils.utils import get_file_path_by_walk + +logger = logging.getLogger() +logger.setLevel(get_log_level()) + + +def init_timeline_ops_db(cann_version=None, torch_version=None): + logger.debug("init operators database") + + return FusionOperatorDB(cann_version=cann_version, torch_version=torch_version) + + +def get_timeline_fusion_ops_yaml_path(): + # 环境变量 ADVISOR_RULE_PATH 不为空且该路径存在, os.walk遍历其下文件, 若存在相应的规则文件则返回路径 + advisor_rule_path = os.getenv(constant.ADVISOR_RULE_PATH) + if advisor_rule_path and os.path.exists(advisor_rule_path): + specified_file_path = get_file_path_by_walk(advisor_rule_path, constant.TIMELINE_FUSION_OPS_YAML_NAME) + if len(specified_file_path.strip()) and os.path.exists(specified_file_path): + logger.debug("Successfully find The %s file which is specified by the environment variable: %s.", + specified_file_path, constant.ADVISOR_RULE_PATH) + return specified_file_path + logger.warning("The %s does not exist in path: %s. Try to use cloud or default local YAML file.", + constant.TIMELINE_FUSION_OPS_YAML_NAME, os.path.normpath(advisor_rule_path)) + # 检查云文件默认保存路径文件夹下是否存在相应文件, 默认路径 ~/rules/cloud/ + cloud_file_path = os.path.join(os.path.expanduser("~"), constant.CLOUD_RULE_PATH, constant.TIMELINE_FUSION_OPS_YAML_NAME) + if os.path.exists(cloud_file_path): + logger.debug("Successfully find The cloud %s file in %s.", constant.TIMELINE_FUSION_OPS_YAML_NAME, + cloud_file_path) + return cloud_file_path + # 检查本地默认文件 + local_file_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + constant.DEFAULT_RULE_PATH, constant.TIMELINE_FUSION_OPS_YAML_NAME) + if not os.path.exists(local_file_path): + # 若本地默认文件不存在, 则log异常信息并 + logger.error("The default local YAML file does not exist. 
+
+
+class FusionOperatorDB:
+
+    def __init__(self, file_path=None, cann_version=None, torch_version=None):
+        self.timeline_fusion_ops_yaml_path = os.path.normpath(file_path or get_timeline_fusion_ops_yaml_path())
+
+        self.cann_version = cann_version or constant.DEFAULT_CANN_VERSION
+        self.torch_version = torch_version or constant.DEFAULT_TORCH_VERSION
+
+        self._supported_version_dict = {}
+
+        self.is_empty = False
+        self.timeline_op_rule_handler = TimelineOpRuleHandler()
+        self.fusion_operator = self._load_yaml(self.timeline_fusion_ops_yaml_path)
+
+        self._dequeue_op_names = []
+        self._aten_op_names = []
+        self._optimizer_op_names = []
+        self._dequeue_op_api_map = {}
+        self._aten_op_api_map = {}
+        self._optimizer_op_api_map = {}
+        self._parse_db()
+
+    @property
+    def dequeue_op_names(self):
+        return self._dequeue_op_names
+
+    @property
+    def aten_op_names(self):
+        return self._aten_op_names
+
+    @property
+    def optimizer_op_names(self):
+        return self._optimizer_op_names
+
+    @property
+    def dequeue_op_api_map(self):
+        return self._dequeue_op_api_map
+
+    @property
+    def aten_op_api_map(self):
+        return self._aten_op_api_map
+
+    @property
+    def optimizer_op_api_map(self):
+        return self._optimizer_op_api_map
+
+    def get_fusion_operator_with_unique_id(self, unique_id):
+        if unique_id == constant.TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID:
+            logger.warning("The specified unique id: %s is invalid. Please check whether the rule of the unique id "
+                           "exists and modify the rule.", constant.TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID)
+            return {}
+        result_tmp_rule = self.timeline_op_rule_handler.get_tmp_timeline_op_rule_with_unique_id(unique_id)
+        result_op_rule = OpRule(result_tmp_rule)
+        return result_op_rule.get_final_rules()
+
+    def regenerate_timeline_op_rule_with_unique_id(self, unique_id):
+        self.fusion_operator.clear()
+        logger.debug("Try to regenerate the rule to version %s.", unique_id)
+        self.fusion_operator = self.get_fusion_operator_with_unique_id(unique_id)
+        self.regenerate_op_api_map_and_op_names()
+
+    def regenerate_timeline_op_rule_with_version(self, cann_version=None, torch_version=None):
+        cann_version = cann_version or self.cann_version
+        torch_version = torch_version or self.torch_version
+        unique_id = self._get_unique_id_in_supported_version_dict(cann_version=cann_version,
+                                                                  torch_version=torch_version)
+        self.regenerate_timeline_op_rule_with_unique_id(unique_id)
+
+    def regenerate_op_api_map_and_op_names(self):
+        self._dequeue_op_names.clear()
+        self._aten_op_names.clear()
+        self._optimizer_op_names.clear()
+        self._dequeue_op_api_map.clear()
+        self._aten_op_api_map.clear()
+        self._optimizer_op_api_map.clear()
+        self._parse_db()
+
+    def _is_version_supported(self, db_content):
+        """Check whether the current version is supported by the rule library and record the
+        supported-version info; each entry may be stored as either a list or a single string."""
+        if db_content is None:
+            logger.warning(
+                "The rule library is empty. Check the rule library file: %s",
+                self.timeline_fusion_ops_yaml_path
+            )
+            return False
+        for rule_dic in db_content:
+            if not isinstance(rule_dic, dict) or rule_dic.get("unique_id") is None:
+                continue
+            cann_version_list = rule_dic.get("cann_version")
+            torch_version_list = rule_dic.get("torch_version")
+            if not cann_version_list or not torch_version_list:
+                continue
+            supported_version = [cann_version_list, torch_version_list]
+
+            unique_id = rule_dic.get("unique_id")
+            if unique_id < 0:
+                logger.warning(
+                    "The unique id: %s of the rule should be a positive integer. "
+                    "Please check and modify the rule configuration in the YAML file: %s.",
+                    unique_id, os.path.normpath(self.timeline_fusion_ops_yaml_path)
+                )
+            self._supported_version_dict[unique_id] = supported_version
+
+        # If no supported-version info could be parsed from the timeline rule library, something is wrong.
+        if not self._supported_version_dict:
+            logger.warning(
+                "The rule library does not contain rules that support the current version. "
+                "Check the rule library file: %s",
+                self.timeline_fusion_ops_yaml_path
+            )
+            return False
+
+        # Check whether the current version is supported by the rule library.
+        is_version_supported = self._is_version_supported_in_supported_version_dict()
+        if not is_version_supported:
+            # Warn when the rule library does not support the current version.
+            logger.warning("Unsupported versions: cann-%s and torch-%s, supported version list of ['cann', 'torch'] "
+                           "is %s", self.cann_version, self.torch_version, self._supported_version_dict.values())
+        return is_version_supported
+
+    def _is_version_supported_in_supported_version_dict(self, cann_version=None, torch_version=None):
+        """Check whether the current version exists in the supported-version dict of the rule library."""
+        for _, supported_version in self._supported_version_dict.items():
+            if self._is_version_supported_in_versions(supported_version, cann_version, torch_version):
+                return True
+        return False
+
+    def _get_unique_id_in_supported_version_dict(self, cann_version=None, torch_version=None) -> int:
+        """Look up the unique id whose supported versions match the current version; check support beforehand."""
+        for key_unique_id, supported_version in self._supported_version_dict.items():
+            if self._is_version_supported_in_versions(supported_version, cann_version, torch_version):
+                return key_unique_id
+        return constant.TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID
+
+    def _is_version_supported_in_versions(self, supported_version, cann_version=None, torch_version=None):
+        """Check whether the current cann and torch versions match one supported-version entry."""
+        cann_version_list = supported_version[0]
+        if not isinstance(cann_version_list, list):
+            cann_version_list = [cann_version_list]
+
+        torch_version_list = supported_version[1]
+        if not isinstance(torch_version_list, list):
+            torch_version_list = [torch_version_list]
+
+        cann_version = cann_version or self.cann_version
+        torch_version = torch_version or self.torch_version
+
+        return (cann_version in cann_version_list) and (torch_version in torch_version_list)
+
+    def _parse_db(self):
+        """Build the output rule database."""
+        self._parse(constant.ATEN)
+        self._parse(constant.DEQUEUE)
+        self._parse(constant.OPTIMIZER)
+
+    def _parse(self, mode):
+        """Build one section of the output rule database, e.g. aten or Optimizer."""
+        op_info = self.fusion_operator.get(mode, []) or []
+        for ops in op_info:
+            for npu_api, op_combined in ops.items():
+                if not isinstance(op_combined, list):
+                    self._parse_in_list(mode, op_combined, npu_api)
+                    continue
+                for _op_combined in op_combined:
+                    self._parse_in_list(mode, _op_combined, npu_api)
+
+    def _parse_in_list(self, mode, op_combined, npu_api):
+        """Build one entry of the output rule database, e.g. {silu: torch_npu.npu_silu/torch_npu.contrib.module.SiLU}."""
+        if not isinstance(op_combined, str):
+            logger.warning("Error type in yaml: %s", op_combined)
+            return
+        mode_str = mode.lower()
+        getattr(self, f"{mode_str}_op_names", []).extend(op_combined.split("-"))
+
+        new_npu_api = npu_api
+        pre_npu_api = getattr(self, f"{mode_str}_op_api_map", {}).get(op_combined)
+        if pre_npu_api:
+            new_npu_api = f"{pre_npu_api}/{npu_api}"
+        getattr(self, f"{mode_str}_op_api_map", {})[op_combined] = new_npu_api
+        logger.debug("Output rule: %s: %s: %s: %s ", mode, op_combined, new_npu_api, op_combined.split("-"))
+
+    def _load_yaml(self, file_path):
+        """Load the timeline rule library from the YAML file."""
+        logger.debug("Try to use the following yaml file as timeline ops rule: %s.", os.path.abspath(file_path))
+        # If the file does not exist, warn and return an empty dict.
+        if not os.path.exists(file_path):
+            logger.warning("Path: '%s' does not exist, please specify an existing path of the "
+                           "fusion operators yaml file by setting env '%s'",
+                           os.path.abspath(file_path), constant.ADVISOR_RULE_PATH)
+            self.is_empty = True
+            return {}
+
+        logger.debug("The rule yaml file is successfully found in path: %s", os.path.abspath(file_path))
+
+        with open(file_path, "rb") as file:
+            db_content = yaml.safe_load(file)
+
+        if not self._is_version_supported(db_content):
+            self.is_empty = True
+            return {}
+
+        logger.debug("The rule library supports the current environment version.")
+
+        # Generate the timeline rule libraries of all versions.
+        self.timeline_op_rule_handler.set_db_content(db_content)
+
+        # Fetch the rules of the required version.
+        unique_id = self._get_unique_id_in_supported_version_dict()
+        logger.debug("Using version %s of the rule.", unique_id)
+        result_op_rule = self.get_fusion_operator_with_unique_id(unique_id)
+        if result_op_rule:
+            return result_op_rule
+
+        logger.warning(
+            "Failed to load the fusion operators database, timeline analysis for affinity apis will be skipped;"
+            " please refer to the database yaml %s to customize your own yaml.",
+            self.timeline_fusion_ops_yaml_path
+        )
+        self.is_empty = True
+        return {}
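# Annotation (illustrative sketch, not part of the patch): typical use of the
# database defined above. The version strings are placeholders.
from profiler.advisor.common.timeline.fusion_ops_db import init_timeline_ops_db

db = init_timeline_ops_db(cann_version="7.0.RC1", torch_version="1.11.0")
if not db.is_empty:
    # e.g. {"silu": "torch_npu.npu_silu/torch_npu.contrib.module.SiLU", ...}
    print(db.aten_op_api_map)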
diff --git a/profiler/advisor/common/timeline/fusion_ops_rule.py b/profiler/advisor/common/timeline/fusion_ops_rule.py
new file mode 100644
index 0000000000000000000000000000000000000000..deee68edb9a92d0588f3f3c155a7b2595317a5c7
--- /dev/null
+++ b/profiler/advisor/common/timeline/fusion_ops_rule.py
@@ -0,0 +1,110 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
+import copy
+import logging
+
+from profiler.advisor.utils.log import get_log_level
+
+logger = logging.getLogger()
+logger.setLevel(get_log_level())
+
+
+class OpRule:
+
+    def __init__(self, rule=None, timeline_op_rule_handler=None):
+        if rule is None:
+            self._tmp_rule = {}
+        else:
+            self._tmp_rule = copy.deepcopy(rule)
+        if timeline_op_rule_handler is None:
+            self.timeline_op_rule_handler = {}
+        else:
+            self.timeline_op_rule_handler = copy.deepcopy(timeline_op_rule_handler)
+        self._rule = {}
+
+    @property
+    def tmp_rule(self):
+        return self._tmp_rule
+
+    @staticmethod
+    def _format_rule(rule):
+        """Normalize extra rules into the {key: list} form, so that operator_rules written
+        as key: str in the yaml file can also be read correctly."""
+        format_rule = {}
+        for key, val in rule.items():
+            if not isinstance(val, list):
+                val = [val]
+            format_rule[key] = val
+        return format_rule
+
+    def merge(self, extra_rule):
+        """Merge extra rules into the existing rule library; without inheritance the
+        existing rule library is expected to be empty."""
+        for key, val in extra_rule.items():
+            for func, op_rules in val.items():
+                try:
+                    getattr(self, f"{func}")(key, op_rules)
+                except AttributeError:
+                    logger.error("Undefined field and function name. Ensure that %s is correct in the rule "
+                                 "library.", func)
+
+    def get_final_rules(self):
+        """Get the final rule library."""
+        self._restore_rule()
+        return self._rule
+
+    def add(self, key, add_rules: dict):
+        """Add extra rules that do not exist in the current rule library yet."""
+        if add_rules is None:
+            return
+        if self._tmp_rule.get(key) is None:
+            self._tmp_rule[key] = {}
+        format_add_rule = self._format_rule(add_rules)
+        for add_key, add_val in format_add_rule.items():
+            logger.debug("add: %s: %s", add_key, add_val)
+            if add_key not in self._tmp_rule[key]:
+                self._tmp_rule[key][add_key] = add_val
+            else:
+                logger.warning("This key has been written to the rule, "
+                               "%s: %s should be written in the overwrite section", add_key, add_val)
+                self._tmp_rule[key][add_key].extend(add_val)
+
+    def overwrite(self, key, overwrite_rules: dict):
+        """Overwrite rules that already exist in the current rule library."""
+        if overwrite_rules is None:
+            return
+        if self._tmp_rule.get(key) is None:
+            self._tmp_rule[key] = {}
+        format_overwrite_rules = self._format_rule(overwrite_rules)
+        for overwrite_key, overwrite_val in format_overwrite_rules.items():
+            logger.debug("overwrite: %s: %s", overwrite_key, overwrite_val)
+            if overwrite_key not in self._tmp_rule[key]:
+                logger.warning("This key is not written to the rule. "
+                               "%s: %s should be written in the add section", overwrite_key, overwrite_val)
+            self._tmp_rule[key][overwrite_key] = overwrite_val
+
+    def exclude(self, key, exclude_rules: list):
+        """Remove the given rules from the current rule library."""
+        if exclude_rules is None:
+            return
+        for exclude_key in exclude_rules:
+            logger.debug("exclude: %s", exclude_key)
+            if isinstance(exclude_key, str):
+                if exclude_key not in self._tmp_rule.get(key, {}):
+                    logger.warning("This key is not written to the rule. "
+                                   "do not need to exclude: %s.", exclude_key)
+                    continue
+                self._tmp_rule[key].pop(exclude_key)
+            else:
+                logger.warning("Error type rule in exclude: %s", exclude_key)
+
+    def inherit_unique_id(self, key, inherit_unique_id):
+        """Local inheritance: copy the given section of the rule version identified by
+        inherit_unique_id into this rule."""
+        result_rule = self.timeline_op_rule_handler.get_tmp_timeline_op_rule_with_unique_id(inherit_unique_id)
+        if result_rule is not None and result_rule.get(key) is not None:
+            self._tmp_rule[key] = copy.deepcopy(result_rule.get(key))
+            return
+        logger.error("Rule library version %s does not exist. ", inherit_unique_id)
+
+    def _restore_rule(self):
+        for key, op_api_map in self._tmp_rule.items():
+            self._rule[key] = [{npu_api: op_combined_list} for npu_api, op_combined_list in op_api_map.items()]
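# Annotation (illustrative sketch, not part of the patch): the rule layout that
# OpRule and the handler below consume. Field values are examples only; the
# functions add / overwrite / exclude / inherit_unique_id map to OpRule methods.
#
#   - unique_id: 0
#     cann_version: 7.0.RC1
#     torch_version: [1.11.0, 2.1.0]
#     operator_rules:
#       aten:
#         add:
#           torch_npu.npu_silu: silu
#   - unique_id: 1
#     cann_version: 8.0.0
#     torch_version: 2.1.0
#     inherit_unique_id: 0      # global inheritance: start from version 0's rules
#     operator_rules:
#       aten:
#         exclude: [torch_npu.npu_silu]
#
# Minimal use of the handler (set_db_content resolves every version eagerly):
#   handler = TimelineOpRuleHandler()
#   handler.set_db_content(yaml.safe_load(rule_file))
#   tmp_rule = handler.get_tmp_timeline_op_rule_with_unique_id(0)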
diff --git a/profiler/advisor/common/timeline/fusion_ops_rule_handler.py b/profiler/advisor/common/timeline/fusion_ops_rule_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0558cca6d951ee057e538b5e4da6d9c2e78111b
--- /dev/null
+++ b/profiler/advisor/common/timeline/fusion_ops_rule_handler.py
@@ -0,0 +1,193 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
+import copy
+import logging
+
+from profiler.advisor.common import constant
+from profiler.advisor.common.timeline.fusion_ops_rule import OpRule
+from profiler.advisor.utils.log import get_log_level
+
+logger = logging.getLogger()
+logger.setLevel(get_log_level())
+
+
+class TimelineOpRuleHandler:
+    """Stores the OpRule of every rule version and resolves local and global inheritance."""
+
+    def __init__(self):
+        self._db_content = None
+        # generated timeline rules, keyed by unique_id
+        self._all_tmp_timeline_op_rule = {}
+        # raw dicts of all timeline rules, keyed by unique_id
+        self._all_origin_timeline_op_rule_dict = {}
+        # unique ids of the timeline rules that have already been generated
+        self._exist_timeline_op_rule_unique_id_list = []
+
+    @staticmethod
+    def _get_local_inherit_id_list(op_rule: dict):
+        local_inherit_id_list = []
+        for _, val in op_rule.items():
+            if val.get("inherit_unique_id") is not None:
+                local_inherit_id_list.append(val.get("inherit_unique_id"))
+        return local_inherit_id_list
+
+    @staticmethod
+    def _is_duplicated_element_in_lists(list_a, list_b):
+        """Check whether any element occurs in both lists; return True if so."""
+        if not isinstance(list_a, list):
+            list_a = [list_a]
+        if not isinstance(list_b, list):
+            list_b = [list_b]
+        # merge both lists and use a set to detect duplicated elements
+        combined_list = list_a + list_b
+        if len(combined_list) != len(set(combined_list)):
+            return True
+        return False
+
+    def set_db_content(self, db_content):
+        # Filter out entries that are not dicts or define no unique_id, and store the rest
+        # in _all_origin_timeline_op_rule_dict.
+        self._db_content = copy.deepcopy(db_content)
+        for rule_dic in self._db_content:
+            if not isinstance(rule_dic, dict) or rule_dic.get("unique_id") is None:
+                continue
+            self._all_origin_timeline_op_rule_dict[rule_dic.get("unique_id")] = rule_dic
+        if self._all_origin_timeline_op_rule_dict:
+            self.generate_all_timeline_op_rule()
+
+    def generate_basic_timeline_op_rules(self):
+        """Generate the rules without global inheritance; such rules are treated as basic
+        versions and are assumed to use no local inheritance either."""
+        for _, rule_dic in self._all_origin_timeline_op_rule_dict.items():
+            if rule_dic.get("inherit_unique_id") is None:
+                self.add_basic_timeline_op_rule(rule_dic)
+
+    def add_basic_timeline_op_rule(self, rule_dic):
+        # Skip basic rules that contain locally inherited rules.
+        local_inherit_id_list = self._get_local_inherit_id_list(rule_dic.get("operator_rules"))
+        if local_inherit_id_list:
+            return
+
+        temp_rule = OpRule()
+        temp_rule.merge(rule_dic.get("operator_rules"))
+
+        unique_id = rule_dic.get("unique_id")
+        logger.debug("The rule of version %s is a basic rule.", unique_id)
+        self.add_new_timeline_op_rule(unique_id, temp_rule.tmp_rule)
+
+    def add_empty_timeline_op_rule(self, unique_id):
+        if self._all_origin_timeline_op_rule_dict.get(unique_id) is None:
+            self._all_origin_timeline_op_rule_dict[unique_id] = {}
+        tmp_rule = {}
+        logger.debug("The rule of version %s is empty.", unique_id)
+        self.add_new_timeline_op_rule(unique_id, tmp_rule)
+
+    def add_new_timeline_op_rule(self, unique_id, tmp_rule):
+        if unique_id not in self._exist_timeline_op_rule_unique_id_list:
+            self._exist_timeline_op_rule_unique_id_list.append(unique_id)
+        self._all_tmp_timeline_op_rule[unique_id] = tmp_rule
+        logger.debug("The rule of version %s is successfully generated.", unique_id)
+
+    def generate_specified_list_timeline_op_rule(self, specified_unique_id_list, kid_id_list=None):
+        for specified_unique_id in specified_unique_id_list:
+            if specified_unique_id not in self._exist_timeline_op_rule_unique_id_list:
+                self.generate_specified_timeline_op_rule(specified_unique_id, kid_id_list)
+
+    def generate_specified_timeline_op_rule(self, specified_unique_id, kid_id_list=None):
+        """Generate the rule of one specified version.
+
+        If no rule exists for specified_unique_id, or the rule has already been generated, or
+        a circular inheritance is detected, the rule is set to empty and the function returns.
+        The rule library file is organized as a multi-way tree, so the search always bottoms
+        out at a basic version; rules are generated recursively until every rule library that
+        specified_unique_id depends on has been generated, after which the specified rule
+        library itself is generated and archived.
+
+        Args:
+            specified_unique_id: unique id of the specified rule version
+            kid_id_list: unique ids of the child rules, used to prevent circular inheritance,
+                e.g. a rule inheriting itself directly or indirectly
+        Returns:
+            None
+        """
+        if kid_id_list is None:
+            kid_id_list = []
+
+        # If timeline_fusion_ops.yaml contains no rule for this unique_id, generate an empty rule for it.
+        if self._all_origin_timeline_op_rule_dict.get(specified_unique_id) is None:
+            logger.warning("The specified version %s does not exist in the rule library. "
+                           "Ensure that the corresponding rule is configured in the YAML file. "
+                           "The version %s is left blank.",
+                           specified_unique_id,
+                           specified_unique_id)
+            self.add_empty_timeline_op_rule(specified_unique_id)
+            return
+
+        # If the rule of this unique_id has already been generated, do not generate it again.
+        if specified_unique_id in self._exist_timeline_op_rule_unique_id_list:
+            logger.warning("The rule has been generated and does not need to be generated again. "
+                           "Check whether unique id %s in the YAML file is duplicate.",
+                           specified_unique_id)
+            return
+
+        # If kid_id_list is not empty and the rule inherits itself indirectly, generate an empty rule instead.
+        if kid_id_list and self._is_duplicated_element_in_lists(specified_unique_id, kid_id_list):
+            logger.warning("It cannot be inherited indirectly. Ensure that the corresponding rules are correctly "
+                           "configured in the YAML file and leave Version %s blank.",
+                           specified_unique_id)
+            self.add_empty_timeline_op_rule(specified_unique_id)
+            return
+
+        rule_dic = self._all_origin_timeline_op_rule_dict.get(specified_unique_id)
+        if rule_dic is not None:
+            kid_id_list.append(specified_unique_id)
+
+            global_inherit_id = rule_dic.get("inherit_unique_id")
+            if global_inherit_id and global_inherit_id not in self._exist_timeline_op_rule_unique_id_list:
+                logger.debug("The rule of version %s globally inherits the rule of version %s",
+                             specified_unique_id, global_inherit_id)
+                self.generate_specified_timeline_op_rule(global_inherit_id, kid_id_list)
+
+            # Generate the locally inherited rules that have not been generated yet.
+            local_inherit_id_list = self._get_local_inherit_id_list(rule_dic.get("operator_rules"))
+            if local_inherit_id_list:
+                logger.debug("The rule of version %s locally inherits the rule of version %s",
+                             specified_unique_id, local_inherit_id_list)
+                self.generate_specified_list_timeline_op_rule(specified_unique_id_list=local_inherit_id_list,
+                                                              kid_id_list=kid_id_list)
+            logger.debug("Start to generate rule of version %s", specified_unique_id)
+            # Apply global and local inheritance.
+            temp_rule = OpRule(timeline_op_rule_handler=self,
+                               rule=self._all_tmp_timeline_op_rule.get(global_inherit_id))
+            temp_rule.merge(rule_dic.get("operator_rules"))
+            # Archive the generated rule.
+            self.add_new_timeline_op_rule(specified_unique_id, temp_rule.tmp_rule)
+            return
+        logger.error("Failed to generate the rule whose unique_id is %s. Ensure that the rule is configured in "
+                     "the YAML file and the version %s is empty.", specified_unique_id, specified_unique_id)
+        self.add_empty_timeline_op_rule(specified_unique_id)
+
+    def generate_all_timeline_op_rule(self):
+        """Generate the rules of all versions.
+
+        The rule library in db_content is organized as a multi-way tree, so the basic
+        versions without inheritance are generated first; the remaining versions are then
+        generated in a loop, recursing down to the basic versions as needed, until every
+        rule library has been generated.
+
+        Args:
+            None
+        Returns:
+            None
+        """
+        self.generate_basic_timeline_op_rules()
+        _unique_id_list = copy.deepcopy(list(self._all_origin_timeline_op_rule_dict.keys()))
+        for unique_id in _unique_id_list:
+            if unique_id in self._exist_timeline_op_rule_unique_id_list:
+                continue
+            self.generate_specified_timeline_op_rule(unique_id)
+
+    def get_tmp_timeline_op_rule_with_unique_id(self, unique_id):
+        if unique_id not in self._exist_timeline_op_rule_unique_id_list:
+            logger.error("The specified unique_id does not exist in the rule library. Ensure that the "
+                         "corresponding rule is configured in the YAML file and the version %s is empty. "
+                         "If the value of unique_id is a negative number, the version may not be supported.",
+                         unique_id)
+            self.add_empty_timeline_op_rule(unique_id)
+        if unique_id < 0:
+            logger.error("Advise to use a positive integer as the unique id of rules. "
+                         "Negative numbers: %s are not recommended to use as unique id. "
+                         "If the invalid unique id: %s is used, an empty rule is returned by default.",
+                         unique_id, constant.TIMELINE_FUSION_OPS_INVALID_UNIQUE_ID)
+        return self._all_tmp_timeline_op_rule.get(unique_id)
diff --git a/profiler/advisor/common/version_control.py b/profiler/advisor/common/version_control.py
new file mode 100644
index 0000000000000000000000000000000000000000..38b054543fc61e90d91e8442a547376cff4c6406
--- /dev/null
+++ b/profiler/advisor/common/version_control.py
@@ -0,0 +1,26 @@
+import logging
+from typing import List
+
+logger = logging.getLogger()
+
+
+class VersionControl:
+    _SUPPORT_VERSIONS = []
+
+    @classmethod
+    def is_supported(cls, cann_version: str) -> bool:
+        """
+        Check whether the CANN software version is supported; the version can be viewed by executing the
+        following command:
+        'cat /usr/local/Ascend/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'
+        """
+        flag = cann_version in cls._SUPPORT_VERSIONS
+        if not flag:
+            logger.debug("class type is %s, which does not support the current CANN version %s", cls.__name__,
+                         cann_version)
+        return flag
+
+    def get_support_version(self) -> List[str]:
+        """
+        Acquire the supported CANN software versions
+        :return: supported CANN software versions
+        """
+        return self._SUPPORT_VERSIONS
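# Annotation (illustrative sketch, not part of the patch): how a checker is expected
# to gate on the CANN version via VersionControl. The subclass name and version
# strings below are hypothetical.
from profiler.advisor.common.version_control import VersionControl

class BlockDimChecker(VersionControl):
    _SUPPORT_VERSIONS = ["7.0.RC1", "8.0.0"]

if BlockDimChecker.is_supported("8.0.0"):
    print("run the block dim check")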
"source": [ + "### Block Dim问题识别\n", + "\n", + "Block Dim问题主要为识别相关core算子AI core核未打满或者Vector 核未打满问题,主要调优手段为AOE调优,由于AOE调优依赖静态shape,所以当所有算子都为动态shape时,将不会检测相关Block Dim问题.\n", + "\n", + "下列代码为样例,主要展示如何检测Block Dim类型问题,并获取相关问题检测结果:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# 查询computation相关是否存在block dim问题\n", + "# 如果profiling数据采集自非8.0.0的CANN版本,需要在训练/推理环境中执行: 'cat CANN安装目录/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'命令查看version\n", + "block_dim_result = interface.get_result(\"computation\", \"block_dim_analysis\", cann_version=\"7.0.RC1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescriptionsuggestionproblem counttotal_time(us)time ratioincome(us)income ratio
block dimsome operator does not make full use of 25 ai core or 50 ai vector core; Top-10
operator of task duration are as follows: Square, MatMulV2, BatchMatMul,
SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2
--model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi
sor\\operator_tuning_file_20240613153259.cfg'
101814.01999999999991.0
" + ], + "text/plain": [ + "+-----------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| problem | description | suggestion | problem count | total_time(us) | time ratio | income(us) | income ratio |\n", + "+-----------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| block dim | some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 | 101 | 814.0199999999999 | 1.0 | | |\n", + "| | operator of task duration are as follows: Square, MatMulV2, BatchMatMul, | --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi | | | | | |\n", + "| | SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | sor\\operator_tuning_file_20240613153259.cfg' | | | | | |\n", + "+-----------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "problems = block_dim_result.get(\"problems\")\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " problem_table = PrettyTable(problems.get(\"headers\"))\n", + " for row in problems.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " problem_table.add_row(row)\n", + " \n", + " problem_table.align = \"l\"\n", + " problem_table.hrules = ALL\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to block dim.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
op_nameop_typetask_typetask_durationincomeblock_dimmix_block_diminput_shapesinput_data_typesinput_formatsoutput_shapesoutput_data_typesoutput_formats
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm-
LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35
SquareAI_VECTOR_CORE42.760160"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm-
LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78
SquareAI_VECTOR_CORE42.240160"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/lm_head-Linear/MatMul-op213MatMulV2AI_CORE39.020200"128,128;128,32000"FLOAT16;FLOAT16FORMAT_ND;FORMAT_ND"128,32000"FLOATFORMAT_ND
" + ], + "text/plain": [ + "+-----------------------------------------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| op_name | op_type | task_type | task_duration | income | block_dim | mix_block_dim | input_shapes | input_data_types | input_formats | output_shapes | output_data_types | output_formats |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm- | Square | AI_VECTOR_CORE | 42.76 | 0 | 16 | 0 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35 | | | | | | | | | | | | |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm- | Square | AI_VECTOR_CORE | 42.24 | 0 | 16 | 0 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers- | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78 | | | | | | | | | | | | |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/lm_head-Linear/MatMul-op213 | MatMulV2 | AI_CORE | 39.02 | 0 | 20 | 0 | \"128,128;128,32000\" | FLOAT16;FLOAT16 | FORMAT_ND;FORMAT_ND | \"128,32000\" | FLOAT | FORMAT_ND |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+--------+-----------+---------------+---------------------+------------------+---------------------+---------------+-------------------+----------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if problems: # 如果存在相关问题则获取相关问题检测细节\n", + " block_dim = block_dim_result.get(\"block dim\")\n", + " block_dim_table = PrettyTable(block_dim.get(\"headers\"))\n", + " for row in block_dim.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " block_dim_table.add_row(row)\n", + "\n", + " block_dim_table.hrules = ALL\n", + " display(block_dim_table[:3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Operator No Bound问题识别\n", + "Operator No Bound问题主要为识别相关算子无mte, cube, vector, scalar相关bound问题,主要调优手段为AOE调优,由于AOE调优依赖静态shape,所以当所有算子都为动态shape时,将不会检测相关Operator No Bound问题.\n", + "\n", + "下列代码为样例,主要展示如何检测Operator No Bound类型问题,并获取相关问题检测结果:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from prettytable import PrettyTable, ALL\n", + "from textwrap 
import fill\n", + "from profiler.advisor.interface.interface import Interface\n", + "\n", + "\n", + "# 配置profiling采集出来的数据,需要指定到的profiling目录是同一个工具采集的,并且需要采集l0级别以上\n", + "profiling_path = r\"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=profiling_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# 查询computation相关是否存在operator no bound问题\n", + "# 如果profiling数据采集自非8.0.0的CANN版本,需要在训练/推理环境中执行: 'cat CANN安装目录/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'命令查看version\n", + "operator_no_bound_result = interface.get_result(\"computation\", \"operator_no_bound_analysis\", cann_version=\"7.0.RC1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
problemdescriptionsuggestionproblem counttotal_time(us)time ratioincome(us)income ratio
block dimsome operator does not make full use of 25 ai core or 50 ai vector core; Top-10
operator of task duration are as follows: Square, MatMulV2, BatchMatMul,
SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2
--model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi
sor\\operator_tuning_file_20240613153259.cfg'
101814.01999999999991.0
operator no boundThere is no mte, cube, vector, scalar ratio is more than 80.00%; Top task
duration operators need to be tuned are as follows: Square, MatMulV2,
BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2
--model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi
sor\\operator_tuning_file_20240613153259.cfg'
95814.01999999999990.7985
" + ], + "text/plain": [ + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| problem | description | suggestion | problem count | total_time(us) | time ratio | income(us) | income ratio |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| block dim | some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 | 101 | 814.0199999999999 | 1.0 | | |\n", + "| | operator of task duration are as follows: Square, MatMulV2, BatchMatMul, | --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi | | | | | |\n", + "| | SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | sor\\operator_tuning_file_20240613153259.cfg' | | | | | |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| operator no bound | There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 | 95 | 814.0199999999999 | 0.7985 | | |\n", + "| | duration operators need to be tuned are as follows: Square, MatMulV2, | --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi | | | | | |\n", + "| | BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | sor\\operator_tuning_file_20240613153259.cfg' | | | | | |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "problems = operator_no_bound_result.get(\"problems\")\n", + "problem_table = PrettyTable(problems.get(\"headers\"))\n", + "if problems: # 如果存在相关问题则获取相关问题检测描述及建议\n", + " for row in problems.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " problem_table.add_row(row)\n", + "\n", + " problem_table.align = \"l\"\n", + " problem_table.hrules = ALL\n", + " display(problem_table)\n", + "else:\n", + " print(\"There is no suggestion related to operator no bound.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
op_nameop_typetask_typetask_durationvec_ratiomac_ratioscalar_ratiomte1_ratiomte2_ratiomte3_ratioblock_diminput_shapesinput_data_typesinput_formatsoutput_shapesoutput_data_typesoutput_formats
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm-
LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35
SquareAI_VECTOR_CORE42.760.465400000.005616"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm-
LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers-
CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78
SquareAI_VECTOR_CORE42.240.46600000.006216"128,128"FLOATNCHW"128,1"FLOATNCHW
Default/lm_head-Linear/MatMul-op213MatMulV2AI_CORE39.0200.11050.01190.08570.4284020"128,128;128,32000"FLOAT16;FLOAT16FORMAT_ND;FORMAT_ND"128,32000"FLOATFORMAT_ND
" + ], + "text/plain": [ + "+-----------------------------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| op_name | op_type | task_type | task_duration | vec_ratio | mac_ratio | scalar_ratio | mte1_ratio | mte2_ratio | mte3_ratio | block_dim | input_shapes | input_data_types | input_formats | output_shapes | output_data_types | output_formats |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm- | Square | AI_VECTOR_CORE | 42.76 | 0.4654 | 0 | 0 | 0 | 0 | 0.0056 | 16 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- | | | | | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35 | | | | | | | | | | | | | | | | |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm- | Square | AI_VECTOR_CORE | 42.24 | 0.466 | 0 | 0 | 0 | 0 | 0.0062 | 16 | \"128,128\" | FLOAT | NCHW | \"128,1\" | FLOAT | NCHW |\n", + "| LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers- | | | | | | | | | | | | | | | | |\n", + "| CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78 | | | | | | | | | | | | | | | | |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+\n", + "| Default/lm_head-Linear/MatMul-op213 | MatMulV2 | AI_CORE | 39.02 | 0 | 0.1105 | 0.0119 | 0.0857 | 0.4284 | 0 | 20 | \"128,128;128,32000\" | FLOAT16;FLOAT16 | FORMAT_ND;FORMAT_ND | \"128,32000\" | FLOAT | FORMAT_ND |\n", + "+-----------------------------------------------------------------------------+----------+----------------+---------------+-----------+-----------+--------------+------------+------------+------------+-----------+---------------------+------------------+---------------------+---------------+-------------------+----------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if problems: # 如果存在相关问题则获取相关问题检测细节\n", + " operator_no_bound = operator_no_bound_result.get(\"operator no bound\")\n", + " operator_no_bound_table = PrettyTable(operator_no_bound.get(\"headers\"))\n", + " for row in operator_no_bound.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " operator_no_bound_table.add_row(row)\n", + " operator_no_bound_table.hrules = ALL\n", + " 
display(operator_no_bound_table[:3])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### AICPU issue detection\n",
+    "AICPU issues are operators that run on the AI CPU at execution time and therefore do not use the AI Core's compute capability. The main tuning method is to modify the code so that AI CPU operators are avoided; see the following reference on avoiding AI CPU operators:\n",
+    "https://support.huaweicloud.com/bestpractice-modelarts/modelarts_10_2517.html\n",
+    "\n",
+    "The sample code below shows how to detect AICPU issues and fetch the detection results:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from prettytable import PrettyTable, ALL\n",
+    "from textwrap import fill\n",
+    "from profiler.advisor.interface.interface import Interface\n",
+    "\n",
+    "\n",
+    "# Point this to the collected profiling data; the profiling directory must come from a single collection run, at profiling level l0 or above\n",
+    "profiling_path = r\"YOUR PROFILING PATH\"\n",
+    "interface = Interface(profiling_path=profiling_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Please ensure only one trace_view.json in C:\\personalC\\profiling_data, there will analyze first timeline profiling data.\n",
+      " \r"
+     ]
+    }
+   ],
+   "source": [
+    "# Check whether any aicpu issue exists in the computation dimension\n",
+    "# If the profiling data was not collected with CANN 8.0.0, run 'cat CANN_INSTALL_DIR/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info' in the training/inference environment to check the version\n",
+    "aicpu_result = interface.get_result(\"computation\", \"aicpu_analysis\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       "
problemdescriptionsuggestionproblem counttotal_time(us)time ratioincome(us)income ratio
block dimsome operator does not make full use of 25 ai core or 50 ai vector core; Top-10
operator of task duration are as follows: Square, MatMulV2, BatchMatMul,
SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2
--model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi
sor\\operator_tuning_file_20240613153259.cfg'
101814.01999999999991.0
operator no boundThere is no mte, cube, vector, scalar ratio is more than 80.00%; Top task
duration operators need to be tuned are as follows: Square, MatMulV2,
BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast
1. Optimize operator by AOE, such as: 'aoe --job_type=2
--model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi
sor\\operator_tuning_file_20240613153259.cfg'
95814.01999999999990.7985
AICPU operatorSome operators and task duration exceed 20 us, such as : Cast1. Modify code to avoid aicpu operator39686568.8600000010.0189
" + ], + "text/plain": [ + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| problem | description | suggestion | problem count | total_time(us) | time ratio | income(us) | income ratio |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| block dim | some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 | 101 | 814.0199999999999 | 1.0 | | |\n", + "| | operator of task duration are as follows: Square, MatMulV2, BatchMatMul, | --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi | | | | | |\n", + "| | SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | sor\\operator_tuning_file_20240613153259.cfg' | | | | | |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| operator no bound | There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task | 1. Optimize operator by AOE, such as: 'aoe --job_type=2 | 95 | 814.0199999999999 | 0.7985 | | |\n", + "| | duration operators need to be tuned are as follows: Square, MatMulV2, | --model_path=$user_dump_path --tune_ops_file=c:\\personalC\\code\\att\\profiler\\advi | | | | | |\n", + "| | BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast | sor\\operator_tuning_file_20240613153259.cfg' | | | | | |\n", + "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+\n", + "| AICPU operator | Some operators and task duration exceed 20 us, such as : Cast | 1. 
Modify code to avoid aicpu operator                                             | 39            | 686568.860000001  | 0.0189     |            |              |\n",
+       "+-------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+---------------+-------------------+------------+------------+--------------+"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "problems = aicpu_result.get(\"problems\")\n",
+    "if problems: # if issues are detected, get the detection descriptions and suggestions\n",
+    "    problem_table = PrettyTable(problems.get(\"headers\"))\n",
+    "    for row in problems.get(\"data\"):\n",
+    "        row = [fill(str(element), width=80) for element in row]\n",
+    "        problem_table.add_row(row)\n",
+    "\n",
+    "    problem_table.align = \"l\"\n",
+    "    problem_table.hrules = ALL\n",
+    "    display(problem_table)\n",
+    "else:\n",
+    "    print(\"There is no suggestion related to AICPU operator.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       " \n",
+       "
op_nameop_typetask_durationinput_shapesinput_data_typesinput_formatsoutput_shapesoutput_data_typesoutput_formatsstack_info
trans_Cast_5Cast493.64""INT32FORMAT_ND""UINT64FORMAT_ND/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279):
dropout; /usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/dropout.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/language_model.py(236): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/language_model.py(425): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/module.py(184): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/distributed.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
../../pretrain_gpt.py(88): forward_step;
/profiling_auto_GPT3/megatron/schedules.py(118): forward_step;
/home/s30040711/Megatron-
LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(96):
forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419):
train_step; /profiling_auto_GPT3/megatron/training.py(837): train;
/profiling_auto_GPT3/megatron/training.py(152): pretrain;
../../pretrain_gpt.py(122): <module>
trans_Cast_5Cast413.4""INT32FORMAT_ND""UINT64FORMAT_ND/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279):
dropout; /usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/dropout.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/language_model.py(236): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/language_model.py(425): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/module.py(184): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
/profiling_auto_GPT3/megatron/model/distributed.py(58): forward;
/usr/local/python3.7.5/lib/python3.7/site-
packages/torch/nn/modules/module.py(1110): _call_impl;
../../pretrain_gpt.py(88): forward_step;
/profiling_auto_GPT3/megatron/schedules.py(118): forward_step;
/home/s30040711/Megatron-
LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(109):
forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419):
train_step; /profiling_auto_GPT3/megatron/training.py(837): train;
/profiling_auto_GPT3/megatron/training.py(152): pretrain;
../../pretrain_gpt.py(122): <module>
" + ], + "text/plain": [ + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------------------------------------+\n", + "| op_name | op_type | task_duration | input_shapes | input_data_types | input_formats | output_shapes | output_data_types | output_formats | stack_info |\n", + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------------------------------------+\n", + "| trans_Cast_5 | Cast | 493.64 | \"\" | INT32 | FORMAT_ND | \"\" | UINT64 | FORMAT_ND | /usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279): |\n", + "| | | | | | | | | | dropout; /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/dropout.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/language_model.py(236): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/language_model.py(425): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/module.py(184): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/distributed.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | ../../pretrain_gpt.py(88): forward_step; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; |\n", + "| | | | | | | | | | /home/s30040711/Megatron- |\n", + "| | | | | | | | | | LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(96): |\n", + "| | | | | | | | | | forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): |\n", + "| | | | | | | | | | train_step; /profiling_auto_GPT3/megatron/training.py(837): train; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/training.py(152): pretrain; |\n", + "| | | | | | | | | | ../../pretrain_gpt.py(122): |\n", + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------------------------------------+\n", + "| trans_Cast_5 | Cast | 413.4 | \"\" | INT32 | FORMAT_ND | \"\" | UINT64 | FORMAT_ND | /usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279): |\n", + "| | | | | | | | | | dropout; /usr/local/python3.7.5/lib/python3.7/site- 
|\n", + "| | | | | | | | | | packages/torch/nn/modules/dropout.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/language_model.py(236): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/language_model.py(425): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/module.py(184): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/model/distributed.py(58): forward; |\n", + "| | | | | | | | | | /usr/local/python3.7.5/lib/python3.7/site- |\n", + "| | | | | | | | | | packages/torch/nn/modules/module.py(1110): _call_impl; |\n", + "| | | | | | | | | | ../../pretrain_gpt.py(88): forward_step; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; |\n", + "| | | | | | | | | | /home/s30040711/Megatron- |\n", + "| | | | | | | | | | LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(109): |\n", + "| | | | | | | | | | forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): |\n", + "| | | | | | | | | | train_step; /profiling_auto_GPT3/megatron/training.py(837): train; |\n", + "| | | | | | | | | | /profiling_auto_GPT3/megatron/training.py(152): pretrain; |\n", + "| | | | | | | | | | ../../pretrain_gpt.py(122): |\n", + "+--------------+---------+---------------+--------------+------------------+---------------+---------------+-------------------+----------------+----------------------------------------------------------------------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if problems: # 如果存在相关问题则获取相关问题检测细节\n", + " aicpu = aicpu_result.get(\"AICPU operator\")\n", + " aicpu_table = PrettyTable(aicpu.get(\"headers\"))\n", + " for row in aicpu.get(\"data\"):\n", + " row = [fill(str(element), width=80) for element in row]\n", + " aicpu_table.add_row(row)\n", + " aicpu_table.hrules = ALL\n", + " display(aicpu_table[:2])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/advisor/compute_perf_analysis.ipynb b/profiler/advisor/compute_perf_analysis.ipynb deleted file mode 100644 index 27c9caf37bf43871f319a9418294953f54f9cafd..0000000000000000000000000000000000000000 --- a/profiler/advisor/compute_perf_analysis.ipynb +++ /dev/null @@ -1,109 +0,0 @@ -{ - 
"cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2024-02-21T09:19:13.937531900Z", - "start_time": "2024-02-21T09:19:13.267899500Z" - } - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "from advisor_backend.interface import Interface\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 算子调优分析\n", - "## 1. 算子分析的数据准备\n", - "当前算子分析工具支持分析Ascend Pyorch Profiler方式生成的ascend_pt目录\n", - "## 2. 算子分析解决的问题\n", - "当前支持分析模型中存在可融合的小算子,并给出优化建议。\n", - "\n", - "\"更多融合算子信息,请查阅 https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/700alpha003/processormodel/hardwaredesc_0001.html" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2024-02-22T08:41:17.455567500Z", - "start_time": "2024-02-22T08:41:16.716884800Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[INFO] Start to analyse the target file: C:\\data\\ascend_pt\\ASCEND_PROFILER_OUTPUT\\kernel_details.csv\n", - " pattern_name pattern len count duration sum(us) op durations(us) index\n", - "18 torch_npu.npu_swiglu (Slice, Slice, Swish, Mul) 4 1 12.56 [3.14, 3.14, 3.14, 3.14] [0]\n", - "\n", - "\n", - "The computing time of fusable op is 12.56 ms.\n", - "\n", - "\n", - "Advice 0:\n", - "Replace [Slice, Slice, Swish, Mul] with torch_npu.npu_swiglu. This pattern first happened in: \n", - "torch/nn/modules/module.py(1513): _call_impl\n", - "profiler_main.py(116):forward\n" - ] - } - ], - "source": [ - "# EDIT THE PROFILING DATA PATH\n", - "compute_path = \"[YOUR PATH]\"\n", - "interface = Interface(compute_path)\n", - "data = interface.get_data('compute', 'npu_fused')\n", - "pd.set_option('display.max_columns', None)\n", - "pd.set_option('display.width', 900)\n", - "print(data['data'].iloc[:, :-2])\n", - "print('\\n')\n", - "print(data['bottleneck'])\n", - "print('\\n')\n", - "print(data['advice'])" - ] - }, - { - "cell_type": "code", - "outputs": [], - "source": [ - "\n", - "\n" - ], - "metadata": { - "collapsed": false - } - } - ], - "metadata": { - "kernelspec": { - "name": "python3", - "language": "python", - "display_name": "Python 3 (ipykernel)" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.18" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/profiler/advisor/config/__init__.py b/profiler/advisor/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/config/config.ini b/profiler/advisor/config/config.ini new file mode 100644 index 0000000000000000000000000000000000000000..c56c1dad9f0d7e9ac02ab76b0e79e102b010da12 --- /dev/null +++ b/profiler/advisor/config/config.ini @@ -0,0 +1,16 @@ +[LOG] +# console_logging_level : DEBUG/INFO/WARNING/ERROR +console_logging_level = INFO +[ANALYSE] +# analysis_result_file : filename of analysis result +analysis_result_file = analysis_result_file.xlsx +# tune_ops_file: filename of tune op name list +tune_ops_file = operator_tuning_file.cfg +[THRESHOLD] +# operator_bound_ratio: (mte, cube, vector, scalar) ratio greater than this value will be checked in operator_bound_checker +operator_bound_ratio = 0.8 +[RULE-BUCKET] +# region : URL of different regions 
where the rule yaml file can be downloaded
+cn-north-9 = cnnorth9-modelarts-sdk
+cn-southwest-2 = cnsouthwest2-modelarts-sdk
+cn-north-7 = cnnorth7-modelarts-sdk
\ No newline at end of file
diff --git a/profiler/advisor/config/config.py b/profiler/advisor/config/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..12f4526f8c95a747f97272aed6cf8e4e822da676
--- /dev/null
+++ b/profiler/advisor/config/config.py
@@ -0,0 +1,108 @@
+"""
+advisor config
+"""
+import logging
+import os
+from configparser import ConfigParser
+
+from profiler.advisor.utils.utils import Timer, singleton
+
+logger = logging.getLogger()
+
+
+@singleton
+class Config:
+    """
+    config
+    """
+    # pylint: disable=too-many-instance-attributes
+
+    _CONFIG_DIR_NAME = "config"
+    _CONFIG_FILE_NAME = "config.ini"
+
+    def __init__(self) -> None:
+        config = ConfigParser(allow_no_value=True)
+        self._work_path = os.getcwd()  # pwd
+        self._root_path = os.path.abspath(os.path.join(__file__, "../../"))
+        config.read(os.path.join(self._root_path, self._CONFIG_DIR_NAME, self._CONFIG_FILE_NAME))
+        self.config = config
+        # ANALYSE
+        self._analysis_result_file = self._normalize_path(config.get("ANALYSE", "analysis_result_file"))
+        self._tune_ops_file = os.path.abspath(
+            os.path.join(self._work_path, f"operator_tuning_file_{Timer().strftime}.cfg"))
+        self.log_path = None
+
+    def _normalize_path(self, file) -> str:
+        if not file.startswith("/"):
+            file = os.path.join(self._work_path, file)
+        return os.path.abspath(file)
+
+    @property
+    def work_path(self) -> str:
+        """
+        get work path
+        :return: work path
+        """
+        return self._work_path
+
+    @property
+    def root_path(self) -> str:
+        """
+        get root path
+        :return: root path
+        """
+        return self._root_path
+
+    def set_config(self, key, value) -> None:
+        """
+        set config value
+        :param key: config key
+        :param value: config value
+        """
+        setattr(self, key, value)
+
+    def get_config(self, key) -> str:
+        """
+        get value of config
+        :param key: config key
+        :return: config value
+        """
+        try:
+            return getattr(self, key)
+        except AttributeError:
+            return ""
+
+    @property
+    def analysis_result_file(self) -> str:
+        """
+        get filename of op result file
+        :return: filename
+        """
+        return self._analysis_result_file
+
+    @property
+    def tune_ops_file(self) -> str:
+        """
+        get filename of tune op file
+        :return: filename
+        """
+        return self._tune_ops_file
+
+    @property
+    def operator_bound_ratio(self) -> float:
+        """
+        operator_bound_ratio
+        """
+        return float(self.config.get("THRESHOLD", "operator_bound_ratio"))
+
+    def set_log_path(self, result_file: str, log_path: str = None):
+        self.log_path = log_path if log_path is not None else os.path.join(self._work_path, "log")
+        os.makedirs(self.log_path, exist_ok=True)
+        self._analysis_result_file = os.path.join(self.log_path, result_file)
+
+    def remove_log(self):
+        if self.log_path and os.path.isdir(self.log_path) and not os.listdir(self.log_path):
+            os.rmdir(self.log_path)
diff --git a/profiler/advisor/config/profiling_data_version_config.yaml b/profiler/advisor/config/profiling_data_version_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f73aecd3baf18e06981ef4d4b0db7d6faadd419a
--- /dev/null
+++ b/profiler/advisor/config/profiling_data_version_config.yaml
@@ -0,0 +1,80 @@
+versions:
+  - version: 8.0.0
+    dirs_pattern:
+      ^PROF_\d{6}_\d{17}_\w+$:
+        mindstudio_profiler_output:
+          [ op_summary, msprof
]
+    class_attr:
+      op_summary: OpSummary
+      msprof: Msprof
+    file_attr:
+      op_summary: ^op_summary_\d{14}\.csv$
+      msprof: ^msprof_\d{14}\.json$
+
+  - version: 7.0.0
+    dirs_pattern:
+      ^PROF_\d{6}_\d{17}_\w+$:
+        ^device_\d+$:
+          summary:
+            [ op_summary ]
+          timeline:
+            [ msprof, task_time ]
+        host:
+          sqlite:
+            [ ge_info ]
+    class_attr:
+      op_summary: OpSummary
+      task_time: TaskTime
+      msprof: Msprof
+      ge_info: GeInfo
+    file_attr:
+      op_summary: ^op_summary_\d+_\d+_\d{14}\.csv$
+      task_time: ^task_time_\d+_\d+_\d{14}\.json$
+      msprof: ^msprof_\d+_\d+_\d{14}\.json$
+      ge_info: ge_info.db
+
+  - version: 7.0.RC1
+    dirs_pattern:
+      ^PROF_\d{6}_\d{17}_\w+$:
+        ^device_\d+$:
+          summary:
+            [ op_summary ]
+          timeline:
+            [ msprof, task_time ]
+        host:
+          sqlite:
+            [ ge_info ]
+    class_attr:
+      op_summary: OpSummary
+      task_time: TaskTime
+      msprof: Msprof
+      ge_info: GeInfo
+    file_attr:
+      op_summary: ^op_summary_\d+_\d+_\d+_\d{14}\.csv$
+      task_time: ^task_time_\d+_\d+_\d+_\d{14}\.json$
+      msprof: ^msprof_\d+_\d+_\d+_\d{14}\.json$
+      ge_info: ge_info.db
+
+  - version: 6.3.RC2
+    dirs_pattern:
+      ^PROF_\d{6}_\d{17}_\w+$:
+        ^device_\d+$:
+          summary:
+            [ op_summary ]
+          timeline:
+            [ msprof, task_time ]
+        host:
+          sqlite:
+            [ ge_info ]
+    class_attr:
+      op_summary: OpSummary
+      task_time: TaskTime
+      msprof: Msprof
+      ge_info: GeInfo
+    file_attr:
+      op_summary: ^op_summary_\d+_\d+\.csv$
+      task_time: ^task_time_\d+_\d+\.json$
+      msprof: ^msprof_\d+_\d+\.json$
+      ge_info: ge_info.db
+
+
diff --git a/profiler/advisor/dataset/__init__.py b/profiler/advisor/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor/dataset/cluster/__init__.py b/profiler/advisor/dataset/cluster/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/advisor/dataset/cluster/cluster_dataset.py b/profiler/advisor/dataset/cluster/cluster_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..09fda2d4dcf2df2f05abb0007befb5c5c36ef824
--- /dev/null
+++ b/profiler/advisor/dataset/cluster/cluster_dataset.py
@@ -0,0 +1,165 @@
+import logging
+import os
+from collections import defaultdict
+
+from profiler.advisor.dataset.dataset import Dataset
+from profiler.advisor.utils.utils import singleton
+from profiler.advisor.common import constant as const
+from profiler.cluster_analyse.common_func.file_manager import FileManager
+from profiler.cluster_analyse.common_func.constant import Constant
+from profiler.cluster_analyse.cluster_analysis import Interface
+from profiler.advisor.dataset.cluster.cluster_step_trace_time_bean import ClusterStepTraceTimeBean
+
+logger = logging.getLogger()
+
+
+class ClusterDataset(Dataset):
+
+    def __init__(self, collection_path, data: dict, **kwargs) -> None:
+        super().__init__(collection_path, data)
+
+    def is_cluster_analysis_output_exist(self):
+        """
+        check whether the cluster analysis output directory already exists
+        """
+        for file in os.listdir(self.collection_path):
+            if file == 'cluster_analysis_output':
+                print("[INFO] Cluster has already been analyzed: "
+                      "cluster analysis output directory exists.")
+                print("[INFO] Skip cluster analyze backend.")
+                return True
+        return False
+
+    def cluster_analyze(self):
+        if self.is_cluster_analysis_output_exist():
+            return
+        parameter = {
+            Constant.COLLECTION_PATH: self.collection_path,
+            Constant.ANALYSIS_MODE: "all"
+        }
+        print("[INFO] Cluster analysis is in progress, please wait...")
+        try:
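+            # Run the cluster_analyse backend over the collection path; its results are
+            # written to the 'cluster_analysis_output' directory checked above.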
+            Interface(parameter).run()
+        except Exception as e:
+            raise ValueError(f"Cluster analyze backend failed: {e}") from e
+
+    def load_csv_data(self, file_name, data_bean):
+        csv_path = os.path.join(self.collection_path, const.CLUSTER_ANALYSIS_OUTPUT, file_name)
+        if not os.path.exists(csv_path):
+            msg = f"[ERROR] {file_name} doesn't exist, terminate analysis."
+            raise RuntimeError(msg)
+        data = FileManager.read_csv_file(csv_path, data_bean)
+        return data
+
+    def load_json_data(self, file_name):
+        json_path = os.path.join(self.collection_path, const.CLUSTER_ANALYSIS_OUTPUT, file_name)
+        if not os.path.exists(json_path):
+            msg = f"[ERROR] {file_name} doesn't exist, terminate analysis."
+            raise RuntimeError(msg)
+        data = FileManager.read_json_file(json_path)
+        return data
+
+
+@singleton
+class ClusterStepTraceTimeDataSet(ClusterDataset):
+    RANK = "rank"
+
+    def __init__(self, collection_path: str, data: dict, **kwargs):
+        self._step_dict = defaultdict()
+        super().__init__(collection_path, data)
+
+    def _parse(self):
+        self.cluster_analyze()
+        try:
+            step_data = self.load_csv_data(const.CLUSTER_STEP_TIME_CSV, ClusterStepTraceTimeBean)
+        except RuntimeError as e:
+            print("[ERROR] Exception caught while loading cluster step trace time:", e)
+            self._step_dict = None
+            return False
+        self._step_dict = self.format_data(step_data)
+        return True
+
+    def format_data(self, step_data: list):
+        step_dict = defaultdict(lambda: [0, 0, 0])
+        for step_bean in step_data:
+            if step_bean.type == self.RANK:
+                step_dict[step_bean.index][0] += step_bean.compute
+                step_dict[step_bean.index][1] += step_bean.communication
+                step_dict[step_bean.index][2] += step_bean.free
+        return step_dict
+
+    def get_data(self):
+        return self._step_dict
+
+
+@singleton
+class ClusterCommunicationDataSet(ClusterDataset):
+    RDMA_TIME_MS = "RDMA time(ms)"
+    RDMA_SIZE_MB = "RDMA size(mb)"
+    SDMA_TIME_MS = "SDMA time(ms)"
+    SDMA_SIZE_MB = "SDMA size(mb)"
+    RDMA_BANDWIDTH = "RDMA bandwidth(GB/s)"
+    SDMA_BANDWIDTH = "SDMA bandwidth(GB/s)"
+    COMMUNICATION_BANDWIDTH_INFO = "Communication Bandwidth Info"
+    TRANSIT_TIME = "Transit Time(ms)"
+    TRANSIT_SIZE = "Transit Size(MB)"
+    SDMA = "SDMA"
+    RDMA = "RDMA"
+
+    def __init__(self, collection_path: str, data: dict, **kwargs):
+        self.rank_bw_dict = defaultdict(lambda: {
+            self.RDMA_TIME_MS: 0,
+            self.RDMA_SIZE_MB: 0,
+            self.SDMA_TIME_MS: 0,
+            self.SDMA_SIZE_MB: 0,
+        })
+        super().__init__(collection_path, data)
+
+    @staticmethod
+    def compute_ratio(dividend: float, divisor: float):
+        if abs(divisor) < 1e-15:
+            return 0
+        else:
+            return round(dividend / divisor, 4)
+
+    def _parse(self):
+        self.cluster_analyze()
+        try:
+            communication_json = self.load_json_data(const.CLUSTER_COMM_JSON)
+        except RuntimeError as e:
+            print("[ERROR] Exception caught while loading cluster communication data:", e)
+            self.rank_bw_dict = None
+            return False
+        self.process(communication_json)
+        return True
+
+    def process(self, communication_json: dict):
+        for comm_group, group_dict in communication_json.items():
+            for step, step_dict in group_dict.items():
+                for op, op_dict in step_dict.items():
+                    self.compute_bandwidth(op_dict)
+
+    def compute_bandwidth(self, op_dict: dict):
+        for rank_id, rank_dict in op_dict.items():
+            try:
+                rank = int(rank_id)
+            except ValueError as e:
+                msg = "[ERROR] cluster_communication.json has invalid structure."
+                raise ValueError(msg) from e
+            for comm_type, bw_dict in rank_dict.get(self.COMMUNICATION_BANDWIDTH_INFO, {}).items():
+                if comm_type == self.SDMA:
+                    self.rank_bw_dict[rank][self.SDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE, 0)
+                    self.rank_bw_dict[rank][self.SDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME, 0)
+                if comm_type == self.RDMA:
+                    self.rank_bw_dict[rank][self.RDMA_SIZE_MB] += bw_dict.get(self.TRANSIT_SIZE, 0)
+                    self.rank_bw_dict[rank][self.RDMA_TIME_MS] += bw_dict.get(self.TRANSIT_TIME, 0)
+
+        for rank, rank_dict in self.rank_bw_dict.items():
+            self.rank_bw_dict[rank][self.RDMA_BANDWIDTH] = self.compute_ratio(
+                self.rank_bw_dict[rank][self.RDMA_SIZE_MB], self.rank_bw_dict[rank][self.RDMA_TIME_MS])
+            self.rank_bw_dict[rank][self.SDMA_BANDWIDTH] = self.compute_ratio(
+                self.rank_bw_dict[rank][self.SDMA_SIZE_MB], self.rank_bw_dict[rank][self.SDMA_TIME_MS])
+
+    def get_data(self):
+        return self.rank_bw_dict
diff --git a/profiler/advisor/dataset/cluster/cluster_step_trace_time_bean.py b/profiler/advisor/dataset/cluster/cluster_step_trace_time_bean.py
new file mode 100644
index 0000000000000000000000000000000000000000..b108fc77a3f3408d48c79ce6b542f98427d88b0b
--- /dev/null
+++ b/profiler/advisor/dataset/cluster/cluster_step_trace_time_bean.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class ClusterStepTraceTimeBean:
+    STEP = "Step"
+    TYPE = "Type"
+    INDEX = "Index"
+    COMPUTING = "Computing"
+    COMMUNICATION = "Communication(Not Overlapped)"
+    FREE = "Free"
+
+    def __init__(self, data: dict):
+        self._data = data
+
+    @property
+    def step(self) -> str:
+        return self._data.get(self.STEP, '')
+
+    @property
+    def type(self) -> str:
+        return self._data.get(self.TYPE, '')
+
+    @property
+    def index(self) -> int:
+        try:
+            return int(self._data.get(self.INDEX))
+        except ValueError as e:
+            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Index'."
+            raise ValueError(msg) from e
+
+    @property
+    def compute(self) -> float:
+        try:
+            return float(self._data.get(self.COMPUTING, ''))
+        except ValueError as e:
+            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Computing'."
+            raise ValueError(msg) from e
+
+    @property
+    def communication(self) -> float:
+        try:
+            return float(self._data.get(self.COMMUNICATION, ''))
+        except ValueError as e:
+            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Communication'."
+            raise ValueError(msg) from e
+
+    @property
+    def free(self) -> float:
+        try:
+            return float(self._data.get(self.FREE, ''))
+        except ValueError as e:
+            msg = "[ERROR] Cluster step trace time.csv has invalid value in column 'Free'."
+ raise ValueError(msg) from e + diff --git a/profiler/advisor/dataset/dataset.py b/profiler/advisor/dataset/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..7f1e40a38b8a4a26585eecfe6271cc75ea054d2d --- /dev/null +++ b/profiler/advisor/dataset/dataset.py @@ -0,0 +1,38 @@ +""" +dataset module +""" +import logging +import os + +from profiler.advisor.config.config import Config + +logger = logging.getLogger() + + +class Dataset: + """ + :param collection_path: dataSet absolute path + dataset base class + """ + + def __init__(self, collection_path, data=None) -> None: + if data is None: + data = {} + self.collection_path = os.path.abspath(os.path.join(Config().work_path, collection_path)) + logger.debug("init %s with %s", self.__class__.__name__, self.collection_path) + if self._parse(): + key = self.get_key() + if key not in data: + data[key] = [] + data[key].append(self) + + def _parse(self): + return None + + @classmethod + def get_key(cls): + """ + get key of dataset + :return: key + """ + return cls.__name__.rsplit('.', maxsplit=1)[-1] diff --git a/profiler/advisor/dataset/graph_dataset.py b/profiler/advisor/dataset/graph_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..951de7fd26b1f986d25285547e63b1a420968249 --- /dev/null +++ b/profiler/advisor/dataset/graph_dataset.py @@ -0,0 +1,53 @@ +import logging +from typing import List + +from profiler.advisor.dataset.dataset import Dataset +from profiler.advisor.common.graph.graph_parser import HostGraphParser +from profiler.advisor.common.graph.graph import Graph +from profiler.advisor.utils.utils import load_parameter, lazy_property, get_file_path_from_directory + +logger = logging.getLogger() + + +class GraphDataset(Dataset): + """ + data directory dataset + """ + FILE_PATTERN = "ATT_ADVISOR_GRAPH_FILE" + + def __init__(self, collection_path, data: dict = None, **kwargs) -> None: + self.graph_files: List[HostGraphParser] = [] + super().__init__(collection_path, data) + + def _parse(self): + graph_list = get_file_path_from_directory(self.collection_path, + lambda file: file.endswith( + load_parameter(self.FILE_PATTERN, "_Build.txt"))) + + for graph_file_path in graph_list[-1:]: + logger.info("Prepare to parse %s as default graph.", graph_file_path) + graph_file = HostGraphParser(graph_file_path) + self.graph_files.append(graph_file) + return self.graph_files + + @lazy_property + def graphs(self) -> List[Graph]: + """ + get a list of graphs + return: List[Graph] + """ + graphs = [] + for parser in self.graph_files: + graph = Graph(nodes=parser.nodes, + edges=parser.edges, + name="Default") + graph.build() + graphs.append(graph) + graphs.sort(key=lambda g: g.name) + if len(self.graph_files) >= 1: + del self.graph_files[0] # remove previous useless data + return graphs + + def is_empty(self) -> bool: + """check empty graph dataset""" + return len(self.graph_files) == 0 diff --git a/profiler/advisor/dataset/profiling/__init__.py b/profiler/advisor/dataset/profiling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/dataset/profiling/builder_base.py b/profiler/advisor/dataset/profiling/builder_base.py new file mode 100644 index 0000000000000000000000000000000000000000..2bfe14f9462b701db2a4ede1d539a07659f48ae8 --- /dev/null +++ b/profiler/advisor/dataset/profiling/builder_base.py @@ -0,0 +1,39 @@ +""" +profiling base +""" +import logging +from typing import Dict, List + +from 
profiler.advisor.dataset.profiling.profiling_parser import ProfilingParser
+from profiler.advisor.utils.utils import join_prof_path
+
+logger = logging.getLogger()
+
+
+class ProfilingBuilderBase:
+    """
+    profiling base
+    """
+    DATA_LIST: List[Dict] = []
+
+    def __init__(self, path) -> None:
+        self._path = path
+
+    def parse_data(self) -> bool:
+        """
+        parse data for file in data_dir
+        """
+        if isinstance(self, ProfilingParser):
+            return True
+        ret = False
+        for data in self.DATA_LIST:
+            class_name = data.get("class_name")
+            if class_name is not None:
+                if data.get("subdir_name"):
+                    data_class = class_name(join_prof_path(self._path, data.get("subdir_name")))
+                else:
+                    data_class = class_name(self._path)
+                if data_class.parse_data():
+                    setattr(self, str(data.get("attr_name")), data_class)
+                    ret = True
+        return ret
diff --git a/profiler/advisor/dataset/profiling/db_manager.py b/profiler/advisor/dataset/profiling/db_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9fb73c7cf69d94c3ca1aba8c726f574d63cd1a3
--- /dev/null
+++ b/profiler/advisor/dataset/profiling/db_manager.py
@@ -0,0 +1,70 @@
+"""
+connection manager
+"""
+import os
+import re
+from typing import List
+
+from sqlalchemy import MetaData, create_engine
+
+
+class ConnectionManager:
+    """
+    Connection Manager
+    """
+
+    def __init__(self, path, db_name):
+        self.db_path = os.path.join(path, db_name)
+        self.connection = create_engine(f'sqlite:///{self.db_path}')
+        self.metadata = MetaData()
+        self.metadata.reflect(bind=self.connection)
+
+    def __call__(self, *args, **kwargs):
+        return self.connection
+
+    @staticmethod
+    def check_db_exists(db_path: str, dbs: List) -> bool:
+        """
+        check db exists
+        """
+        if not os.path.isdir(db_path):
+            return False
+        for prof_db in dbs:
+            if not os.access(db_path, os.R_OK) or prof_db not in os.listdir(db_path):
+                return False
+        return True
+
+    def check_table_exists(self, tables: List) -> bool:
+        """
+        check table exists
+        """
+        for table in tables:
+            if table not in self.metadata.tables:
+                return False
+        return True
+
+    def check_column_exists(self, table_name: str, columns: List) -> bool:
+        """
+        check column exists
+        """
+        if table_name not in self.metadata.tables:
+            return False
+        for column in columns:
+            if column not in self.metadata.tables[table_name].columns:
+                return False
+        return True
+
+    @classmethod
+    def get_connection(cls, path, dbs, tables=None, is_host=False):
+        """
+        get connection
+        """
+        if is_host:
+            pattern = r"/device_[0-9]"
+            path = re.sub(pattern, "/host", path)
+        if not cls.check_db_exists(path, dbs):
+            return None
+        # connect to the first database file; check_db_exists verified they all exist
+        conn = cls(path, dbs[0])
+        if tables and not conn.check_table_exists(tables):
+            return None
+        return conn
diff --git a/profiler/advisor/dataset/profiling/device_info.py b/profiler/advisor/dataset/profiling/device_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..b58930777f969d023eab7885a9095d46aa7ba6ea
--- /dev/null
+++ b/profiler/advisor/dataset/profiling/device_info.py
@@ -0,0 +1,61 @@
+"""
+profiling info
+"""
+import json
+import logging
+
+from profiler.advisor.config.config import Config
+from profiler.advisor.utils.utils import get_file_path_from_directory
+
+logger = logging.getLogger()
+
+
+class DeviceInfoParser:
+    """
+    profiling info
+    device_id: device name info
+    "aiv_num": number of AI Vector cores
+    "ai_core_num": number of AI Cores
+    """
+    DATA_LIST = []
+
+    def __init__(self, path) -> None:
+        self._path = path
+
+    def parse_data(self) -> bool:
+        """
+        parse profiling data
+        :return: true on success, false otherwise
+        """
+        file_list = get_file_path_from_directory(self._path, lambda x: x.startswith("info.json."))
+        if not file_list:
+            return False
+        for info in file_list:
+            if self._parse(info):
+                return True
+        return False
+
+    @staticmethod
+    def _parse(info_file: str) -> bool:
+        if info_file.endswith("done"):
+            return False  # skip info.json.0.done
+        try:
+            with open(info_file, encoding="utf-8") as file:
+                info = json.load(file)
+        except (IOError, ValueError) as error:
+            logger.error("Parse json info file %s failed : %s", info_file, error)
+            return False
+        if "DeviceInfo" not in info:
+            logger.error("No device info in json info file %s", info_file)
+            return False
+        config = Config()
+        for device_info in info["DeviceInfo"]:
+            if "id" in device_info:
+                config.set_config("device_id", device_info["id"])
+            if "aiv_num" in device_info:
+                config.set_config("aiv_num", device_info["aiv_num"])
+            if "ai_core_num" in device_info:
+                config.set_config("ai_core_num", device_info["ai_core_num"])
+                return True
+        logger.error("No ai_core_num in json info file %s", info_file)
+        return False
diff --git a/profiler/advisor/dataset/profiling/info_collection.py b/profiler/advisor/dataset/profiling/info_collection.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1f84313bb7980ea2186d2727db51b5fba49e12e
--- /dev/null
+++ b/profiler/advisor/dataset/profiling/info_collection.py
@@ -0,0 +1,270 @@
+"""
+profiling info
+"""
+import decimal
+import logging
+
+from profiler.advisor.utils.utils import lazy_property
+
+logger = logging.getLogger()
+
+
+class Info:
+    """
+    op info
+    """
+    _attr_pre_fix_list = [""]
+
+    def add_attr(self, key: str, value: str):
+        """
+        add attr to op info
+        :param key: op info key
+        :param value: op info value
+        :return: None
+        """
+        if not key or hasattr(self, key):
+            return
+        setattr(self, key, value)
+
+    def has_attr(self, key: str, strict_mode=False):
+        """
+        check if op info has attr key
+        :param key: attr key
+        :return: true or false
+        """
+        if strict_mode:
+            return hasattr(self, key)
+        for prefix in self._attr_pre_fix_list:
+            attr = prefix + key
+            if hasattr(self, attr):
+                return True
+        return False
+
+    def get_attr(self, key, strict_mode=False):
+        """
+        get attr value by key
+        :param key: attr key
+        :return: attr value
+        """
+        if strict_mode:
+            if hasattr(self, key):
+                return getattr(self, key)
+        else:
+            for prefix in self._attr_pre_fix_list:
+                attr = prefix + key
+                if key.startswith("mac") and prefix == "aiv_":
+                    # e.g mac_ratio must match aic_mac_ratio, not aiv_mac_ratio
+                    continue
+                if key.startswith("vec") and prefix == "aic_":
+                    # e.g vec_ratio must match aiv_vec_ratio, not aic_vec_ratio
+                    continue
+                if hasattr(self, attr):
+                    return getattr(self, attr)
+        return ""
+
+    def get_float_attr(self, attr, strict_mode=False):
+        """
+        get attr value by key
+        :param attr: attr key
+        :return: attr value as float, or 0 on failure
+        """
+        try:
+            return float((self.get_attr(attr, strict_mode)))
+        except (ValueError, FloatingPointError):
+            pass
+        return 0
+
+    def get_decimal_attr(self, attr, strict_mode=False):
+        """
+        get attr value by key
+        :param attr: attr key
+        :return: attr value as Decimal, or Decimal(0) on failure
+        """
+        try:
+            return decimal.Decimal((self.get_attr(attr, strict_mode)))
+        except (ValueError, decimal.InvalidOperation):
+            pass
+        return decimal.Decimal(0)
+
+    def get_attrs(self) -> dict:
+        """
+        get attr list
+        :return: attr list
+        """
+        return self.__dict__
+
+
+class OpInfo(Info):
+    """
+    summary info
+    """
+
+    _attr_pre_fix_list = ["", "aic_", "aiv_"]
+    _mac_ratio_attrs = ["mac_ratio", "mac_fp16_ratio", "mac_int8_ratio", "aic_mac_ratio"]
+    _aicore_time_key = ["aicore_time", "aiv_time"]
+    _total_cycles_key = ["total_cycles", "aic_total_cycles", "aiv_total_cycles"]
+
+    def __lt__(self, other):
+        return self.get_float_attr("task_start_time") < other.get_float_attr("task_start_time")
+
+    @lazy_property
+    def is_cube_op(self) -> bool:
+        """
+        check whether the operator is a cube op
+        """
+        for attr in self._mac_ratio_attrs:
+            if hasattr(self, attr):
+                try:
+                    if float(getattr(self, attr)) > 0:
+                        if hasattr(self, "ffts_type") and getattr(self, "ffts_type") == "1":
+                            logger.warning(
+                                "ffts type of op %s is vector but mac ratio is not 0", getattr(self, "op_name")
+                            )
+                        return True
+                except ValueError:
+                    pass
+        # not cube op
+        if hasattr(self, "ffts_type") and getattr(self, "ffts_type") == "0":
+            logger.warning("ffts type of op %s is cube but mac ratio is 0", getattr(self, "op_name"))
+        return False
+
+    @lazy_property
+    def has_mac_ratio(self) -> bool:
+        """
+        check if op_info has mac ratio
+        """
+        for attr in self._mac_ratio_attrs:
+            if attr in self.__dict__:
+                return True
+        return False
+
+    def attr_sum(self, attr_list):
+        """sum of a list attrs"""
+        total = 0
+        for attr in attr_list:
+            total += self.get_float_attr(attr, strict_mode=True)
+        return total
+
+    def get_aicore_time(self):
+        """
+        get sum of aicore time and ai vector core time
+        """
+        return self.attr_sum(self._aicore_time_key)
+
+    def get_total_cycles(self):
+        """
+        get sum of total cycle for aicore and ai vector core
+        """
+        return self.attr_sum(self._total_cycles_key)
+
+
+class TaskInfo:
+    """
+    task info
+    """
+    EVENT_TYPE = {"metadata": ['M'], "duration": ['B', 'E'], "complete": ['X'], 'flow': ['s', 't', 'f']}
+
+    def __init__(self, content: dict) -> None:
+        self._name = content.get("name", "")
+        self._pid = content.get("pid", 0)
+        self._tid = content.get("tid", 0)
+        self._start_time = float(content.get("ts", 0.0))
+        self._dur = float(content.get("dur", 0.0))
+        self._args = content.get("args", {})
+        self._cat = content.get("cat", "")
+        self._id = content.get("id", "")
+
+    @property
+    def pk_id(self):
+        """
+        get id
+        :return: id
+        """
+        return self._id
+
+    @property
+    def pid(self):
+        """
+        get pid
+        :return: pid
+        """
+        return self._pid
+
+    @property
+    def tid(self):
+        """
+        get tid
+        :return: tid
+        """
+        return self._tid
+
+    @property
+    def task_type(self):
+        """
+        get task type
+        :return: task type
+        """
+        return self._args.get("Task Type", "NA")
+
+    @property
+    def start_time(self):
+        """
+        get start time
+        :return: start time
+        """
+        return self._start_time
+
+    @property
+    def end_time(self):
+        """
+        get end time
+        :return: end time
+        """
+        return self._start_time + self._dur
+
+    @property
+    def dur(self):
+        """
+        get duration
+        :return: duration
+        """
+        return self._dur
+
+    @property
+    def name(self):
+        """
+        get task name
+        :return: task name
+        """
+        return self._name
+
+    @property
+    def stream_id(self):
+        """
+        get stream_id
+        :return: stream id
+        """
+        return self._args.get("Stream Id", "NA")
+
+    @property
+    def task_id(self):
+        """
+        get task id
+        :return: task_id
+        """
+        return self._args.get("Task Id", "NA")
+
+    @property
+    def args(self):
+        """
+        get args of task
+        :return: args
+        """
+        return self._args
+
+    @property
+    def cat(self):
+        """
+        get category of task
+        """
+        return self._cat
diff --git a/profiler/advisor/dataset/profiling/profiling_dataset.py b/profiler/advisor/dataset/profiling/profiling_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..46d4a4fe8b12a419f6d0d7472f9776369e122f03
--- /dev/null
+++ b/profiler/advisor/dataset/profiling/profiling_dataset.py
@@ -0,0 +1,79 @@
+import logging
+import os
+
+import yaml
+
+from profiler.advisor.common import constant
+from profiler.advisor.common.profiling.ge_info import GeInfo
+from profiler.advisor.common.profiling.msprof import Msprof
+from profiler.advisor.common.profiling.op_summary import OpSummary
+from profiler.advisor.common.profiling.tasktime import TaskTime
+from profiler.advisor.dataset.dataset import Dataset
+from profiler.advisor.dataset.profiling.device_info import DeviceInfoParser
+from profiler.advisor.utils.utils import join_prof_path
+
+
+logger = logging.getLogger()
+
+
+class ProfilingDataset(Dataset):
+    PROF_TYPE = ""
+
+    def __init__(self, collection_path, data: dict, **kwargs) -> None:
+        self.cann_version = kwargs.get("cann_version", constant.DEFAULT_CANN_VERSION)
+        self.PROF_TYPE = kwargs.get("profiling_type", constant.DEFAULT_PROFILING_TYPE)
+        self.patterns = self.parse_pattern()
+        self.current_version_pattern = self.get_current_version_pattern()
+        super().__init__(collection_path, data)
+
+    def _parse(self):
+        info = DeviceInfoParser(self.collection_path)
+        if info.parse_data():
+            self._info = info
+        ret = False
+        if self.current_version_pattern is not None:
+            self.build_from_pattern(self.current_version_pattern["dirs_pattern"], self.collection_path)
+            ret = True
+
+        return ret
+
+    def build_from_pattern(self, dirs_pattern, current_path):
+        if isinstance(dirs_pattern, dict):
+            for key, value in dirs_pattern.items():
+                self.build_from_pattern(value, join_prof_path(current_path, key))
+        elif isinstance(dirs_pattern, list):
+            for item in dirs_pattern:
+                data_class = globals()[self.current_version_pattern.get('class_attr').get(item)]
+                data_class.FILE_PATTERN = self.current_version_pattern.get('file_attr').get(item)
+                data_object = data_class(current_path)
+                is_success = data_object.parse_data()
+                if is_success:
+                    setattr(self, item, data_object)
+                else:
+                    logger.warning("Skip parse %s from local path %s",
+                                   self.current_version_pattern.get('class_attr').get(item), current_path)
+        else:
+            logger.warning("Unsupported arguments: %s to build %s", dirs_pattern, self.__class__.__name__)
+
+    def get_current_version_pattern(self):
+        for version_config_dict in self.patterns['versions']:
+            if version_config_dict['version'] == self.cann_version:
+                return version_config_dict
+        # return None so that _parse() can tell an unsupported cann version apart
+        return None
+
+    def parse_pattern(self, config_path="config/profiling_data_version_config.yaml"):
+        if not os.path.isabs(config_path):
+            config_path = os.path.join(os.path.dirname(__file__),
+                                       "../", "../", config_path)
+
+        if not os.path.exists(config_path):
+            logger.warning("Skip parse profiling dataset, because %s does not exist.", config_path)
+            return {"versions": []}
+
+        with open(config_path, 'r') as f:
+            patterns = yaml.safe_load(f)
+
+        return patterns
diff --git a/profiler/advisor/dataset/profiling/profiling_parser.py b/profiler/advisor/dataset/profiling/profiling_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb4caeb29e5c94cbc4373b1d6b10e32f3e10e02e
--- /dev/null
+++ b/profiler/advisor/dataset/profiling/profiling_parser.py
@@ -0,0 +1,132 @@
+import csv
+import json
+import os
+import re
+from typing import List, Dict
+
+from profiler.advisor.dataset.profiling.info_collection import logger
+from profiler.advisor.utils.utils import get_file_path_from_directory, SafeOpen, format_excel_title
+
+
+class ProfilingParser:
+    """
+    profiling
+    """
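+    # Concrete parsers (OpSummary, Msprof, TaskTime, GeInfo) override these class
+    # attributes; ProfilingDataset.build_from_pattern fills FILE_PATTERN from the
+    # 'file_attr' section of profiling_data_version_config.yaml.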
+    FILE_PATTERN = ""
+    FILE_PATTERN_MSG = ""
+    FILE_INFO = ""
+    FILE_PATH = ""
+
+    def __init__(self, path: str) -> None:
+        self._path = path
+        self._raw_data: List[List[str]] = []
+        self._filename = ""
+
+    @staticmethod
+    def file_match_func(pattern):
+        """file match function"""
+        return lambda x: re.search(re.compile(pattern), x)
+
+    def parse_data(self) -> bool:
+        """
+        parse the profiling file
+        :return: true or false
+        """
+        if self._parse_from_file():
+            return True
+        return False
+
+    def _parse_from_file(self):
+        file_list = get_file_path_from_directory(self._path, self.file_match_func(self.FILE_PATTERN))
+        if not file_list:
+            return False
+        # use the last (most recent) matching file
+        file = file_list[-1]
+        self.FILE_PATH = file
+        if len(file_list) > 1:
+            logger.warning("Multiple copies of %s were found, use %s", self.FILE_INFO, file)
+        return self.parse_from_file(file)
+
+    @staticmethod
+    def get_float(data) -> float:
+        """
+        get float or 0.0
+        """
+        try:
+            return float(data)
+        except (FloatingPointError, ValueError):
+            return 0.0
+
+    def parse_from_file(self, file):
+        """
+        parse from file
+        """
+        return False
+
+    @staticmethod
+    def _check_csv_file_format(csv_file_name: str, csv_content: List[List[str]]):
+        if not csv_content:
+            logger.error("%s is empty", csv_file_name)
+            return False
+        return True
+
+    def _parse_csv(self, file, check_csv=True) -> bool:
+        logger.debug("Parse file %s", file)
+        self._filename = os.path.splitext(os.path.basename(file))[0]
+        with SafeOpen(file, encoding="utf-8") as csv_file:
+            try:
+                csv_content = csv.reader(csv_file)
+                for row in csv_content:
+                    self._raw_data.append(row)
+                if check_csv and not self._check_csv_file_format(file, self._raw_data):
+                    logger.error("Invalid csv file : %s", file)
+                    return False
+            except OSError as error:
+                logger.error("Read csv file failed : %s", error)
+                return False
+
+        if not csv_file:
+            return False
+        if not self._raw_data:
+            logger.warning("File %s has no content", file)
+            return False
+        return True
+
+    def _parse_json(self, file) -> bool:
+        logger.debug("Parse file %s", file)
+        self._filename = os.path.splitext(os.path.basename(file))[0]
+        try:
+            with open(file, encoding="utf-8") as json_file:
+                self._raw_data = json.load(json_file)
+        except (OSError, ValueError) as error:
+            logger.error("Parse json file %s failed : %s", file, error)
+            return False
+        return True
+
+    def get_raw_data(self):
+        """
+        get raw file name and data
+        """
+        return self._filename, self._raw_data
+
+    @staticmethod
+    def _get_csv_title(data: List, number=0, title_index=0):
+        """
+        number = 0 replace (us) (ns)..
+ other replace " " to "_" + title_index: position of title default 0 + """ + title_dict: Dict[int, str] = {} + for idx, title in enumerate(data[title_index]): + if number == 0: + title_dict[idx] = format_excel_title(title) + else: + title_dict[idx] = title.replace(" ", "_") + return title_dict + + @property + def path(self): + """ + path + """ + return self._path diff --git a/profiler/advisor/dataset/timeline_event_dataset.py b/profiler/advisor/dataset/timeline_event_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..94b6fdfef78c044e37e24772699ed7ea67b0da30 --- /dev/null +++ b/profiler/advisor/dataset/timeline_event_dataset.py @@ -0,0 +1,220 @@ +import logging +from typing import List + +import ijson +from profiler.advisor.dataset.dataset import Dataset +from tqdm import tqdm + +from profiler.advisor.common import constant as const +from profiler.advisor.common.timeline.event import TimelineEvent +from profiler.advisor.utils.utils import get_file_path_from_directory +from profiler.advisor.utils.utils import singleton + +logger = logging.getLogger() + + +class OpCompileCollector: + def __init__(self): + self._total_op_compile_counter = 0 + self._total_op_compile_time = 0.0 + + @property + def total_time(self): + return self._total_op_compile_time + + @property + def total_count(self): + return self._total_op_compile_counter + + def is_empty(self): + return self._total_op_compile_counter == 0 + + def update(self, event: TimelineEvent): + self._total_op_compile_time += float(event.dur) + self._total_op_compile_counter += 1 + + def unset(self): + self._total_op_compile_counter = 0 + self._total_op_compile_time = 0.0 + + +@singleton +class TimelineEventDataset(Dataset): + + def __init__(self, collection_path, data: dict, **kwargs) -> None: + self._ops_with_task_type = {} + self._ops_with_stack = {} + self._ops_compile = OpCompileCollector() + self._torch_to_npu = {} + self._acl_to_npu = set() + self._aten: List[str] = [] + self._optimizer: List[str] = [] + self.timeline_dir = collection_path + self.timeline_data_list = get_file_path_from_directory(collection_path, lambda file: file.endswith("trace_view.json")) + self.dataset_len = None + self.analysis_mode = kwargs.get("analysis_mode") + self.task_type = kwargs.get("task_type") + self.cann_version = kwargs.get("cann_version") + self.torch_version = kwargs.get("torch_version") + + if self.analysis_mode in ["fusion_ops", "all"]: + logger.info("Load fusion operators database for cann version '%s' and torch version '%s'", + self.cann_version, self.torch_version) + + super().__init__(collection_path, data) + + if self.analysis_mode in ["op_stack", "all"]: + self._task_op_names = list(set([event_key.split("-")[0] for event_key in self._ops_with_task_type.keys()])) + + self._post_process() + + + @property + def ops_with_stack(self): + return self._ops_with_stack + + @property + def ops_compile(self): + return self._ops_compile + + @property + def torch_to_npu(self): + return self._torch_to_npu + + @property + def acl_to_npu(self): + return self._acl_to_npu + + @property + def ops_with_task_type(self): + return self._ops_with_task_type + + @property + def task_op_names(self): + return self._task_op_names + + @property + def optimizer(self): + return self._optimizer + + @property + def aten(self): + return self._aten + + def _parse(self): + + if len(self.timeline_data_list) == 0: + logger.warning("Please ensure trace_view.json in %s, skip timeline analysis.", self.timeline_dir) + return False + + if 
len(self.timeline_data_list) > 1:
+            logger.warning("Multiple trace_view.json found in %s, only the first one will be analyzed.",
+                           self.timeline_dir)
+            self.timeline_data_list = [self.timeline_data_list[0]]
+
+        result = self.parse_data_with_generator(self._add_event)
+
+        if not self.dataset_len:
+            self.dataset_len = len(result)
+
+        return True
+
+    def parse_data_with_generator(self, func):
+        result = []
+        try:
+            with open(self.timeline_data_list[0], "r") as f:
+                for i, event in tqdm(enumerate(ijson.items(f, "item")),
+                                     leave=False, ncols=100, desc="Building dataset for timeline analysis",
+                                     total=self.dataset_len):
+                    func_res = func(index=i, event=event)
+                    if func_res is not None:
+                        result.append(func_res)
+        except Exception as e:
+            logger.warning("Error %s while parsing file %s, continuing with timeline analysis", e,
+                           self.timeline_data_list[0])
+        return result
+
+    def _add_ops_with_task_type(self, event):
+        key = f"{event.name}-{event.ts}"
+        self._ops_with_task_type[key] = TimelineEvent(
+            {
+                const.TASK_TYPE: event.args.get(const.TASK_TYPE),
+                "task_id": event.args.get("Task Id"),
+                "tid": event.tid,
+                "name": event.name,
+                "ts": str(event.ts)
+            }
+        )
+
+    def _add_ops_with_stack(self, event):
+        self._ops_with_stack[str(event.ts)] = TimelineEvent({"name": event.name, "dataset_index": event.dataset_index})
+
+    def _add_torch_to_npu(self, event):
+        key = f"{event.ph}-{event.id}"
+        self._torch_to_npu[key] = TimelineEvent({"tid": event.tid, "ts": str(event.ts)})
+
+    def _add_acl_to_npu(self, event):
+        # op with task type equals to ai_cpu which derived from acl_to_npu do not have stacks
+        self._acl_to_npu.add(str(event.ts))
+
+    def _add_op_compile(self, event: TimelineEvent):
+        if event.name == const.OP_COMPILE_NAME or event.args.get("id") == const.OP_COMPILE_ID:
+            self._ops_compile.update(event)
+
+    def _add_optimizer(self, event: TimelineEvent):
+        self._optimizer.append(TimelineEvent({"name": event.name, "dataset_index": event.dataset_index}))
+
+    def _add_aten(self, event: TimelineEvent):
+        self._aten.append(TimelineEvent({
+            "name": event.name, "dataset_index": event.dataset_index, "ts": event.ts, "dur": event.dur
+        }))
+
+    def _add_event(self, index, event):
+        event["dataset_index"] = index
+        if not isinstance(event, TimelineEvent):
+            event = TimelineEvent(event)
+
+        self._add_op_compile(event)
+        if self.analysis_mode == "fusion_ops":
+            self._add_event_for_fusion_ops(event)
+        elif self.analysis_mode == "op_stack":
+            self._add_event_for_op_stack(event)
+        else:
+            self._add_event_for_fusion_ops(event)
+            self._add_event_for_op_stack(event)
+        return True
+
+    def _add_event_for_fusion_ops(self, event):
+        if event.name.lower().startswith(f"{const.ATEN}{const.ATEN_SEP}") or event.name.lower().startswith(
+                f"{const.NPU}{const.ATEN_SEP}"):
+            self._add_aten(event)
+            return
+
+        if event.name.startswith(f"{const.OPTIMIZER}.{const.OPTIMIZER_STEP}{const.OPTIMIZER_SEP}"):
+            self._add_optimizer(event)
+            return
+
+    def _add_event_for_op_stack(self, event):
+        if event.name.lower() == const.TORCH_TO_NPU:
+            self._add_torch_to_npu(event)
+            return
+
+        if event.args.get(const.CALL_STACKS):
+            self._add_ops_with_stack(event)
+            return
+
+        if event.args.get(const.TASK_TYPE) and event.args.get(const.TASK_TYPE) in [const.AI_CORE, const.AI_CPU]:
+            self._add_ops_with_task_type(event)
+            return
+
+        if event.name and event.ts and event.name == const.ACL_TO_NPU:
+            self._add_acl_to_npu(event)
+            return
+
+    def _post_process(self):
+        # eliminate sub aten operator of the first level aten operator by 'ts' and
'dur', + # keep the first level aten operator contiguous + formated_atens = [] + for aten_event in sorted(self._aten, key=lambda x: x.get("ts", -1)): + if not formated_atens or not formated_atens[-1].ts_include(aten_event): + formated_atens.append(aten_event) + self._aten = formated_atens diff --git a/profiler/advisor/display/__init__.py b/profiler/advisor/display/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/display/html/__init__.py b/profiler/advisor/display/html/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/display/html/render.py b/profiler/advisor/display/html/render.py new file mode 100644 index 0000000000000000000000000000000000000000..8ea7c9e0fc22c7da71a673e399fcfc231fbf1453 --- /dev/null +++ b/profiler/advisor/display/html/render.py @@ -0,0 +1,45 @@ +import os +import logging +from typing import List, Dict + +from jinja2 import Environment, FileSystemLoader +from profiler.advisor.common import constant + +from profiler.advisor.config.config import Config +from profiler.advisor.utils.utils import singleton, safe_write + +logger = logging.getLogger() + + +@singleton +class HTMLRender: + def __init__(self): + self.html = "" + self.render_list: Dict[str, List] = {} + + def render_html(self, template_dir: str = "templates", template_name: str = "main.html", + template_header=constant.DEFAULT_TEMPLATE_HEADER): + self.html = self.render_template("main", template_dir, template_name, render_list=self.render_list, + template_header=template_header) + + def render_template(self, key: str, template_dir: str, template_name: str, **kwargs): + if not os.path.isabs(template_dir): + template_dir = os.path.join(os.path.dirname(__file__), template_dir) + + env = Environment(loader=FileSystemLoader(template_dir), + autoescape=True) + template = env.get_template(template_name) + rendered_html = template.render(**kwargs) + if key not in self.render_list: + self.render_list[key] = [] + self.render_list[key].append(rendered_html) + return rendered_html + + def save_to_file(self, save_path: str): + if not save_path.endswith(".html"): + logger.error("Skip save html file because file name must endswith `.html`, " + "but got %s.", os.path.basename(save_path)) + return + + safe_write(self.html, save_path) + logger.info("Save suggestion to %s.", os.path.join(Config().work_path, save_path)) diff --git a/profiler/advisor/display/html/templates/__init__.py b/profiler/advisor/display/html/templates/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/display/html/templates/affinity_api.html b/profiler/advisor/display/html/templates/affinity_api.html new file mode 100644 index 0000000000000000000000000000000000000000..4d12c3e37536392d122f85fc6ef3a4fcc123ef77 --- /dev/null +++ b/profiler/advisor/display/html/templates/affinity_api.html @@ -0,0 +1,50 @@ +{% if result|length > 0 %} +
+

Affinity API Issues

+
+ The analysis results of following affinity APIs are based on runtime env + cann-{{ cann_version }} + and + torch-{{ torch_version }} + +
+ + {% if empty_stacks %} + Suggestion: + These APIs have no code stack. If parameter 'with_stack=False' was set while profiling, please refer to + Ascend PyTorch Profiler to set + 'with_stack=True'. Otherwise, ignore following affinity APIs due to backward broadcast lack of stack. + {% endif %} + + {% for api_name, stacks in result.items() %} + + {% if empty_stacks %} +
{{api_name|safe}}
+ + {% else %} + +
{{api_name|safe}}
+
+ +
+ {% for stack in stacks %} +
No.{{loop.index|safe}} code stack, called {{stack[1]|safe}} times
+ + {% endfor %} +
+
+ {% endif %} + + {% endfor %} + +
+ +
+
+{% endif %} diff --git a/profiler/advisor/display/html/templates/cluster_analysis.html b/profiler/advisor/display/html/templates/cluster_analysis.html new file mode 100644 index 0000000000000000000000000000000000000000..32379d56fcb87a78269612107d1b7634b722d8d8 --- /dev/null +++ b/profiler/advisor/display/html/templates/cluster_analysis.html @@ -0,0 +1,49 @@ +
+

{{title|safe}}

+
+
+ + {% if result.get("Description") %} +
Description
+ + {% endif %} + + {% if result.get("Suggestion") %} +
Suggestion
+ + {% endif %} + + {% if result.get("details") %} +
details
+
+ {% for item in result.get("details") %} + + + {% for header in item.get("headers") %} + + {% endfor %} + + {% for row in item.get("data") %} + + {% for element in row %} + {% if element is number %} + + {% else %} + + {% endif %} + {% endfor %} + + {% endfor %} +
{{ header }}
{{ element|round(2) }}{{ element }}
+ {% endfor %} +
+ {% endif %} + +
+ +
+
\ No newline at end of file diff --git a/profiler/advisor/display/html/templates/compute_analysis.html b/profiler/advisor/display/html/templates/compute_analysis.html new file mode 100644 index 0000000000000000000000000000000000000000..e1907c091b705969004bf709db24211c66c38107 --- /dev/null +++ b/profiler/advisor/display/html/templates/compute_analysis.html @@ -0,0 +1,29 @@ +
+

Abnormal Performance Operator

+
+ {{table.get("title")}} + + + + {% for header in table.get("headers") %} + + {% endfor %} + + {% for row in table.get("rows") %} + + {% for element in row %} + {% if element is number %} + + {% else %} + + {% endif %} + {% endfor %} + + {% endfor %} +
{{ header }}
{{ element|round(2) }}{{ element }}
+ {% if call_stack %} + call stack:
+ {{call_stack}} + {% endif %} +
+
\ No newline at end of file diff --git a/profiler/advisor/display/html/templates/fusion.html b/profiler/advisor/display/html/templates/fusion.html new file mode 100644 index 0000000000000000000000000000000000000000..605a9d748f7d4499a603efb87bc310fab9bc02f3 --- /dev/null +++ b/profiler/advisor/display/html/templates/fusion.html @@ -0,0 +1,47 @@ +{% if candidates|length > 0 %} +
+

Fusion Issues

+
+
+ {% for node in candidates %} +
{{node.op_pass|safe}}
+
+ + + + + + + + + + + +
StructureCountsElapsed Time(us)
{{ node.fusion_pattern|safe }}{{ node.counts|safe }}{{ node.total_duration|safe }}
+
+ {% for match in node.matches %} +
SubGraph {{ loop.index|safe }}
+
+ + + + + + + {% for node in match %} + + + + + + {% endfor %} +
OP NameOP TypeElapsed Time(us)
{{ node.op_name|safe }}{{ node.dtype|safe }}{{ node.duration|safe }}
+
+ {% endfor %} +
+
+ {% endfor %} +
+
+
+{% endif %} diff --git a/profiler/advisor/display/html/templates/main.html b/profiler/advisor/display/html/templates/main.html new file mode 100644 index 0000000000000000000000000000000000000000..3727125b419547fc6a9ac9743eab34e1e1b76256 --- /dev/null +++ b/profiler/advisor/display/html/templates/main.html @@ -0,0 +1,203 @@ + + + + + + + +
+

Performance Optimization Suggestions

+{% for key, renders in render_list.items() %} + {% if key == 'operator'%} +
+

computation

+
+ {% for render in renders %} + {{render|safe}} + {% endfor %} +
+
+ {% else %} +
+

{{ key }}

+
+ {% for render in renders %} + {{render|safe}} + {% endfor %} +
+
+ {% endif %} +{% endfor %} + +
+ + + + + \ No newline at end of file diff --git a/profiler/advisor/display/html/templates/operator_ai_cpu.html b/profiler/advisor/display/html/templates/operator_ai_cpu.html new file mode 100644 index 0000000000000000000000000000000000000000..b3235a88022fc3973ae0098f543d94cc4b7fac25 --- /dev/null +++ b/profiler/advisor/display/html/templates/operator_ai_cpu.html @@ -0,0 +1,61 @@ +
+

AICPU Issues

+
+ + + + + + + + + + + + + +
DescriptionSuggestionElapsed Time(us)Time Ratio
{{ format_result.record.optimization_item.description|safe }}{{ format_result.suggestion|safe }}{{ format_result.task_duration|safe }}{{ format_result.record.statistics_item.task_duration_ratio|safe }}
+
+ {% for op_type, op_info in format_result.statistic %} +
{{ op_type|safe }}
+
+ + + + + + + + + + + +
Operator TypeCountsElapsed Time(us)
{{ op_info.summary.op_type|safe }}{{ op_info.summary.counts|safe }}{{ op_info.summary.total_duration|safe }}
+
+ {% for trace_stack, info in op_info.op_info_list %} +
+ {{ info.summary.op_type|safe }} | Input DType:({{info.op_info_list[0].input_data_types|safe}}) | Output DType:({{info.op_info_list[0].output_data_types|safe}}) | Counts:{{ info.summary.counts|safe}} | Elapsed Time(us):{{ + info.summary.total_duration|safe}} +
+
+ {% if info.op_info_list[0].suggestions|length > 0 %} +
+ {% for suggestion in info.op_info_list[0].suggestions %} +

+ Suggestion {{ loop.index|safe }}: {{suggestion|safe}} +

+ {% endfor %} +
+ {% else %} +

Suggestion 1: Modify code to avoid AICPU operator

+ {% endif %} +
+ {{ info.op_info_list[0].stack_info|safe }} +
+ {% endfor %} +
+
+ {% endfor %} +
+
+
\ No newline at end of file diff --git a/profiler/advisor/display/html/templates/operator_block_dim.html b/profiler/advisor/display/html/templates/operator_block_dim.html new file mode 100644 index 0000000000000000000000000000000000000000..4e2c832f623a4c0a0f315ebdc2b7a97aeb1996a1 --- /dev/null +++ b/profiler/advisor/display/html/templates/operator_block_dim.html @@ -0,0 +1,38 @@ +
+

Block Dim Issues

+
+ + + + + + + + + + + + + +
DescriptionSuggestionElapsed Time(us)Time Ratio
{{ format_result.record.optimization_item.description|safe }}{{ format_result.suggestion|safe }}{{ format_result.task_duration|safe }}{{ format_result.record.statistics_item.task_duration_ratio|safe }}
+
+ {% for op_type, op_info in format_result.statistic %} +
{{ op_type|safe }}
+
+ + + + + + + + + + + +
Operator TypeCountsElapsed Time(us)
{{ op_info.summary.op_type|safe }}{{ op_info.summary.counts|safe }}{{ op_info.summary.total_duration|safe }}
+
+ {% endfor %} +
+
+
\ No newline at end of file diff --git a/profiler/advisor/display/html/templates/operator_dispatch.html b/profiler/advisor/display/html/templates/operator_dispatch.html new file mode 100644 index 0000000000000000000000000000000000000000..c805086354a41f7f98a803b66b3b666c59393899 --- /dev/null +++ b/profiler/advisor/display/html/templates/operator_dispatch.html @@ -0,0 +1,37 @@ +{% if optimizers|length > 0 %} +
+

Operator Dispatch Issues

+
+ + + + + + + {% for optimizer in optimizers %} + + + + + {% endfor %} +
DescriptionSuggestion
{{ optimizer.description |safe }}{{ optimizer.suggestion|safe }}
+ + + + + + + + + {% for issue in issues %} + + + + + + {% endfor %} +
IssueCountsElapsed Time(us)
{{ issue.op_name |safe }}{{ issue.counts |safe }}{{ issue.total_time |safe }}
+
+ +
+{% endif %} \ No newline at end of file diff --git a/profiler/advisor/display/html/templates/operator_dynamic_shape.html b/profiler/advisor/display/html/templates/operator_dynamic_shape.html new file mode 100644 index 0000000000000000000000000000000000000000..59920b6c9ec276c9edddfd1906a31b41fb106e26 --- /dev/null +++ b/profiler/advisor/display/html/templates/operator_dynamic_shape.html @@ -0,0 +1,15 @@ +
+

Operator Dynamic Shape Issues

+
+ + + + + + + + + +
DescriptionSuggestion
{{ format_result.record.optimization_item.description|safe }}{{ format_result.suggestion|safe }}
+
+
\ No newline at end of file diff --git a/profiler/advisor/display/html/templates/operator_no_bound.html b/profiler/advisor/display/html/templates/operator_no_bound.html new file mode 100644 index 0000000000000000000000000000000000000000..cfbd20baad208216d2d9a1ee856702a163a6abfa --- /dev/null +++ b/profiler/advisor/display/html/templates/operator_no_bound.html @@ -0,0 +1,38 @@ +
+

Operator No Bound Issues

+
+ + + + + + + + + + + + + +
DescriptionSuggestionElapsed Time(us)Time Ratio
{{ format_result.record.optimization_item.description|safe }}{{ format_result.suggestion|safe }}{{ format_result.task_duration|safe }}{{ format_result.record.statistics_item.task_duration_ratio|safe }}
+
+ {% for op_type, op_info in format_result.statistic %} +
{{ op_type|safe }}
+
+ + + + + + + + + + + +
Operator TypeCountsElapsed Time(us)
{{ op_info.summary.op_type|safe }}{{ op_info.summary.counts|safe }}{{ op_info.summary.total_duration|safe }}
+
+ {% endfor %} +
+
+
\ No newline at end of file
diff --git a/profiler/advisor/display/html/templates/overall_analysis.html b/profiler/advisor/display/html/templates/overall_analysis.html
new file mode 100644
index 0000000000000000000000000000000000000000..ec61ae224ff2da59f2a80a9b4b10117d4c4c7c7a
--- /dev/null
+++ b/profiler/advisor/display/html/templates/overall_analysis.html
@@ -0,0 +1,15 @@
+<div class="header">
+  Model Profiling Time Distribution
+</div>
+<table>
+  <tr>
+    {% for header in headers %}
+    <th>{{ header }}</th>
+    {% endfor %}
+  </tr>
+  {% for row in rows %}
+  <tr>
+    {% for element in row %}
+    <td>{{ element }}</td>
+    {% endfor %}
+  </tr>
+  {% endfor %}
+</table>
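The overall-analysis fragment is the simplest of the set: it just tabulates whatever `headers` and `rows` it is handed. A sketch reusing the time-breakdown figures that appear later in this diff (the exact context shape is assumed):

```python
# Example data for overall_analysis.html: a list of column headers plus
# a list of rows, each row itself a list of cell values.
context = {
    "headers": ["Computing Time", "Uncovered Communication Time", "Free Time"],
    "rows": [["6.273s", "0.464s", "2.615s"]],
}
# template.render(**context) emits the "Model Profiling Time Distribution" table.
```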
\ No newline at end of file
diff --git a/profiler/advisor/display/html/templates/timeline_analysis.html b/profiler/advisor/display/html/templates/timeline_analysis.html
new file mode 100644
index 0000000000000000000000000000000000000000..b5ea89124277e05e7fdea63a34704df52bb322d4
--- /dev/null
+++ b/profiler/advisor/display/html/templates/timeline_analysis.html
@@ -0,0 +1,34 @@
+<div class="content">
+  <div class="header">
+    {{title|safe}}
+  </div>
+  <div class="sub-content">
+    {% if result.get("img") %}
+    <div class="img">
+      <img src="{{ result.get('img')|safe }}" alt="Image">
+    </div>
+    {% endif %}
+    {% if result.get("current") %}
+    <p>{{ result.get("current")|safe }}</p>
+    {% endif %}
+    {% if result.get("bottlenect") %}
+    <p>{{ result.get("bottlenect")|safe }}</p>
+    {% endif %}
+    {% if result.get("advice") %}
+    <p>{{ result.get("advice")|safe }}</p>
+    {% endif %}
+  </div>
+</div>
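The timeline fragment keys everything off a single `result` dict; note that it looks up the literal key "bottlenect", so whatever populates the dict must use the same spelling. A hypothetical context, for illustration only:

```python
# Hypothetical context for timeline_analysis.html. Keys must match the
# template's result.get(...) lookups exactly, including "bottlenect".
context = {
    "title": "timeline analysis",
    "result": {
        "current": "Free Time is 2.615s",
        "bottlenect": "Free time takes more than 10% of E2E time",
        "advice": "Reduce host-side dispatch overhead, e.g. via affinity apis",
    },
}
```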
diff --git a/profiler/advisor/fusion_operators_api_analysis.ipynb b/profiler/advisor/fusion_operators_api_analysis.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..dcc71ba3c139f630c07545340e61c66b1f29d929 --- /dev/null +++ b/profiler/advisor/fusion_operators_api_analysis.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../..\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from prettytable import PrettyTable, ALL\n", + "from textwrap import fill\n", + "from profiler.advisor.interface.interface import Interface" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "profiling_path = \"YOUR PROFILING PATH\"\n", + "interface = Interface(profiling_path=profiling_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fusion operator API identification" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After specifying the profiling path, the tool can automatically identify the fusion operators it contains and report the corresponding torch_npu api together with the code stacks that need to be modified. The given stacks make it quick to locate the code segment to change; once the torch_npu api is substituted in, fewer small operators are dispatched on the pytorch side, which in turn speeds up model training." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "timeline_fusion_ops_result = interface.get_result(\"schedule\", \"timeline_fusion_ops\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [
"<table>\n", "<tr><th>problem</th><th>description</th><th>suggestion</th></tr>\n", "<tr><td>timeline_fusion_ops</td><td>Found 2 apis to be replaced based on the runtime env cann-8.0.0 and torch-2.1.0</td><td>1. Please replace training api according to sub table 'Affinity training api'</td></tr>\n", "</table>"
" + ], + "text/plain": [ + "+---------------------+---------------------------------------------------------------------------------+-------------------------------------------------------------------------------+\n", + "| problem | description | suggestion |\n", + "+---------------------+---------------------------------------------------------------------------------+-------------------------------------------------------------------------------+\n", + "| timeline_fusion_ops | Found 2 apis to be replaced based on the runtime env cann-8.0.0 and torch-2.1.0 | 1. Please replace training api according to sub table 'Affinity training api' |\n", + "+---------------------+---------------------------------------------------------------------------------+-------------------------------------------------------------------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_column_num = 3\n", + "problems = timeline_fusion_ops_result.get(\"problems\")\n", + "problem_table = PrettyTable(problems.get(\"headers\")[:display_column_num])\n", + "for row in problems.get(\"data\"):\n", + " for i in range(len(row)):\n", + " row[i] = fill(str(row[i]), width=80)\n", + " problem_table.add_row(row[:display_column_num])\n", + "\n", + "display(problem_table)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "如下所示,存在亲和优化器和梯度裁剪两个可替换的torch_npu api,并给出了具体的堆栈。" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
"<table>\n", "<tr><th>Affinity API</th><th>Code stacks</th><th>Stack called counts</th></tr>\n", "<tr><td>optimizer.clip_grad_norm_fused_</td><td>/home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site-packages/torch/nn/utils/clip_grad.py(49): clip_grad_norm_; /home/ma-user/work/algorithms/doc_cls/Bert.py(205): train_epoch; /home/ma-user/work/algorithms/doc_cls/Bert.py(252): &lt;module&gt;</td><td>2</td></tr>\n", "<tr><td>torch_npu.optim.NpuFusedAdamW</td><td>/home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site-packages/torch_npu/npu/profiler.py(675): __enter__; /home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site-packages/torch_npu/npu/profiler.py(719): wrapper; /home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site-packages/torch/optim/lr_scheduler.py(65): wrapper; /home/ma-user/work/algorithms/doc_cls/Bert.py(219): train_epoch; /home/ma-user/work/algorithms/doc_cls/Bert.py(252): &lt;module&gt;</td><td>2</td></tr>\n", "</table>
" + ], + "text/plain": [ + "+---------------------------------+-----------------------------------------------------------------------+---------------------+\n", + "| Affinity API | Code stacks | Stack called counts |\n", + "+---------------------------------+-----------------------------------------------------------------------+---------------------+\n", + "| optimizer.clip_grad_norm_fused_ | /home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- | 2 |\n", + "| | packages/torch/nn/utils/clip_grad.py(49): clip_grad_norm_; /home/ma- | |\n", + "| | user/work/algorithms/doc_cls/Bert.py(205): train_epoch; /home/ma- | |\n", + "| | user/work/algorithms/doc_cls/Bert.py(252): | |\n", + "+---------------------------------+-----------------------------------------------------------------------+---------------------+\n", + "| torch_npu.optim.NpuFusedAdamW | /home/ma-user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- | 2 |\n", + "| | packages/torch_npu/npu/profiler.py(675): __enter__; /home/ma- | |\n", + "| | user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- | |\n", + "| | packages/torch_npu/npu/profiler.py(719): wrapper; /home/ma- | |\n", + "| | user/anaconda3/envs/PyTorch-1.11.0/lib/python3.9/site- | |\n", + "| | packages/torch/optim/lr_scheduler.py(65): wrapper; /home/ma- | |\n", + "| | user/work/algorithms/doc_cls/Bert.py(219): train_epoch; /home/ma- | |\n", + "| | user/work/algorithms/doc_cls/Bert.py(252): | |\n", + "+---------------------------------+-----------------------------------------------------------------------+---------------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fusion_ops_api = timeline_fusion_ops_result.get(\"timeline_fusion_ops\")\n", + "if fusion_ops_api:\n", + " fusion_ops_api_table = PrettyTable(fusion_ops_api.get(\"headers\"))\n", + "\n", + " for row in fusion_ops_api.get(\"data\"):\n", + " for i in range(len(row)):\n", + " row[i] = fill(str(row[i]), width=80)\n", + " fusion_ops_api_table.add_row(row)\n", + "\n", + " fusion_ops_api_table.hrules = ALL\n", + " display(fusion_ops_api_table)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/profiler/advisor/interface/__init__.py b/profiler/advisor/interface/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/interface/interface.py b/profiler/advisor/interface/interface.py new file mode 100644 index 0000000000000000000000000000000000000000..231f595d70b7e9dd6ee436153dc24259cfef640b --- /dev/null +++ b/profiler/advisor/interface/interface.py @@ -0,0 +1,75 @@ +import os +from collections import OrderedDict +import sys +sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), "cluster_analyse")) +sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), "compare_tools")) + +from profiler.advisor.utils.utils import Timer +from profiler.advisor.analyzer.computation.profiling_analyzer import AicpuAnalyzer, BlockDimAnalyzer, DynamicShapeAnalyzer, OperatorBoundAnalyzer 
+from profiler.advisor.analyzer.schedule.fusion_ops.fusion_ops_analyzer import TimelineFusionOpsAnalyzer +from profiler.advisor.analyzer.graph_fusion.graph_fusion_analyzer import FusionOPAnalyzer +from profiler.advisor.common.analyzer_scopes import SupportedScopes +from profiler.advisor.analyzer.cluster.slow_rank_analyser import SlowRankAnalyzer +from profiler.advisor.analyzer.cluster.slow_link_analyser import SlowLinkAnalyzer +from profiler.advisor.analyzer.overall.overall_summary_analyzer import OverallSummaryAnalyzer +from profiler.advisor.analyzer.schedule.dispatch.timeline_op_dispatch_analyzer import OpDispatchAnalyzer + +class Interface: + supported_analyzer = { + "schedule": OrderedDict({ + SupportedScopes.TIMELINE_FUSION_OPS: TimelineFusionOpsAnalyzer + }), + "computation": OrderedDict({ + SupportedScopes.DYNAMIC_SHAPE_ANALYSIS: DynamicShapeAnalyzer, + SupportedScopes.AICPU_ANALYSIS: AicpuAnalyzer, + SupportedScopes.OPERATOR_NO_BOUND_ANALYSIS: OperatorBoundAnalyzer, + SupportedScopes.BLOCK_DIM_ANALYSIS: BlockDimAnalyzer, + SupportedScopes.GRAPH: FusionOPAnalyzer, + SupportedScopes.TIMELINE_OP_DISPATCH: OpDispatchAnalyzer + }), + "communication": OrderedDict(), + "overall": OrderedDict({SupportedScopes.OVER_ALL: OverallSummaryAnalyzer}), + "dataloader": OrderedDict(), + "cluster": OrderedDict({ + SupportedScopes.SLOW_RANK: SlowRankAnalyzer, + SupportedScopes.SLOW_LINK: SlowLinkAnalyzer + }) + } + + all_dimension = list(supported_analyzer.keys()) + + def __init__(self, **kwargs): + self.collection_path = os.path.realpath(kwargs.get("profiling_path")) + + @staticmethod + def get_scope(dimension): + return list(Interface.supported_analyzer.get(dimension).keys()) + + @staticmethod + def get_analyzer(dimension, scope): + return Interface.supported_analyzer.get(dimension).get(scope) + + def get_result(self: any, dimension: str, scope: str, render_html=False, output_dict=True, **kwargs): + """ + :Param mode: affinity apis, ai cpu and so on. 
+ """ + if dimension not in self.all_dimension: + raise ValueError(f"Error dimension {dimension}, supported dimensions are {self.all_dimension}") + + supported_scopes = self.get_scope(dimension) + if scope not in supported_scopes: + raise ValueError(f"Error scope {scope}, supported scopes are {supported_scopes}") + + analyzer = self.get_analyzer(dimension, scope)(collection_path=self.collection_path, **kwargs) + result = analyzer.optimize(**kwargs) + + if render_html and result.data: + if hasattr(analyzer, "html_render"): + analyzer.html_render.render_html() + analyzer.html_render.save_to_file(f'att_advisor_{Timer().strftime}.html') + + return result if not output_dict else dict(result.data) + + +if __name__ == "__main__": + Interface() diff --git a/profiler/advisor/overall_perf_analysis.ipynb b/profiler/advisor/overall_perf_analysis.ipynb deleted file mode 100644 index 0d1d5fcf66761bf9eaa18a8524a4d8b3369693e1..0000000000000000000000000000000000000000 --- a/profiler/advisor/overall_perf_analysis.ipynb +++ /dev/null @@ -1,323 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 18, - "id": "initial_id", - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-21T13:31:25.022339600Z", - "start_time": "2023-11-21T13:31:25.016155200Z" - } - }, - "outputs": [], - "source": [ - "from advisor_backend.interface import Interface\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "id": "57d17a21205c3c5e", - "metadata": { - "jupyter": { - "outputs_hidden": false - } - }, - "source": [ - "# 总体性能拆解分析\n", - "### 1. 数据准备\n", - "我们当前支持Ascend PyTorch Profiler工具采集到的性能数据,您需要采集到的profiling_path路径,指定到*_ascend_pt。\n", - "\n", - "### 2. 拆解项说明\n", - "将整体耗时拆解为计算(Computing Time)、通信(Uncovered Communication Time)和空闲(Free Time)3个部分。\n", - "\n", - "1). Computing Time:指device在执行计算的耗时,若存在多条流并行计算的情况,对于耗时重叠部分只会计算一次\n", - "\n", - "计算耗时细分如下\n", - "\n", - " Cube Time:Cube算子耗时,该耗时占Computing Time的60%以上更能充分发挥NPU的算力\n", - " Vector Time:Vector算子耗时\n", - " Flash Attention Time(Forward):Flash Attention算子前向耗时\n", - " Flash Attention Time(Backward):Flash Attention算子反向耗时\n", - " Oter Time:AI CPU、DSA、TensorMove等其他算子耗时\n", - " \n", - "2). Uncovered Communication Time:未被计算掩盖的通信耗时,即总通信耗时减去通信与计算并行执行的耗时\n", - "\n", - "3). 
Free Time:指device既不在通信又不在计算的时间,空闲耗时 = 整体耗时 - 计算耗时 - 未被计算掩盖的通信耗时,该时间包含下发调度、SDMA时间(内存拷贝时间)。该耗时建议保持在10%以下\n", - "\n", - "空闲耗时细分如下\n", - "\n", - " SDMA Time:内存拷贝任务的耗时\n", - "\n", - "特别说明:通信(Uncovered Communication Time)和空闲(Free Time)耗时会受profiling性能膨胀的影响,以L0 + NPU采集的profiling为准。" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "36b7a24cc7ca5da2", - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-21T12:53:38.379699800Z", - "start_time": "2023-11-21T12:53:38.363755900Z" - }, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "# 数据准备 EDIT THE PROFILING DATA PATH\n", - "profiling_path = \"YOUR PATH\"\n", - "# 若您有GPU上采集到的性能数据,可将NPU的性能数据与GPU之间进行对比,分析性能差距。输入GPU的性能数据路径\n", - "gpu_profiling_path = \"\" #默认为空,若有则可填写\n", - "interface = Interface(profiling_path)" - ] - }, - { - "cell_type": "markdown", - "id": "cf832ac2e0dfa30f", - "metadata": { - "jupyter": { - "outputs_hidden": false - } - }, - "source": [ - "## 1) 性能拆解分析" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "40aac93278dd6e34", - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-21T12:53:41.815599700Z", - "start_time": "2023-11-21T12:53:41.783393700Z" - }, - "jupyter": { - "outputs_hidden": false - }, - "scrolled": false - }, - "outputs": [], - "source": [ - "print(\"Start performance analysis, please wait...\")\n", - "dataset = interface.get_data('overall', 'summary', base_collection_path=gpu_profiling_path)\n", - "data = dataset.get('data', {}) or {}\n", - "bottleneck = dataset.get('bottleneck', {}) or {}\n", - "print(\"Performance analysis is complete, you can edit the data to show what you want.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3f353506", - "metadata": {}, - "outputs": [], - "source": [ - "# 等待性能分析完成后再查看数据" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "cd3fceda-49f0-439f-9c54-cc31490fc99e", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAiIAAAH2CAYAAABN8+eOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy88F64QAAAACXBIWXMAAA9hAAAPYQGoP6dpAABsjklEQVR4nO3dd3hTZcMG8DujTZqme6V0Ugod0LK3LAFZKoggr2wHoIKAivI6UFTgE9RXHAwXQ0QEURFBQJkCsvcoFAoFCi0tdO82Od8fpZHQAi00fTLu33X1gqQn59xN193nPOeJTJIkCUREREQCyEUHICIiIvvFIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJWY9GiRZDJZJDJZNi6dWuF90uShPDwcMhkMnTu3LlGjy2TyTB16tRqPy4xMREymQyLFi2q0nblb3K5HF5eXujduzd27dp1b6Hv4K233kJwcDCUSiXc3d0BAJ07d67wvN36cW/duvW2z785hYaGmjw/t3tbtGgRpk6dCplMVqv57ubmjAqFAh4eHmjcuDHGjBmD3bt3V9i+ql83t/rhhx8we/bsaj2msmOVP4fXrl2r1r7u5OTJk5g6dSoSExMrvG/kyJEIDQ2tsWORdVGKDkBUXS4uLvj2228r/NLctm0bEhIS4OLiIiZYDXjxxRcxePBg6PV6nDhxAu+++y66dOmCXbt2oWnTpjVyjN9++w3Tp0/Hm2++iV69ekGlUgEA5s6de9fHNmvWDLt27UJ0dHSNZKmqX3/9FUVFRcbb33zzDb799lusX78ebm5uxvvr1auHoqIi9OzZs1bzVcWAAQPwyiuvQJIkZGdn4/jx4/juu+/w1VdfYfz48fj000+N2/r7+2PXrl2oV69etY7xww8/4Pjx45g4cWKVH3Ovx6qukydP4t1330Xnzp0rlI4pU6ZgwoQJZj0+WS4WEbI6gwYNwtKlSzFnzhy4uroa7//222/Rtm1bZGdnC0x3f4KDg9GmTRsAQPv27REeHo6uXbti7ty5+Prrryt9TEFBAdRqdZVHAY4fPw4AGD9+PHx9fY33V6VcuLq6GvPVpltL2Pr16wEAzZs3h7e3d4XtAwMDayVXdfj5+Zk8dz169MDEiRMxevRofPbZZ4iMjMTzzz8PAFCpVGZ/nvV6PUpLS2vlWHdj7hJElo2nZsjqPPnkkwCAZcuWGe/LysrCzz//jKeffrrSx6Snp+OFF15AQEAAHB0dERYWhjfffNPkr2wAyM7OxqhRo+Dl5QWtVouePXsiPj6+0n2eOXMGgwcPhq+vL1QqFaKiojBnzpwa+ijLlP+CuHDhAoB/T0/9+eefePrpp+Hj4wONRoOioiIYDAbMmjULkZGRUKlU8PX1xfDhw5GUlGTcX2hoKN566y0AZb8Ybz71UtmpmVtVdmpm5MiR0Gq1OHv2LHr37g2tVougoCC88sorFZ7fpKQkDBgwAC4uLnB3d8eQIUOwb9++ezoNcTuVnZoJDQ3Fww8/jDVr1qBp06ZwcnJCVFQU1qxZA6DseY2KioKzszNatWqF/fv3V9jv/v378eijj8LT0xNqtRpNmzbFihUr7iurQqHAF198AW9vb3z44YfG+ys7XZKWlobRo0cjKCgIKpUKPj4+aN++PTZu3Aig7PO3du1aXLhwweRU0M37mzVrFqZNm4a6detCpVJhy5YtdzwNdOnSJfTv3x+urq5wc3PD0KFDkZaWZrLN7U5bhoaGYuTIkQDKnt+BAwcCALp06WJyKg2o/NRMYWEhXn/9ddStWxeOjo4ICAjA2LFjkZmZWeE4Dz/8MNavX49mzZrByckJkZGRWLBgwV2efbIULCJkdVxdXTFgwACTHzTLli2DXC7HoEGDKmxfWFiILl264LvvvsPLL7+MtWvXYujQoZg1axb69+9v3E6SJPTr1w9LlizBK6+8gl9//RVt2rRBr169Kuzz5MmTaNmyJY4fP46PP/4Ya9asQZ8+fTB+/Hi8++67Nfaxnj17FgDg4+Njcv/TTz8NBwcHLFmyBCtXroSDgwOef/55TJ48Gd27d8fq1avx/vvvY/369WjXrp3xXP+vv/6KZ555BkDZqMKuXbvw7LPP3nfOkpISPProo+jatSt+++03PP300/jkk08wc+ZM4zZ5eXno0qULtmzZgpkzZ2LFihXw8/Or9HNmDkeOHMHrr7+OyZMn45dffoGbmxv69++Pd955B9988w1mzJiBpUuXIisrCw8//DAKCgqMj92yZQvat2+PzMxMzJ8/H7/99huaNGmCQYMG3XeBcnJyQrdu3XD+/HmT0nirYcOGYdWqVXj77bfx559/4ptvvkG3bt1w/fp1AGWn1tq3bw+dToddu3YZ32722WefYfPmzfjoo4+wbt06REZG3jHbY489hvDwcKxcuRJTp07FqlWr0KNHD5SUlFTrY+zTpw9mzJgBAJgzZ44xW58+fSrdvvx78aOPPsKwYcOwdu1avPzyy1i8eDEefPDBCgX3yJEjeOWVV/DSSy/ht99+Q2xsLJ555hn8/fff1cpJgkhEVmLhwoUSAGnfvn3Sli1bJADS8ePHJUmSpJYtW0ojR46UJEmSGjZsKHXq1Mn4uPnz50sApBUrVpjsb+bMmRIA6c8//5QkSZLWrVsnAZA+/fRTk+2mT58uAZDeeecd4309evSQAgMDpaysLJNtx40bJ6nVaik9PV2SJEk6f/68BEBauHDhHT+28u1mzpwplZSUSIWFhdKBAwekli1bSgCktWvXmjwHw4cPN3l8XFycBEB64YUXTO7fs2ePBEB64403jPe98847EgApLS3NZNtOnTqZPG+SJFX4uMuf9y1bthjvGzFiRKXPb+/evaWIiAjj7Tlz5kgApHXr1plsN2bMmCo9Rze73cdw8/tuFhISIjk5OUlJSUnG+w4fPiwBkPz9/aW8vDzj/atWrZIASKtXrzbeFxkZKTVt2lQqKSkx2e/DDz8s+fv7S3q9/o55AUhjx4697fsnT54sAZD27NkjSVLlXzdarVaaOHHiHY/Tp08fKSQkpML95furV6+eVFxcXOn7bj5W+XP40ksvmWy7dOlSCYD0/fffm3xsN3+NlAsJCZFGjBhhvP3TTz9V+NopN2LECJPc69evlwBIs2bNMtlu+fLlEgDpq6++MjmOWq2WLly4YLyvoKBA8vT0lMaMGVPhWGR5OCJCVqlTp06oV68eFixYgGPHjmHfvn23PS2zefNmODs7Y8CAASb3lw8bb9q0CUDZX70AMGTIEJPtBg8ebHK7sLAQmzZtwmOPPQaNRoPS0lLjW+/evVFYWFjplRBVMXnyZDg4OECtVqN58+a4ePEivvzyS/Tu3dtku8cff9zkdnn28o+pXKtWrRAVFWX8GM1FJpPhkUceMbkvNjbWeEoJKJtM7OLiUmEiafmpNnNr0qQJAgICjLejoqIAlJ3S0Gg0Fe4vz3727FmcOnXK+HVx6+c7OTkZp0+fvq9skiTddZtWrVph0aJFmDZtGnbv3l
3tUQkAePTRR+Hg4FDl7W/9XnjiiSegVCqNX2/msnnzZgAVv54HDhwIZ2fnCl/PTZo0QXBwsPG2Wq1GgwYNTL7+yHKxiJBVkslkeOqpp/D9999j/vz5aNCgATp06FDpttevX4dOp6swb8DX1xdKpdI4tH39+nUolUp4eXmZbKfT6Srsr7S0FJ9//jkcHBxM3soLw71e9jhhwgTs27cPBw4cQEJCApKTkzF69OgK2/n7+1fIVNn9AFCnTh3j+81Fo9FArVab3KdSqVBYWGiS0c/Pr8JjK7vPHDw9PU1uOzo63vH+8uxXr14FAEyaNKnC5/uFF14AcO+f73LlvzDr1Klz222WL1+OESNG4JtvvkHbtm3h6emJ4cOHIyUlpcrHqezr405u/dov//4w99dT+ffirackZTIZdDpdhePf+j0LlH393Xx6jSwXr5ohqzVy5Ei8/fbbmD9/PqZPn37b7by8vLBnzx5IkmRSRlJTU1FaWmq86sLLywulpaW4fv26yQ+2W3/Qe3h4QKFQYNiwYRg7dmylx6xbt+49fUyBgYFo0aLFXbe7tVSV501OTq5wxciVK1cqvbKktnl5eWHv3r0V7q/OL1IRyp+7119/3WRO0c0iIiLuef8FBQXYuHEj6tWrd8erfby9vTF79mzMnj0bFy9exOrVq/Hf//4XqampxquI7qa666ukpKSYjCJV9v2hUqkqzNkAcF9lpfx7MS0tzaSMSJKElJQUtGzZ8p73TZaHIyJktQICAvDqq6/ikUcewYgRI267XdeuXZGbm4tVq1aZ3P/dd98Z3w+UzeYHgKVLl5ps98MPP5jc1mg06NKlCw4dOoTY2Fi0aNGiwltlf6GZ04MPPggA+P77703u37dvH+Li4owfo0idOnVCTk4O1q1bZ3L/jz/+KChR1URERKB+/fo4cuRIpZ/rFi1a3PPaNXq9HuPGjcP169cxefLkKj8uODgY48aNQ/fu3XHw4EHj/TU9CnDr98KKFStQWlpqcnVVaGgojh49arLd5s2bkZuba3Jf+Xo1VclX/vV669fzzz//jLy8PIv4eqaawxERsmoffPDBXbcZPnw45syZgxEjRiAxMRExMTHYsWMHZsyYgd69e6Nbt24AgIceeggdO3bEa6+9hry8PLRo0QI7d+7EkiVLKuzz008/xQMPPIAOHTrg+eefR2hoKHJycnD27Fn8/vvvxnPctSUiIgKjR4/G559/Drlcjl69eiExMRFTpkxBUFAQXnrppVrNU5kRI0bgk08+wdChQzFt2jSEh4dj3bp12LBhAwBALrfcv4u+/PJL9OrVCz169MDIkSMREBCA9PR0xMXF4eDBg/jpp5/uuo+rV69i9+7dkCQJOTk5xgXNjhw5gpdeegmjRo267WOzsrLQpUsXDB48GJGRkXBxccG+ffuwfv16k1GamJgY/PLLL5g3bx6aN28OuVxepRG22/nll1+gVCrRvXt3nDhxAlOmTEHjxo3xxBNPGLcZNmwYpkyZgrfffhudOnXCyZMn8cUXX5gsNAcAjRo1AgB89dVXcHFxgVqtRt26dSst7d27d0ePHj0wefJkZGdno3379jh69CjeeecdNG3aFMOGDbvnj4ksD4sI2Ty1Wo0tW7bgzTffxIcffoi0tDQEBARg0qRJeOedd4zbyeVyrF69Gi+//DJmzZqF4uJitG/fHn/88UeFyxyjo6Nx8OBBvP/++3jrrbeQmpoKd3d31K9fv8LE0toyb9481KtXD99++y3mzJkDNzc39OzZE//3f/9X6yM0lXF2dsbmzZsxceJEvPbaa5DJZHjooYcwd+5c9O7d27jUvCXq0qUL9u7di+nTp2PixInIyMiAl5cXoqOjTX4p38nKlSuxcuVKyOVyaLVahISEoG3btpg/f/5dFxRTq9Vo3bo1lixZgsTERJSUlCA4OBiTJ0/Ga6+9ZtxuwoQJOHHiBN544w1kZWVBkqQqTYS9nV9++QVTp07FvHnzjBOSZ8+ebZxHAwCvvvoqsrOzsWjRInz00Udo1aoVVqxYgb59+5rsq27dupg9ezY+/fRTdO7cGXq9HgsXLqwwIRUoO4W0atUqTJ06FQsXLsT06dPh7e2NYcOGYcaMGcbRFbINMul+vkqJiO7TjBkz8NZbb+HixYsWuSIqEZkXR0SIqNZ88cUXAIDIyEiUlJRg8+bN+OyzzzB06FCWECI7xSJCRLVGo9Hgk08+QWJiIoqKioynF8qXnSci+8NTM0RERCSM5U5TJyIiIpvHIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJAyLCBEREQnDIkJERETCsIgQERGRMCwiREREJIxSdAAioltlF5YgI68YGfklyCsqRbHegFK9hFK9ASUGCSWlBuglybi97KbHOijkcFYp4aJWQnvjXxe1A1zUSjgo+LcXkaVhESEis9MbJFzOKEDi9TykZBciM7+sZJSVjWJk5JWU/ZtfjMz8EpQapLvv9B6olHJjMSkvKW5ODvB3c0KghxMCPMr+DXTXwE3jYJYMRGRKJkmSeb7jiciu3Fw2Eq/nIfFa/o1/85CUUYBivUF0xGpxUSkR4OGEAPebS4oGwZ4ahPtqoXZQiI5IZBNYRIio2i5nFuDopUwcScpC/NUcqy0b90ohl6GejzMa1nFDwzquiPZ3RcM6bhxFIboHLCJ0T0JDQzFx4kRMnDhRaI7OnTujSZMmmD17ttActiwrvwRHkjJx5FJm2b9JWUjLKRIdyyIFuDshuo4rGtZxNZaUOu5OomMRWTQWETNLSUnB9OnTsXbtWly+fBm+vr5o0qQJJk6ciK5du4qOd1eLFi3CxIkTkZmZaXJ/WloanJ2dodFozHLcrVu3okuXLnfcZuHChXj00Ufh4OAAFxcXs+SwN8WlBhy7nPVv6biUiQvp+eBPiXvnrXVE67peaFvPC+3DvVHX21l0JCKLwsmqZpSYmIj27dvD3d0ds2bNQmxsLEpKSrBhwwaMHTsWp06dEh3xnvn4+Jh1/+3atUNycrLx9oQJE5CdnY2FCxca73Nzc4OTE//avF8Xr+dja3wq/o5Pw66E68gr1ouOZFOu5RZj7bFkrD1W9vVcx02NtvW80e5GMdG5qQUnJBKL17KZ0QsvvACZTIa9e/diwIABaNCgARo2bIiXX34Zu3fvNm538eJF9O3bF1qtFq6urnjiiSdw9epV4/unTp2KJ
k2aYMGCBQgODoZWq8Xzzz8PvV6PWbNmQafTwdfXF9OnTzc5vkwmw7x589CrVy84OTmhbt26+Omnn4zv37p1K2Qymclox+HDhyGTyZCYmIitW7fiqaeeQlZWFmQyGWQyGaZOnQqg7NTMzadDZDIZvvnmGzz22GPQaDSoX78+Vq9ebZJn9erVqF+/PpycnNClSxcsXry4wvHLOTo6QqfTGd+cnJygUqkq3Ne5c2eT00OhoaGYNm0ahg8fDq1Wi5CQEPz2229IS0szPscxMTHYv3+/yfH++ecfdOzYEU5OTggKCsL48eORl5d3t0+xVcovLsWmuKt4+7fj6PzhFnT8cAve/u0ENsalsoTUgitZhfj5YBJe+ekI2vzfJjz40Va8+esx/HEsGRl5xaLjEdU6FhEzSU9Px/r16zF27Fg4O1ccinV3dwcASJKEfv36IT09Hdu2bcNff/2FhIQEDBo0yGT7hIQErFu3DuvXr8eyZcuwYMEC9OnTB0lJSdi2bRtmzpyJt956y6TgAMCUKVPw+OOP48iRIxg6dCiefPJJxMXFVeljaNeuHWbPng1XV1ckJycjOTkZkyZNuu327777Lp544gkcPXoUvXv3xpAhQ5Ceng6gbHRowIAB6NevHw4fPowxY8bgzTffrFKO6vrkk0/Qvn17HDp0CH369MGwYcMwfPhwDB06FAcPHkR4eDiGDx+O8rOSx44dQ48ePdC/f38cPXoUy5cvx44dOzBu3Diz5BPhVEo2vtyWgCHf7EaT9/7CM4v347tdF5B4PV90NLt37loelu65iBeWHkSzaX+h75ydmL8tAZfS+bkh+8BTM2Zy9uxZSJKEyMjIO263ceNGHD16FOfPn0dQUBAAYMmSJWjYsCH27duHli1bAgAMBgMWLFgAFxcXREdHo0uXLjh9+jT++OMPyOVyREREYObMmdi6dSvatGlj3P/AgQPx7LPPAgDef/99/PXXX/j8888xd+7cu34Mjo6OcHNzg0wmg06nu+v2I0eOxJNPPgkAmDFjBj7//HPs3bsXPXv2xPz58xEREYEPP/wQABAREYHjx49XGMWpCb1798aYMWMAAG+//TbmzZuHli1bYuDAgQCAyZMno23btrh69Sp0Oh0+/PBDDB482DiyUr9+fXz22Wfo1KkT5s2bB7XaOofOj1zKxG+Hr2Dd8WQkZxWKjkNVIElln7cjlzLxwbpTiAlwQ68YHfrE+CPEi3NLyDaxiJhJ+V/bMpnsjtvFxcUhKCjIWEIAIDo6Gu7u7oiLizMWkdDQUJMJmX5+flAoFJDL5Sb3paammuy/bdu2FW4fPnz4nj6mu4mNjTX+39nZGS4uLsY8p0+fNn4s5Vq1amX2HH5+fgCAmJiYCvelpqZCp9PhwIEDOHv2LJYuXWrcRpIkGAwGnD9/HlFRUWbJaQ7nr+Vh1aHL+P3IFZy7ZpunluzJsctZOHY5C7PWn0bDOq7oHeOP3jH+nPBKNoVFxEzq168PmUyGuLg49OvX77bbSZJUaVm59X4HB9P1CWQyWaX3GQx3X8ehfL/lJebmC6dKSkru+vjbuVOeyj5Oc12wdXOO8mNWdl95NoPBgDFjxmD8+PEV9hUcHGyWjDUpNbsQq49cwW+Hr+DY5SzRcchMTlzJxokr2fhww2lE+buidyMd+sT6I8xHKzoa0X1hETETT09P9OjRA3PmzMH48eMrzBPJzMyEu7s7oqOjcfHiRVy6dMk4KnLy5ElkZWXVyF/iu3fvxvDhw01uN23aFMC/V74kJyfDw8MDACqMljg6OkKvv/8JjJGRkfjjjz9M7rt1wqgozZo1w4kTJxAeHi46SpVlFZRg/fFk/Hb4Cnafuw4zrYhOFiouORtxydn4+K94tAjxwJA2wegd4w+Vkqu9kvXhZFUzmjt3LvR6PVq1aoWff/4ZZ86cQVxcHD777DPjKZNu3bohNjYWQ4YMwcGDB7F3714MHz4cnTp1QosWLe47w08//YQFCxYgPj4e77zzDvbu3WuchBkeHo6goCBMnToV8fHxWLt2LT7++GOTx4eGhiI3NxebNm3CtWvXkJ9/bxPoxowZg1OnTmHy5MmIj4/HihUrsGjRIgB3P31lbpMnT8auXbswduxYHD58GGfOnMHq1avx4osvCs1VmQMX0vHiskNoOX0jJv98DP8ksITYu/0XMvDS8iNoM2MTpq89ifM8JUdWhkXEjOrWrYuDBw+iS5cueOWVV9CoUSN0794dmzZtwrx58wCU/RJetWoVPDw80LFjR3Tr1g1hYWFYvnx5jWR499138eOPPyI2NhaLFy/G0qVLER0dDaDsdMWyZctw6tQpNG7cGDNnzsS0adNMHt+uXTs899xzGDRoEHx8fDBr1qx7ylG3bl2sXLkSv/zyC2JjYzFv3jzjVTMqler+Psj7FBsbi23btuHMmTPo0KEDmjZtiilTpsDf319ornJFpXqsPJCERz7fgcfn7cLvR66guNQ+llKnqsvIL8HX28/jwY+3YvDXu7H2aDJK7GTJfbJuXFnVhslkMvz66693nKMi0vTp0zF//nxcunRJdBSLdDW7EEt2XcCyvRdxnetL0D3w1qrwRItAPNkqGEGe5lkFmeh+cY4I1Zq5c+eiZcuW8PLyws6dO/Hhhx/a1FodNWV/YjoW/pOIDcdTUMrzLnQfruUWYe7WBMzfloCODXwwqkMY2od7i45FZIJFhGrNmTNnMG3aNKSnpyM4OBivvPIKXn/9ddGxLEJRqR6rD1/B4l2JOH45W3QcsjEGCdh6Og1bT6ehabA7xnUJR9coP9GxiADw1AyRUEWleizbcxHztiXgajZf0ZZqT7S/K8Y9GI5ejXTCJ4yTfWMRIRKguNSA5fsvYe6Ws1z1lISq76vFC13q4dHGAVDIWUio9rGIENWiUr0BKw8k4fPNZ3E5s0B0HCKjEC8NnutUD483C4SjkhdUUu1hESGqBXqDhF8OlhWQi3wxM7JgddzUGNOpHv7TKogLpFGtYBEhMiODQcJvRy7js01nudAUWZVADye82iMCjzauwzkkZFYsIkRmsuFECj7ccBpnU3NFRyG6Z40D3fBG7yi0DvMSHYVsFIsIUQ07nZKD99acwM6z10VHIaox3aP98HqvSL7IHtU4FhGiGpKZX4z//RWPpXsuQs+FyMgGOShkeKp9XYzvWh9aFZehoprBIkJ0nwwGCUv3XMDHf8UjM79EdBwis/NxUeG1HhEY0DyQ80fovrGIEN2HI5cy8daq4zh2OUt0FKJa1zjIHVMfiUbTYA/RUciKsYgQ3YOsghLMWn8Ky/ZeBM/CkD2Ty4CR7eri1R4RcHLk5b5UfSwiRNX0y8EkzPgjDtdy+Yq4ROVCvDSY+Xgs2vDqGqomFhGiKkrLKcLrvxzFxrhU0VGILJJMBgxrE4LJPSPhzMmsVEUsIkRVsP54Mt749TjS8zgKQnQ3gR5OmPl4LNqHe4uOQlaARYToDrILS/DObyfw66HLoqMQWZ0nWwXhjd5RcFE7iI5CFoxFhOg2dp69hld/OoIrfHVcontWx02NGf1j0DnCV3QU
slAsIkS3KCzR44N1p7B4VyL43UFUMwY2D8TURxty7ghVwCJCdJMjlzLx0orDOJfGF6gjqmn1fJwxd0hzROhcREchC8IiQgRAb5Dw2aYzmLPlLEq5MAiR2Tg5KPB+v0YY0DxQdBSyECwiZPcy8oox/sdD2H7mmugoRHZjYPNAvN+vEdQOXATN3rGIkF07fjkLY5YcwOXMAtFRiOxOpM4Fc4c04yv62jkWEbJbKw8k4c1fj6Go1CA6CpHd0qqU+ODxGDwcW0d0FBKERYTsTonegHd/P4Hvd18UHYWIbhjeNgRv9YmGo1IuOgrVMhYRsiup2YV4fulBHLiQIToKEd0iNtANcwY3Q5CnRnQUqkUsImQ39iWm44WlB5GWUyQ6ChHdhqezI74e3gLNQzxER6FawiJCdmHRzvOY/kccSvT8cieydCqlHJ8MaoLeMf6io1AtYBEhm6Y3SHjz12P4cd8l0VGIqBpkMuC/PSMxplM90VHIzFhEyGYVlugx7oeD2BiXKjoKEd2joW2C8e6jjaCQy0RHITNhESGblJlfjGcW7+ekVCIb8GCkL74Y3BQaR75OjS1iESGbcyWzACMW7MWZ1FzRUYiohjQKcMWCES3h66oWHYVqGIsI2ZQzV3MwfMFeJGcVio5CRDUswN0JC0a25Ivm2RgWEbIZ+xPT8czi/cgqKBEdhYjMxEWtxPyhzdE+3Ft0FKohLCJkE/46eRUvLjuIwhIu105k6xwVcswb2gxdo/xER6EawCJCVu/HvRfx5qrj0Bv4pUxkLxwVcswZ0gzdo1lGrB2LCFm1L7cl4P/WnRIdg4gEcFDI8MXgZujRUCc6Ct0HvroQWa1vd5xnCSGyYyV6CeN+OIj1x5NFR6H7wCJCVmnJ7gt4f81J0TGISLCyMnIIfxxjGbFWLCJkdZbvu4i3fzsuOgYRWYhSg4Txyw5hzdEroqPQPWARIavy66EkvP7LMXBmExHdrNQgYcKPh7H6CMuItWERIaux5ugVTPrpKHhxDBFVRm+Q8NLyw/jt8GXRUagaWETIKmw4kYKJPx7mJbpEdEd6g4SXVxzhaRorwiJCFm/LqVS8+MMhlLKEEFEV6A0SXl5+BLsSrouOQlXAIkIWbfuZNDz3/QEU67liKhFVXbHegDFL9uN0So7oKHQXLCJksY5fzsKYJQdQVMoSQkTVl11YipEL9yKFL4Jp0VhEyCKlZBXimcX7kF+sFx2FiKxYclYhRi7ci5xCvhimpWIRIYuTX1yKpxftw9XsItFRiMgGnErJwZglB1DM0VWLxCJCFsVwY2Gik8nZoqMQkQ35J+E6Xl15BHx5NcvDIkIWZdraOGyMSxUdg4hs0G+Hr2Dm+tOiY9AtWETIYizZfQELdp4XHYOIbNj8bQn4blei6Bh0ExYRsgjb4tPw7uoTomMQkR2YuvoENp68KjoG3cAiQsLFX83BuKUHuWAZEdUKgwS8tOIwEq/liY5CYBEhwdJyivDUwn3IKSoVHYWI7EhOYSme+/4ACku4RIBoLCIkTKnegOe/P4DLmQWioxCRHTqVkoM3fj0mOobdYxEhYT788zT2X8gQHYOI7NgvBy9j6Z4LomPYNRYREmLL6VR89fc50TGIiPDu7ydxNClTdAy7xSJCtS4lqxCvrDgCritERJaguNSA578/iIy8YtFR7BKLCNUqvUHC+B8PIZ3f8ERkQS5nFmDC8sMw8Oq9WsciQrVq9sZ47D2fLjoGEVEFf8en4dNNZ0THsDssIlRrdp69hjlbzoqOQUR0W59vPoOtp/kyE7WJRYRqRVpOESb8eBgc9SQiS2aQgJdXHEFaDl/9u7awiJDZGQwSJi4/hGu5/MYmIsuXnleMN7m+SK1hESGzm7PlLHaevS46BhFRlf158ipWHbosOoZdYBEhszqWlIXZnPxFRFbondUnkJpdKDqGzWMRIbMp1Rvw2s9HoefEECKyQlkFJfjvLzxFY24sImQ2X/59DnHJ2aJjEBHds82nUrFi/yXRMWwaiwiZRUJaLq/HJyKb8P6ak0jO4otzmotMkrjQNtUsSZLwxJe7sC+RL2hXXaU515C5dREKzh2AVFoMpWcdePWaAJUuHABwYebDlT7OvfNTcGv9eKXvK067gKwdS1GUchb67FR4PDgKri37mmyTe2ILMrcthlRSCG3sQ/Do8vS/mbKu4uryKfAfMRtylaaGPlIi69KhvjeWPNNadAybpBQdgGzP97svsITcA31hLlK+fw3q4Fj4DpwKhbM7SjKSIVc5G7cJHLvE5DEF5/bj+rrPoIlof9v9SqVFULrroIloj4zN31Q8bn4W0td/Dq/eE6F01yF15btQBcdAU68lAOD6hrnw6DSSJYTs2vYz1/DDnosY3DpYdBSbwyJCNepKZgFmrj8tOoZVyt69EkpXb3j3mWi8T+nmZ7KNQuthcjv/7B6oQ2Lg4K677X5V/g2g8m8AAMjYtrjC+0szUyBTaeAc1REAoA6ORcm1i0C9lsg7uRUyhRKaiHb3+mER2Yzpa0+iQ31vBHmylNckzhGhGvXmr8eQW1QqOoZVKji7B466+khb9X+49PkQXFk4HjmH1992e31eBgoS9kEb+9B9HVfpGQCppAjFVxOgL8hBcXI8HH1CoS/IQeb2pfDs/tx97Z/IVuQV6/EGFzqrcRwRoRrz2+HL2HI6TXQMq1WSmYKSQ3/AtWU/+LV9AkXJ8cjY9BVkSgdoG3WtsH3u8U2QOzpB0+D+RisUai28+7yEa2v+B6m0GM6NHoRTWHNc+2M2XJo/jNKsq0j9+X3AUAq39oPhHPnAfR2PyJptP3MN644lo1eMv+goNoNFhGpEel4x3v39pOgY1k2SoNKFw6PTCACAo189lFy7iJxDf1ReRI5uhHN0Z8iUjvd9aE2DdiaFpvDiUZSkXYBn9+dw5avR8H7kVSicPZD83ctQBzWCwtn9vo9JZK2mrY1D5whfODkqREexCTw1QzVi+to4pOcVi45h1RRaDzh4m06Ec/AKgj674ihT4aXjKE1Pgrbx/Z2WqYxUWoL0P+fBs8dYlGYkQzLooQ6OgYNXIBw8A1CUzDlAZN8uZxbwlcRrEIsI3bcjlzLxy6Ek0TGsniogGiXpps9jSfplKF19K2ybe/QvOOrC4egbVuM5Mv/5Eeqw5mWXDEsGwKA3vk8ylAIGQ40fk8jafLX9HBKv5YmOYRNYROi+vbfmJLgazf1zbdkXRVdOI2vXCpRkXEHeya3IPbIe2mZ9TLYzFOUj//SO205SvbbmY2RsW2S8LelLUHz1HIqvngMMpdDnXkfx1XMoybhS4bHFaReQf+pvuD8wFACg9AwEZHLkHPkT+Qn7UHI9CY7+9WvugyayUsWlBrz7+wnRMWwC54jQfVl95AoOXOCaITVB5d8APo+9icxti5G5cxmUbn7weHAUtA27mGyXF/c3IAHO0Z0q3U9pdhog+/dvDH1uOpIXjTfezt77C7L3/gJVUCPoBn9gvF+SJKRv+AIeD46C3FENAJA7qODVeyLS/5oHSV8Cz+7PQeniXZMfNpHV2nI6Ddvi09CpgY/
oKFaNK6vSPSss0aPrx9twOZNLHxORfWrgp8W6CR2hkMtER7FaPDVD9+yb7edYQojIrsVfzcUPey6IjmHVWETonlzPLcL8bedExyAiEu6TjWeQVVAiOobVYhGhe/L55rNcQZWICGXrKH2xma82fq9YRKjaLl7Px1IORRIRGX236wJSswtFx7BKLCJUbR/+eRoles5xJiIqV1RqwLxtCaJjWCUWEaqWY0lZWHO04voTRET27oc9F5Gaw1GR6mIRoWqZvTGei5cREVWiqNSALzmJv9pYRKjKTqVkY/PpVNExiIgs1tI9F5CWUyQ6hlVhEaEqm7c1gaMhRER3UFhiwFd/c65IdbCIUJVcvJ6PNUeTRccgIrJ43+++iGu5HBWpKhYRqpIv/06A3sDhECKiuyko0ePrvzlXpKpYROiuUnMK8dOBpLtvSEREAIAluy/gOkdFqoRFhO7q2x3nUVxqEB2DiMhq5Bfr8fX286JjWAUWEbqjrIISLN19UXQMIiKrs2RXIjLyikXHsHgsInRHS3Yl8jVliIjuQV6xHsv3XxIdw+KxiNBtFZbosXBnougYRERW6/vdF2DgRP87YhGh2/px70Vc57AiEdE9S8oowBYuBHlHLCJ0W4t38RV2iYju13f8WXpHLCJUqV0J13H+Wp7oGEREVu/vM2m4cJ0/T2+HRYQq9eM+XilDRFQTJKlsrghVjkWEKsjML8a64ymiYxAR2YyfDiShsEQvOoZFYhGhCn45eJkLmBER1aDM/BKsPnJFdAyLxCJCFfC0DBFRzVvCSauVYhEhEwcuZCD+aq7oGERENufY5SwcupghOobFYREhEz/u5WgIEZG5cFSkIhYRMsopLMGao8miYxAR2aw/jicjjy+bYYJFhIx+O3wFBZzVTURkNoUlBmyMuyo6hkVhESEjTlIlIjK/349w5PlmLCIEADiVko3jl7NFxyAisnl/x6chu7BEdAyLwSJCAIB1x7iAGRFRbSjWG7CBi0YasYgQAGDDCX5TEBHVFl4Y8C8WEULitTycSskRHYOIyG7sPHsNGXnFomNYBBYRwnqOhhAR1apSg8TX9LqBRYSwnt8MRES1bs1RvvYMwCJi91KyCnEkKVN0DCIiu7P73HWk5hSKjiEci4id23AiBZIkOgURkf0xSLxiEWARsXs8LUNEJM7aY7x6hkXEjmXkFWNvYrroGEREduvQxQy7f+0ZFhE79tfJq9AbeF6GiEiUEr2E3eeui44hFIuIHeMiZkRE4m0/c010BKFYROxUid6AXXbewomILMGOsywiZIeOJmUiv1gvOgYRkd07m5qLlCz7vYyXRcRO7T7HSapERJZi+5k00RGEYRGxU/Y+OYqIyJLY8+kZFhE7VKo34MCFDNExiIjohp1nr0Gy09UlWUTs0JGkLM4PISKyINdyi3EyOVt0DCFYROwQT8sQEVmeHXZ6GS+LiB1iESEisjz2Ok+ERcTOlOoNOMj5IUREFmd/YoZdrnbNImJnjl7OQh7nhxARWZyCEj3OpuaKjlHrWETsDE/LEBFZrqNJmaIj1DoWETuz9zwXMiMislTHL2eJjlDrWETsjD1+kRMRWYtjdvgzmkXEjqTlFOFabrHoGEREdBsnk7PtbsIqi4gdOZVin4vlEBFZi8ISA86k5oiOUatYROzIqWT7+uImIrJGx5Ls6/QMi4gdieOICBGRxbO3eSIsInYkjiMiREQWj0WEbFKp3oAEO1woh4jI2sTZ2YRVFhE7kZCWh2K9QXQMIiK6i8ISA+Kv2s8INouIneAVM0RE1oNFhGwO54cQEVmPi9fzRUeoNSwidiIumSMiRETW4mI6iwjZGHsa5iMisnYsImRTSvQGXM0uFB2DiIiq6BKLCNmSlKxC2NGVYEREVi8luxDFpfZxpSOLiB1IzuJoCBGRNTFIQFKGfYyKsIjYgeSsAtERiIiomuxlngiLiB24kskRESIia2Mv80RYROwAR0SIiKwPR0TIZnBEhIjI+rCIkM3giAgRkfW5mG4fP7tZROwAr5ohIrI+9rL+E4uIjSss0SM9r1h0DCIiqqbsghLREWoFi4iN42gIEZF1KjVIyC0qFR3D7FhEbBznhxARWa8sOxgVYRGxcdkFtt+miYhsVVY+iwhZufxiFhEiImvFERGyennFetERiIjoHrGIkNXLt4OJTkREtsoerpxhEbFxHBEhIrJeHBEhq8cRESIi68UiQlaPIyJERNaLRYSsHq+aISKyXiwiZPXyijgiQkRkrfLs4PQ6i4iN44gIEZH10kuS6AhmxyJi4zhHhIjIeukNLCJk5Qo4IkJEZLUMHBEha1dqB22aiMhWcUSErJ5cJhMdgYiI7pHBIDqB+SlFByDzkrOHkA2r46bG813CRccgMhtfF5XoCGbHImLjOCJCtuzlhyIwoHmg6BhEdB94asbGyVhEyEZF6lzQv2mA6BhEdJ84ImLjFKyaZKMm94qEXC4rO4kul2PWvlko0dv+KpRkn1xVrnix6YuiY5gFi4iNU8rZRMj2tA3zQpcIX+B6AiBXIsVRhSUnl4iORWQ2OmedzRYR/paycSolP8VkW2Qy4PXekWU3tv8PcAvCqfRTYkMRmZlCphAdwWz4W8rGObKIkI3pE+OP2EB3IOkAcP0sIJeziJDNk8F25/vxt5SNUyltt0WT/XFQyPBqj4iyG3+9DegaAQBOp58WmIrI/OQy2/11bbsfGQEAVA78FJPtGNI6BCFezkD8BuDCDkAXAwAcESGbp5Db7h+V/C1l4zhHhGyFi0qJ8V3rl10ls3Fq2Z26GOQW5+Jy7mWh2YjMTaPUiI5gNvwtZeNcVLwwimzDmE5h8HR2BI78AKSeBGQKwLchTqWfggTbfz0Osm9aR63oCGbDImLjPJ1tf3lgsn1+rio880AYUFIAbJlRdqd3A8BBjdMZnB9Ctk/rwCJCVspL6yg6AtF9m9itAZwcFcDueUD2jdMwnB9CdsTZwVl0BLNhEbFx3iwiZOXCfbV4okUQkJ8O7Jj97ztuFBFeMUP2gCMiZLW8tDw1Q9bttR4RUMhlwN8fAUVZ/75DF4MSQwnOZp4VF46olnBEhKyWlzNHRMh6tQz1wEMNdUDGBWDf16bv1MXiXOY5lBj4+jJk+zhZlawWR0TImv23V1TZfzZPA/TF/77DpQ7g7MX5IWQ3eGqGrJabkwMc+RK8ZIV6NtSheYgHkHwEOPaT6Tv9YwFwoirZD56aIavmydMzZGWUchle61m+lPs7wK3rhJRPVOWlu2QnXBxdREcwGxYRO8BLeMnaDGoZhDAfLXB2E3BuS8UNeMUM2RlvJ2/REcyGRcQOcESErInGUYEJ3eoDkgRsfKfyjXQxuJx7GdnF2bUbjkgQnbNOdASzYRGxA96csEpW5NkOYfB1UQNHVwApxypuoHIFPOpyfgjZDUe5IzzVnqJjmA2LiB3QualFRyCqEm+tI8Z0DANKi8qulKmMXyNAJuNpGbIbvhpf0RHMikXEDtT1tt3Z1mRbxnetD2eVEtj7NZB1sfKNuLQ72RlbPi0DsIjYhXo+LCJk+ep6O2
Nwq2CgIBPY/tHtN+REVbIzfs5+oiOYFYuIHQjztt2FcMh2vNojAkqFHNjxCVCQcfsNdTHIKsrClbwrtReOSCA/DYsIWTkPZ0d4aBxExyC6rSZB7ugd4w9kXQb2zL/9hnIHwDeKoyFkV1hEyCaE+XBUhCzX670iy/6zZTpQWnj7DX0iAKWK80PIrnCOCNmEME5YJQvVNdIXrcO8gKsngCPL7rwxV1QlO8QiQjaBIyJkiRRyGSaXj4ZsnApIhjs/gFfMkB0KdQ0VHcGsWETsRBivnCEL9HizADTwcwHObwfO/Hn3B+hiUKwvxrmsc+YPR2QB/J39oXHQiI5hViwidoKX8JKlUTvI8XL3iLKl3P96u2oP0sXgbOZZlBpKzRuOyELUc68nOoLZsYjYiWBPZyjkMtExiIyebl+3bNXfE78CVw7e/QFuwYCTB6+YIbtSz41FhGyEo1KOIA8n0TGIAAAeGgc817keoC8BNr1XtQdxfgjZIY6IkE1p4OciOgIRAGDcg/XhqnYA9i8AMs5X7UEsImSHWETIpjQN9hAdgQhBnk4Y1iYEKMoBts2q+gN1MZAkCfEZ8eYLR2RhWETIpjQNdhcdgQiTHoqAo1IO7PwUyL9W9QfqYpCUk4TcklzzhSOyIP7O/nB2sP0LDVhE7EjjQHcoOWGVBGoU4IpHG9cBclKAXXOq/kC1O+ARglMZPC1D9iPMPUx0hFrBImJHnBwViNBxngiJ83qvKMhkMmDr/wEl+VV/IOeHkB0KdwsXHaFWsIjYmWacJ0KCdGzgg/bh3kBaPHBwSfUeXL60Oy/dJTvSyKeR6Ai1gkXEznCeCIkglwH/7XljKfdN7wKSvno7uFFE4tLjajgZkeVq4tNEdIRawSJiZzgiQiL0axKA6DquwMXdwKk11d+BLgYZhRlIzU+t+XBEFshX42vzL3ZXjkXEzoR6O8PT2VF0DLIjjko5Xn6oQdmNP6dUfwcKR8AnkvNDyK409mksOkKtYRGxQ02D3EVHIDsyom0IAj00QNzvQNLe6u/AJxJQOHB+CNkVFhGyaZwnQrXFVa3E2C7hgL4U2Pjuve1EFwsAvHSX7AqLCNk0zhOh2vJCl3C4axyBQ98B18/c207KL929ziJC9sFB7oBor2jRMWoNi4gdahrsUbayJZEZ1XFTY2S7UKA4D9j6wb3vSBeDIn0RErMTayoakUWL8oqCo8J+5vLxt5EdcnJUoHVdT9ExyMa9/FAE1A6KshVUc6/e415kgC4GZzLOQF/dS36JrJQ9nZYBWETsVpcIX9ERyIZF6lzQv2kAkHcN2PnZve/IIwRQu/KKGbIr9rJ+SDkWETv1YCSLCJnP5F6RkMtlZadkinPufUdc2p3sjEKmQGv/1qJj1CoWETsV6u2Mut62/6qOVPvahnmVjbhdTwAOLLq/nZVfMcMiQnaikXcjuKncRMeoVSwidqxzhI/oCGRjZDLg9d7lS7m/BxhK7m+HuhgYJAPOZNzjFTdEVqZ9QHvREWodi4gd4+kZqml9YvwRG+gOJB0ATq66/x3qYnEx+yLyS6vxSr1EVuyBOg+IjlDrWETsWOu6XtA4KkTHIBvhqJDjtR43RkP+evv+d6jxAtwCuJAZ2Q0PlQcaejcUHaPWsYjYMUelHO3qeYuOQTZicOtgBHtpgPgNwIUd979Dv7KXQOfS7mQv2tRpA7nM/n4t299HTCZ4eoZqgotKifFd6wMGA7Bxas3s9MYVM3HpcTWzPyIL90CA/Z2WAVhE7F6XSE5Ypfs3plNY2as6H14KpJ6smZ3euGKGIyJkD2SQoV2ddqJjCMEiYuf83ZwQqXMRHYOsmJ+rCs88EAaUFABbZtTcjv1jca3gGq4VXKu5fRJZqEjPSHg72eepchYRwiON64iOQFZsYrcGcHJUALvnATlXamanSjXg3YCjIWQ3OgZ2FB1BGBYRQr+mAZDJRKcgaxTuq8UTLYKA/HRgx+ya27FvFCBXcCEzshs9QnuIjiAMiwghwN2JL4JH9+S1HhFQyGXA3x8BRVk1t2Mu7U52JNw9HPU96ouOIQyLCAEA+jcNFB2BrEzLUA881FAHZFwA9n1dszvn0u5kR+x5NARgEaEbesXooHbglwNV3X97RZX9Z/M0QF9cszvXxaKgtAAXcy7W7H6JLFCvur1ERxCKv3kIAOCidkC3KD/RMchK9GyoQ/MQDyD5CHDsp5rduUwO+DVEfEY8DJKhZvdNZGGiPKMQ4hoiOoZQLCJk1L9ZgOgIZAWUchle6xlRduOvtwFINXsAj7qASotT13lahmxfz7o9RUcQjkWEjDrW94G31lF0DLJwg1oGIcxHC5zdBJzbWvMHKJ+oyteYITvQM5RFhEWEjJQKOR6O5ZoidHsaRwUmdKsPSBKw8R3zHORGEeEaImTrYn1iUUfLn7ksImSCp2foTp7tEAZfFzVwdAWQcsw8B/FvDL1BjzMZZ8yzfyILwdGQMiwiZCI20B31fJxFxyAL5K11xJiOYUBpUdmVMuaii8GF7Aso1Bea7xhEgjnIHdAnrI/oGBaBRYQq+E/LYNERyAJN6FofziolsPdrIMtMl9U6+wIuOr7iLtm8rsFd4anmQpIAiwhV4omWQXB2VIiOQRakrrcznmwVDBRkAts/Mt+BdI0AcH4I2b6BDQaKjmAxWESoAjcnBwxozpVW6V+v9oiAUiEHdvwPKMgw34G4tDvZgVDXULTybyU6hsVgEaFKPdW+Ll8IjwAATYLc0TvGH8hKAvZ8ad6D3Vja/XQGR0TIdg1oMEB0BIvCIkKVCvV2RtdIX9ExyAK83iuy7D9bZgClZp5AqotFan4q0gvTzXscIkFUChX6hfcTHcOisIjQbT3dvq7oCCRY10hftA7zAq6eAI4sM+/BHDSAVzhPy5BN6x7SHW4qN9ExLAqLCN1Wu3BvRPm7io5BgijkMkwuHw3ZOBUw9+u++EYDcjmLCNk0TlKtiEWE7uip9qGiI5AgjzcLQAM/F+D8duDMn+Y/ICeqko0Ldw9HM79momNYHBYRuqO+TerAW6sSHYNqmdpBjpe7R5Qt5f7X27VzUC7tTjZuUMQg0REsEosI3ZFKqcCQ1lzgzN483b4udG5q4MQvwJWDtXNQ/8bIK8nDpZxLtXM8olrkofJA3/C+omNYJBYRuquhbULgqOSXir3w0Djguc71AH0JsOn92jmoTAH4NcTp9NOQINXOMYlq0X8i/wMnpZPoGBaJv13ornxcVHicL4ZnN8Y9WB+uagdg/wIg43ztHNSrHuDgxPkhZJPUCjWejHxSdAyLxSJCVfLig/U5KmIHgjydMKxNCFCUA2ybVXsHLp8fwoXMyAb1De8LD7WH6BgWi79ZqErquDtxrogdmPRQRFnh3PkpkH+t9g7MK2bIRillSoxsOFJ0DIumFB2ArMfYLuFYvu8S8ov1oqOQGTQKcMWjjesAOSnArjm1e3BdLEoNpTibcbZ2j2uB0takIftANoqSiyBzkEETroHuCR1U/v9evaYv1OPqT1eRfTAb+lw9HL0d4dndE14Pet1x31n7spD6ayqKU4vh6
OsIv8f94Nr837WCMv/JRMrKFEhFEjw6eED3H53xfcVpxUj8KBH1ptaDwokvillVvcN6I9CFr911JxwRoSrz1qowsl2o6BhkJq/3ioJMJgO2/h9Qkl+7B9fF4lzWORQbimv3uBYo71QePB/0RNiUMIS+GgoYgMSPEmEo+ndBuZQfUpB7LBeBowNRf0Z9ePXwQvL3ycg+mH3b/eafzceleZfg3s4d4e+Fw72dOy7OvYj8hLLPdWlOKS4vvAz/Qf4IeSUEGTszkHM4x/j4K99dgd9AP5aQapDL5Hgm5hnRMSweiwhVy5hO9eCq5kCarenYwAftw72BtHjg4JLaPbhWB2h9uH7IDaGTQuHRwQPqADWcgp0Q8EwASq6XoCCxwLhNfkI+3Nu7QxulhaOPIzw7e0IdpEbB+YLb7vfan9egbaiFz8M+UNVRwedhH2ijtLj+53UAZSMeCicF3Fq7QROmgXOUMwqvlL22UOauTMiUMri14NLk1dE1uCvC3MJEx7B4LCJULW5ODhjdkd9YtkQuA/7b8+al3Gv51Bvnh9yRvqDs86Fw/nckQlNfg5zDOSjJKIEkSciNy0Xx1WJoY7S33U/B2QJoG5m+XxujRf7ZshERlZ8KhmIDCi4UoDS3FAXnC6AOUqM0txSpv6bCf6i/GT462yWDDKNjR4uOYRX4py1V21Pt62LRP4m4lsthdFvQr0kAouu4Ahd3A6fX1n4Arqh6W5IkIWVZCjQNNFAHqo33+w/1x5WFV3D6pdOAApDJZKjzVB04N3C+7b5Ks0qhdDX9ka90VaI0qxRAWdEJHBWIpK+TIBVLcG/nDpcYFyR9mwTPbp4ouVaCi59ehKSX4NvPF24tOTpyJz1DeyLSM1J0DKvAIkLV5qxS4vnO4Xh/zUnRUeg+OSrlePmhBmU3/pwiJoR/LADgVAZHRG6VvCQZhZcKEfam6Shk+l/pyE/IR/CEYDh6OyLvdB6SlyTDwd0B2oa3HxWB7Jbbt6wd59rc1WTyam5cLoqSilBnaB3ET45H0HNBULopkfBeApwjnCsUGyqjlCvxYtMXRcewGjw1Q/dkaJtg+Lup774hWbQRbUMQ6KEB4n4HkvaKCaGLRXJuMrKKssQc30JdWXIF2YezUfe/deHg6WC831BswNWVV+H/H3+4NnWFOkgNr25ecGvlhmvrbn/JtdLt39GPcqU5pVC6VV4mDCUGJC9JRp0RdVCcWgxJL8E50hkqfxVUOpVxkitVNKD+AAS5BomOYTVYROieqJQKvPhgfdEx6D64qpUY2yUc0JcCG98VE8JRC3iGcX7ITSRJKishB7JR97W6cPRxNH2/XoKklyr+9JaXPfZ2nMKdkHsi1+S+3OO50IRrKt0+bXUatDFaOIU6QTJIwL8X7UAqNb1N/9IoNXiu8XOiY1gVFhG6Z0+0CEQDvzsMA5NFe6FLONw1jsCh74DrZ8SE8GsIyGQ8LXOT5CXJyPwnE0HPBUGulqMkswQlmSUwFJf95lc4KaCJ0CBleUrZJNW0YmRsz0DmzkyT0ypJXyUh5acU423v7t7IPZ6LtLVpKLpShLS1acg9mQuvhyquPVJ4uRBZe7Pg198PAMrWMJEB6dvSkXM4B0XJRXAK4+umVGZEwxHwcrrzei5kiif46J4pFXJMfaQhBn+zR3QUqqY6buqyNWGK84CtH4gLwomqFaRvTgcAnP/A9HV+Ap4JgEeHsmXCg54PwtWVV5H0ZRL0eXo4eDnA73E/eHbxNG5ffL3YZE6Ipr6m7HE/X0XqL6lw9HVE0PNB0NQzHRGRJAlXFl6B7kkd5Kqyv1XljnIEPBuA5CXJkEok+A/zh4OHA8iUp9qTq6jeA5l0p7E8oioYu/Qg1h5LFh2DquGjgY0xoHkgsHUmsHWGuCCPfAY0H4GeP/fE5dzL4nIQ1YDXW72OwVGDRcewOjw1Q/ftzT5RcHLgaovWIlLngv5NA4DcNOCfz8SG0cUgpziHJYSsXpBLEAZGDBQdwyqxiNB9q+PuhLFd6omOQVU0uVck5HIZsG0mUJx79weYi1wJ+EZzoirZhPHNxsNBztNV94JFhGrEqI5hqOt9+8WUyDK0DfNClwhf4HoCcGCR2DBe9QEHNeeHkNVr498GPUN7io5htVhEqEaolApM69dIdAy6A5kMeL33jZUeN70HGErEBuLS7mQDHOWOeLP1m6JjWDUWEaox7cO98VjTANEx6Db6xPgjNtAdSDoAnFwlOo5xRdXTGRwRIev1VKOnEOoWKjqGVWMRoRr1Vp8ouGt4ntTSOCrkeK3HjdGQv94WG6acLgYl+hKczTwrOgnRPQnUBmJU7CjRMaweiwjVKC+t6t9XciWLMbh1MIK9NED8BuDCDtFxyuhikZCVgFJD6d23JbJAb7R+AyqFSnQMq8ciQjVuUMsgtKrrefcNqVa4qJQY37U+YNADf70jOk4Z1wBA48n5IWS1uod0R4fADqJj2AQWEapxMpkMHw9sDBcVF+61BGM6hcHT2RE4/AOQFic6ThmuqEpWTKPU4LWWr4mOYTNYRMgsgjw1eOfRhqJj2D0/VxWeeSAMKCkAtszA1K2FkL2bbfKm+yjnjvvYlliK5l/lQj0tG2Gf5mD+/mKT9/+VUIoGn+fC7YNsjFhVgGL9v4s1ZxVKaPB5Li5m3fIKabqyiaocESFr9EKTF6Bz1omOYTP4JyuZzYDmgdh86ir+OJZy943JLCZ2awAnRwWwfR6QcwUA0NBHjo3D/319EYXsdo8GzmcY0PuHfIxq5ojvH3PAzkt6vLC2ED4aGR6PdoBBkjDklwL89wFH9KinxICfCvD1gRKMbVX2irGTNxbiuRYOCHa75W8ejoiQlWro1RBDooaIjmFTWETIrGY8FoMDFzJwNbtIdBS7E+6rxRMtgoD8dGDHbOP9Sjmg01ZtMHT+/mIEu8kxu6caABDlo8D+K3p8tKsYj0c74Fq+hLR8CS+0dIRaKcOjDZQ4maYHAOy8WIr9V/SY01tdcce6GCTlJCGn5M6jMUSWRK1QY0aHGVDK+auzJvHUDJmVu8YRHw1sDNkd/uom83itRwQUchnw90dAUZbx/jPpBtT5OAd1P83Bf1bm41yG4bb72JWkx0Nhpj90e9RTYv8VPUr0Enw0MvhrZfgzoRQFJRK2X9Qj1k+BYr2E59cWYv7DTmUZbqZyAzxCORpCVmdCswkIcwsTHcPmsIiQ2XWo71P2kvNUa1qGeuChhjog4wKw72vj/a0DFPiunxM2DNXg60eckJIrod23ebieX3kZScmV4Kc1LRJ+WhlKDcC1fAkymQwrBjrh/b+LED03F011cjzd1AEf7ChG17pKOCmB9gvyEPFFLr7Ye2NuiV9DQCbDqQzODyHr0dq/NU/JmAnHl6hWTO4ZiX/OXsfpqxyKrw3/7RVV9p/N7wP6fyeX9qr/72JzMQDaBipQ77NcLD5SgpfbVr4ewq2DWdKNuajlo1wPBCuxb5TW+P7463osOVqCQ2Oc0XFhHia2cUTPcCUazc1DxxAFYltzoipZFxcHF0xrPw0yDu2aBUdEqFaoHRSY/Z8m
cFTyS87cejbUoXmIB3DlMHBs5R23dXaUIcZPjjPXKx8R0WllSMmVTO5LzZOglANeThV/KEuShNG/F+Ljh1QwSMChFAMGRDvA11mOTqEKbEvU8zVmyOq83vp1XiVjRvytQLUmyt8Vkx5qIDqGTVPKZXitZ0TZjY3vAJDuuH1RqYS4NAP8XSr/UdA2UIG/zpmufPpnQila1FHAoZLLbb49VAIvjQyPRjhAf6PblOj//VcvSYAuBllFWUjJ49VUZPm6h3THI/UeER3DprGIUK169oEwPBDuLTqGzRrUMghhPlrg7Cbg3NYK75/0ZyG2JZbifIYBe5JKMeCnAmQXSRjRuOyUzesbCzH81wLj9s+1cMSFLANe3lCIuDQ9FhwqxreHSjCprWOFfafmGTDt7yJ8duMKGw8nGaK85Zi9uxi7LpVi0/lStAtWAT5RHA0hq+Dt5I2321jIazPZMBYRqlVyuQyfPdkUgR5OoqPYHI2jAhO61S+bxLGx8qXck7INePLnAkR8kYv+KwrgqAB2P+uMEPeyHwXJuZLJ4mN1PeT4Y7AGWxNL0eTLPLz/dxE+66XG49EVX9hwwvpCTGqnQoDrvz9WFvVzwo8nSvDwsgK82k6FVk0bAUpHFhGyeDLI8F679+CudhcdxebJJEm689gtkRmcuJKFAfN2oaB83J7u2/iu9fFy9wbAkeXAr6NFx6lc48HAY/PwxvY38Pu530WnIbqtUTGjML7ZeNEx7AJHREiIhnXcMHNArOgYNsNb64gxHcOA0iJg8zTRcW7vxkTVuHQLec0bokq08W+DcU3HiY5hN1hESJhHG9cp++VJ921C1/pwVimBvV8BWRdFx7k9XQyK9cVIzEoUnYSoUn4aP8zsOBNyGX891hY+0yTU5J6R6NjAR3QMq1bX2xlPtgoGCjKB7R+LjnNnuhicyTyDUqn07tsS1TIHuQM+7vwxPNWeoqPYFRYREkoul+Hz/zRFqJfm7htTpV7tEQGlQg7s+B9QkCE6zu25BQNO7lzanSzWpBaT0NinsegYdodFhIRz0zjgq+Et4OyoEB3F6jQJckfvGH8gKwnY86XoOHfmzxVVyXL1rtsbg6MGi45hl1hEyCI08HPBx0804YvjVdPrvSLL/rNlBlBaKDbM3XBFVbJQ4e7heKdt5Ze8k/mxiJDF6NlIh3FdwkXHsBpdI33ROswLuHoCOLJMdJy708VAkiTEZ8SLTkJk5OLggv91/h80Djw9LAqLCFmUl7s3QL8mdUTHsHgKuQyTy0dDNk4FpMpfK8ai6GJwKecS8kryRCchAgAo5Up80uUT1HWrKzqKXWMRIYsik8nw4cDG6BzBK2nuZECzQDTwcwHObwfO/Ck6zt2p3QH3YJ6WIYsyte1UtPZvLTqG3WMRIYvjoJBj3pDmZa8gSxWoHeR4qXuDsqXc/7KS18Hg/BCyMGNix6BveF/RMQgsImShnBwVWDCiJSL8XERHsThPt68LnZsaOPELcOWg6DhVo+MVM2Q5Hg57mCunWhAWEbJYbhoHfPdMKwR58gXyynloHPBc53qAvgTY9L7oOFV3Y0SEa4iQaC38WuC9du+JjkE3YREhi+bnqsaSp1vDW6sSHcUijHuwPlzVDsD+BUDGedFxqk4Xg/TCdKQWpIpOQnasrltdzO4yGw6Kiq8eTeKwiJDFC/V2xuKnW8JFrRQdRaggTycMaxMCFOUA22aJjlN1ChXgE8HTMiSUp9oTc7vOhZvKTXQUugWLCFmFhnXc8M3wFlAp7fdLdtJDEXBUyoGdnwL510THqTrfSEDhwNMyJIzWQYu5Xeci0CVQdBSqhP3+VCer0zrMC18MbgaF3P6WX20U4IpHG9cBclKAXXNEx6meG/ND4tLjBAche6RRajCv2zw09G4oOgrdBosIWZXu0X744smmcFTY15fu672iIJPJypZyL8kXHad6blwxwxERqm1qhRpfdP0CTXybiI5Cd2BfP83JJvSK8ceXw5vbzWmajg180D7cG0g7DRz6XnSc6tPFoLC0EBeyL4hOQnbEUe6IT7t8ipa6lqKj0F3Yx09ysjldInyx6KlWNv+KvXIZ8N+e5Uu5vwtIerGBqk0G+DXCmYwz0FtddrJWSrkSH3f+GO0C2omOQlXAIkJWq209Lyx5tjVcbfhqmn5NAhBdxxW4uBs4vVZ0nOrzCAXUrjiVwStmqHYoZArM7DATnYM6i45CVcQiQlatWbAHlo1uAy9nR9FRapyjUo6XH2pQduPPKWLD3Kvypd2vs4iQ+cllckx7YBoeCn1IdBSqBhYRsnoN67hh+Zi20LmqRUepUSPahiDQQwPE/Q4k7RUd596UL+3OEREyM4VMgXfbvYuHwx4WHYWqiUWEbEK4rxY/PdfWZpaDd3NywLgu9QF9adncEGuli4FBMuBMxhnRSciGOcgd8GGnD9EvvJ/oKHQPWETIZgR5arBiTFuE+TiLjnLfXuhcD24aB+DQd8B1K/4lrovBhewLKCgtEJ2EbJRGqcGcrnPQPaS76Ch0j1hEyKb4uzlhxZi2aBLkLjrKPavjpsaIdqFAcR6w9QPRce6dxgtwC+D6IWQ2bio3fPPQN2hbp63oKHQfWETI5nhrVfhxdJuylUit0MsPRUDtoAD++QLIvSo6zr3jiqpkRr4aXyzuuRgxPjGio9B9YhEhm6R2UOCzJ5vile4NILOiFeEjdS7o3zQAyE0D/vlMdJz7c6OIcESEalqwSzC+6/Ud6rnXEx2FagCLCNm0F7vWx9zBzeDkYB0Ln03uFQm5XAZsmwkU54qOc3/Kr5jhq+5SDYr0jMTiXosRoA0QHYVqCIsI2bxeMf746TnLv7y3bZgXukT4AtcTgAOLRMe5f7oYXCu4huuF10UnIRvRStcKC3osgLeTt+goVINYRMguNApww+px7dE40E10lErJZMDrvW8s5b7pPcBQIjbQ/VKqAe8GHA2hGvN4/ccxv/t8uDi6iI5CNYxFhOyGr6say8e0xcOx/qKjVNAnxh+xge5A0gHg5CrRce6fbzQgV7CI0H2Ty+SY1GISprabCge5g+g4ZAYsImRX1A4KfDG4GSZ2q28xk1gdFXK81uPGaMhfb4sNU1PKl3ZnEaH7oFFqMLvzbIxoOEJ0FDIjFhGySxO7NcBXw1rAQyP+L6zBrYMR7KUBTq8HLuwQHadm8IoZuk+B2kB83/t7dAnuIjoKmRmLCNmt7tF+WDehI1rX9RSWwUWlxPiu9QGDHtg4VViOGqeLRX5JPi7mXBSdhKxQa//W+PHhH1Hfo77oKFQLWETIrunc1Fg2qg1e7t4ACnntn6sZ0ykMns6OwOEfgDQbWfhLJgf8GiI+Ix4GySA6DVmZIVFDML/bfLipLHNiOdU8FhGye3K5DOO71sfy0W0Q4F57L5rn56rCMw+EASUFwJYZtXZcs/MMA1Razg+hatE6aPFhpw/x31b/hVKuFB2HahGLCNENLUI98cf4DujZUFcrx5vYrQGcHBXA7nlAzpVaOWat4ERVqqaGXg2x4pEV6BnaU3QUEoBFhOgmbhoHzB/WHNMfawS1g/m
+PcJ9tXiiRRCQnw7smG224wjBiapUDcOih2FJryUIcgkSHYUE4fgXUSWGtA5BixBPvLjsIOKv1vxS66/1iCibk/L3h0BRVo3vXyhdLPQGPc5mnhWdhCyYu8od09pPQ6egTqKjkGAcESG6jQidC1aPewCjOtSt0YmsLUM98FBDHZCRCOz7psb2azF0MTifdR6F+kLRSchCNfNthp8e+cnuS8jIkSPRr18/0TGEYxEhugO1gwJv9onGb2Pbo1GAa43s87+9osr+s3kaoC+ukX1aDGdfwEWHUxmcH0IVyWVyjIoZhQU9FkDnbL65WCNHjoRMJqvwdvZs7YzSJSYmVnr8m9+mTp2KTz/9FIsWLaqVTJaMp2aIqqBRgBt+G/sAFu48j//9FY/8Yv097adnQx2ah3gAVw4Dx1bWbEhLwPkhdBuhrqF4v/37aOLbpFaO17NnTyxcuNDkPh8fnwrbFRcXw9HRsUaPHRQUhOTkZOPtjz76COvXr8fGjRuN92m1Wmi12ho9rrXiiAhRFSnkMjzbIQx/vtQRnSMq/kC7G6Vchtd6RpTd2PgOAKlmA1oCXjFDt1DIFHiq0VNY+ejKWishAKBSqaDT6UzeFAoFOnfujHHjxuHll1+Gt7c3unfvDgA4efIkevfuDa1WCz8/PwwbNgzXrl0z7k+SJMyaNQthYWFwcnJC48aNsXJl5X9MKBQKk+NqtVoolcoK9916aqZz58548cUXMXHiRHh4eMDPzw9fffUV8vLy8NRTT8HFxQX16tXDunXrTI53t+yWjkWEqJoCPTRY9FQrfPZkU3hrVVV+3KCWQQjz0QJnNwHntpovoEgcEaGbhLuH4/ve3+Pl5i9Dpaj694q5LV68GEqlEjt37sSXX36J5ORkdOrUCU2aNMH+/fuxfv16XL16FU888YTxMW+99RYWLlyIefPm4cSJE3jppZcwdOhQbNu2rcazeXt7Y+/evXjxxRfx/PPPY+DAgWjXrh0OHjyIHj16YNiwYcjPzweAKmW3dDJJkmzwzzKi2pGVX4IZf8RhxYFLuNN3ksZRga2vdoavVgV82QFIOVZ7IWvT2H1I0bii+8ruopOQQEqZEk/HPI3nYp+Dg6L2X89p5MiR+P7776FWq4339erVCz/99BM6d+6MrKwsHDp0yPi+t99+G3v27MGGDRuM9yUlJSEoKAinT59GQEAAvL29sXnzZrRt29a4zbPPPov8/Hz88MMPd8wzdepUrFq1CocPH66QMzMzE6tWrQJQNiKi1+uxfft2AIBer4ebmxv69++P7777DgCQkpICf39/7Nq1C23atLlr9gYNGlTvyROAc0SI7oObxgEzB8TisWYBePPXY0hIy6t0u2c7hMHXRQ0cWW67JcTBGfAKx+nL20UnIYEiPSPxXrv3EOUVJTRHly5dMG/ePONtZ2dn4/9btGhhsu2BAwewZcuWSudsJCQkICsrC4WFhcbTOOWKi4vRtGnTGs0dGxtr/L9CoYCXlxdiYmKM9/n5+QEAUlNTq5SdRYTITrQJ88KGiR2xdM9FfLrpDNLz/r0axlurwpiOYUBpUdmVMrbKLxqQyzk/xE45KZ3wbMyzeKrRU3CQi39Va2dnZ4SHh9/2fTczGAx45JFHMHPmzArb+vv74/jx4wCAtWvXIiAgwOT9KlXNnnJycDB97mQymcl9MpnMmLkq2a0BiwhRDVEq5BjRLhSPNQvAnM1nsfCfRBSXGjChazicVUrgn3lAlg2/Gm35/JAMzg+xNz1De+KVFq+Y9ZJcc2rWrBl+/vlnhIaGQqms+GsxOjoaKpUKFy9eRKdOlrX2yd2yWwNOViWqYa5qB7zeOwqbXu6Ep9qH4slWwUBBJrD9Y9HRzItXzNidSM9ILOyxEB92+tBqSwgAjB07Funp6XjyySexd+9enDt3Dn/++Seefvpp6PV6uLi4YNKkSXjppZewePFiJCQk4NChQ5gzZw4WL15s0dmtgXXWJyIrEOSpwTuPNCy7seN/QEGG2EDmpotFbnEuknKSRCchM/NQeWBc03EY0GAA5DLr/3u2Tp062LlzJyZPnowePXqgqKgIISEh6NmzJ+Tyso/v/fffh6+vL/7v//4P586dg7u7O5o1a4Y33njD4rNbOl41Q2RuWUnA582BUhte8lymAN64jAPpJzFy/UjRachMlDIlnoh4Ai80eQFuKjfRcchGcESEyNy2zLDtEgIAXuGAgxNPy9iw9nXaY1KLSQj3qHwCKNG9YhEhMrfwbsDFXUD6OdFJzIcLmdmsVrpWGNd0HJr61uxlqkTlWESIzK1RfyDqUeDQEmDbLCDniuhENY8TVW1OM99mGNtkLFr5txIdhWwciwhRbVAogRZPAY2fBPZ+Bez4BChIF52q5uhiUGIoQUJmgugkdJ9ivGMwrsk4tAtoJzoK2QlOViUSoTAb2D2vrJTkW8+LU93Wqwk4XXQdA34fIDoJ3aMozyiMbTIWnYIsa50Msn0cESESQe0KdJ4MtJ8AHP0R2DUXuGal8ytc/AFnb5xO+Ud0EroHLfxaYHj0cHQO6mxctZOoNrGIEInkoAaajwSajQDObgR2fWF9r8zL+SFWRylXokdoDwyPHo5or2jRccjOsYgQWQKZDKjfvewt5Tiwaw5wfCWgL777Y0XjFTNWw9XRFQMbDMSTkU/Cz9lPdBwiAJwjQmS5clLK5pDsX2DZq7IOXAQ0fAztl7VHdnG26DRUiRDXEAyJGoK+9fpC46ARHYfIBIsIkaUrzgeO/FA2jyTdAq9KefEgrqic0OPnHqKT0E2UMiU6BHZA//r90TGwo00sxU62iadmiCydowZo+SzQ4hkgfkPZeiRn/rSM0zaOLoBnGE5d2iI6Cd0Q7h6OfuH90CesD7ydvEXHIborFhEiayGTARE9y97y04FjK4Ejy4ArB8Vl8msIyGScHyKYi6MLeoX2wmP1H0Mj70ai4xBVC4sIkTXSeAKtR5e9pZ4qKyRHV9T+qq28YkYYuUyOVrpWeCz8MXQN6QqVQiU6EtE9YREhsna+kUD3d4FuU8te0+b4z8DJ34C8NPMfm0WkVinlSrTWtUbXkK7oEtSFp17IJnCyKpEtMuiBxO1lpSTud/NddTN6K7K966H9svbm2T/BSemEBwIeQNfgrugY2BEuji6iIxHVKBYRIlunLwEu7AQStgAJm4GUYwBq4NtergTeuIJ9147i6Q1P3//+yMhN5YZOgZ3QNbgr2tVpB7VSLToSkdnw1AyRrVM4AGGdy966vwvkXStbvfXcFiBhK5CddG/79W4AKFU8LVMDlHIlYr1j0aZOG7T1b4sY7xgo5ArRsYhqBYsIkb1x9gZiBpS9AUBa/I1SsgVI3AEU51RtP5wfcs8UMgUiPCPQ0q8lWvm3Qgu/FlxojOwWiwiRvfNpUPbWekzZaZyk/WWncM5tAa4cAgyllT+ORaTKtA5aRHlFoZFXI7TQtUAz32bQOmpFxyKyCJwjQkS3V1JYNqfkyiEg+XDZv2mnAUkPDF+NkpB2aPVDK5TerqzYIWcHZ0R5RiHaKxoNvRoi2isaIa4hfG
VbottgESGi6inOLysnukaQHDRIyklCfEY84jPjcSbjDOIz4pGUkwS9pBed1KyUciUCtYEIcQ1BqGsoorzKykeoayhLB1E1sIgQUY0rNZQiOS8ZSTlJSMpNwuWcy0jKTTLezirKEh2xSpQyJQJcAhDsEowQ1xAEuwYjxKXsX39nf04oJaoBLCJEVOtyi3ORlJuE1PxUZBZlIqMwA1lFWcgouvFvYQYyizKNbzVx6kcGGVQKFVRKFZyUTvBUe8JL7VX2r5MXvNRe8HIyve2h9uCLxRGZGYsIEVm8In0RSvQlKDWUosRw+38lSGVl48abWqmGo8IRakXZv0RkeVj1a0hoaChmz54tOoZFqo3nZuvWrZDJZMjMzDTrce5m6tSpaNKkidAMtkilUEHrqIW72h0+Gh/4a/0R7BqMMPcwRHhGoKF3QzTxbYKmvk0R7RWNeu71EOgSCG8nb7g6urKEEFmwahWRzp07Y+LEiRXuX7VqFSdnVUF2djbefPNNREZGQq1WQ6fToVu3bvjll19gywNT+/btw+jRo2tsf5V9HbZr1w7Jyclwc3OrsePcSiaT3fFt5MiRmDRpEjZt2mS2DEREtobriNykuLgYjo7m+cspMzMTDzzwALKysjBt2jS0bNkSSqUS27Ztw2uvvYYHH3wQ7u7uZjm2aD4+PmY/hqOjI3Q6nVmPkZycbPz/8uXL8fbbb+P06dPG+5ycnKDVaqHVcn0IIqKqMsupmfLh6SVLliA0NBRubm74z3/+g5ycf1dsNBgMmDlzJsLDw6FSqRAcHIzp06cb33/s2DE8+OCDcHJygpeXF0aPHo3c3FwAwIYNG6BWqysMw48fPx6dOnUy3v7nn3/QsWNHODk5ISgoCOPHj0deXp7x/aGhoZg2bRpGjhwJNzc3jBo1qkqPS01NxSOPPAInJyfUrVsXS5cuvetz8sYbbyAxMRF79uzBiBEjEB0djQYNGmDUqFE4fPiw8ZdXRkYGhg8fDg8PD2g0GvTq1Qtnzpwx7mfRokVwd3fHmjVrEBERAY1GgwEDBiAvLw+LFy9GaGgoPDw88OKLL0Kv//fyyfKPdfjw4dBqtQgJCcFvv/2GtLQ09O3bF1qtFjExMdi/f3+Fz+PNZs+ejdDQUOPtkSNHol+/fvjoo4/g7+8PLy8vjB07FiUlJSbHvvnUTGZmJkaPHg0/Pz+o1Wo0atQIa9asAQBcv34dTz75JAIDA6HRaBATE4Nly5aZHG/btm349NNPjSMRiYmJlZ6a+fnnn9GwYUOoVCqEhobi448/NvlYQkNDMWPGDDz99NNwcXFBcHAwvvrqq9t+DnU6nfHNzc0NMpmswn23Pmflz8+MGTPg5+cHd3d3vPvuuygtLcWrr74KT09PBAYGYsGCBSbHunz5MgYNGgQPDw94eXmhb9++SExMvG02IiJrZbY5IgkJCVi1ahXWrFmDNWvWYNu2bfjggw+M73/99dcxc+ZMTJkyBSdPnsQPP/wAPz8/AEB+fj569uwJDw8P7Nu3Dz/99BM2btyIcePGAQC6desGd3d3/Pzzz8b96fV6rFixAkOGDAFQVmR69OiB/v374+jRo1i+fDl27Nhh3Ee5Dz/8EI0aNcKBAwcwZcqUKj1u5MiRSExMxObNm7Fy5UrMnTsXqampt30uDAYDfvzxRwwZMgR16tSp8H6tVgulUmnc9/79+7F69Wrs2rULkiShd+/eJr/Y8/Pz8dlnn+HHH3/E+vXrsXXrVvTv3x9//PEH/vjjDyxZsgRfffUVVq5caXKcTz75BO3bt8ehQ4fQp08fDBs2DMOHD8fQoUNx8OBBhIeHY/jw4dU+TbRlyxYkJCRgy5YtWLx4MRYtWoRFixbd9rno1asX/vnnH3z//fc4efIkPvjgAygUZZdBFhYWonnz5lizZg2OHz+O0aNHY9iwYdizZw8A4NNPP0Xbtm0xatQoJCcnIzk5GUFBQRWOc+DAATzxxBP4z3/+g2PHjmHq1KmYMmVKhVwff/wxWrRogUOHDuGFF17A888/j1Onanal0M2bN+PKlSv4+++/8b///Q9Tp07Fww8/DA8PD+zZswfPPfccnnvuOVy6dAlA2ee3S5cu0Gq1+Pvvv7Fjxw5otVr07NkTxcXFNZqNiEg4qRo6deokTZgwocL9v/76q3Tzrt555x1Jo9FI2dnZxvteffVVqXXr1pIkSVJ2drakUqmkr7/+utLjfPXVV5KHh4eUm5trvG/t2rWSXC6XUlJSJEmSpPHjx0sPPvig8f0bNmyQHB0dpfT0dEmSJGnYsGHS6NGjTfa7fft2SS6XSwUFBZIkSVJISIjUr18/k23u9rjTp09LAKTdu3cb3x8XFycBkD755JNKP56rV69KAKT//e9/lb6/XHx8vARA2rlzp/G+a9euSU5OTtKKFSskSZKkhQsXSgCks2fPGrcZM2aMpNFopJycHON9PXr0kMaMGWO8HRISIg0dOtR4Ozk5WQIgTZkyxXjfrl27JABScnKyJElln8fGjRubZPzkk0+kkJAQ4+0RI0ZIISEhUmlpqfG+gQMHSoMGDTI5dvlzs2HDBkkul0unT5++43Nxs969e0uvvPKK8XZlX4dbtmyRAEgZGRmSJEnS4MGDpe7du5ts8+qrr0rR0dEmuW5+TgwGg+Tr6yvNmzfvrpkWLlwoubm5Vbj/1ues/PnR6/XG+yIiIqQOHToYb5eWlkrOzs7SsmXLJEmSpG+//VaKiIiQDAaDcZuioiLJyclJ2rBhw12zERFZE7PNEQkNDYWLi4vxtr+/v3HUIC4uDkVFRejatWulj42Li0Pjxo3h7OxsvK99+/YwGAw4ffo0/Pz8MGTIELRt2xZXrlxBnTp1sHTpUvTu3RseHh4Ayv4iPnv2rMlpE0mSYDAYcP78eURFRQEAWrRoYXLsuz0uPj4eSqXS5HGRkZF3nN8h3RhhuNuE3ri4OCiVSrRu3dp4n5eXFyIiIhAXF2e8T6PRoF69esbbfn5+CA0NNZmb4OfnV2GUJjY21uT9ABATE1PhvtTU1GrNt2jYsKFxRAMo+1wfO3as0m0PHz6MwMBANGjQoNL36/V6fPDBB1i+fDkuX76MoqIiFBUVmXwtVEVcXBz69u1rcl/79u0xe/Zs6PV6Y96bn5PyUy13Gt26Fw0bNoRc/u/go5+fHxo1amS8rVAo4OXlZTxu+dfgzd8/QNloUUJCQo1mIyISrVpFxNXVFVlZFVdEzMzMhKurq8l9Dg4OJrdlMhkMBgOAskl9dyJJ0m1/aZff36pVK9SrVw8//vgjnn/+efz6669YuHChcTuDwYAxY8Zg/PjxFfYRHBxs/P+tv+Du9rjyyYnVuUrIx8cHHh4eJmWiMtJtTonc+nxU9tze6fmu7HHl+6vsvvLHyeXyCpluPkV0pzy3Hrvc3T73H3/8MT755BPMnj0bMTExcHZ2xsSJE6t9SqKyr6HKnt/qZL9X1f18GQwGNG/evNK5R7Ux8ZeIqDZVq4hERkZi3bp1Fe7ft28fIiIiqryf+vXrw8nJCZs2bcKzzz5b4f3R0dFYvHgx8vLyj
EVh586dkMvlJn9JDx48GEuXLkVgYCDkcjn69OljfF+zZs1w4sQJhIeHV+dDvOvjoqKiUFpaiv3796NVq1YAgNOnT99x/Qq5XI5BgwZhyZIleOeddyrME8nLy4NKpUJ0dDRKS0uxZ88etGvXDkDZ5M34+HjjCE5t8vHxQUpKiskv9cOHD9/XPmNjY5GUlIT4+PhKR0W2b9+Ovn37YujQoQDKfimfOXPG5ON3dHQ0mYhbmejoaOzYscPkvn/++QcNGjQwGb2xRM2aNcPy5cvh6+tboeATEdmaak1WfeGFF5CQkICxY8fiyJEjiI+Px5w5c/Dtt9/i1VdfrfJ+1Go1Jk+ejNdeew3fffcdEhISsHv3bnz77bcAgCFDhkCtVmPEiBE4fvw4tmzZghdffBHDhg0znj4o3+7gwYOYPn06BgwYALVabXzf5MmTsWvXLowdOxaHDx/GmTNnsHr1arz44ot3zHa3x0VERKBnz54YNWoU9uzZgwMHDuDZZ5+961/6M2bMQFBQEFq3bo3vvvsOJ0+exJkzZ7BgwQI0adIEubm5qF+/Pvr27YtRo0Zhx44dOHLkCIYOHYqAgIAKpxlqQ+fOnZGWloZZs2YhISEBc+bMqbSIVkenTp3QsWNHPP744/jrr79w/vx5rFu3DuvXrwcAhIeH46+//sI///yDuLg4jBkzBikpKSb7CA0NxZ49e5CYmIhr165VOoLxyiuvYNOmTXj//fcRHx+PxYsX44svvsCkSZPuK39tGDJkCLy9vdG3b19s374d58+fx7Zt2zBhwgQkJSWJjkdEVKOqVURCQ0Oxfft2JCQk4KGHHkLLli2NV0gMHDiwWgeeMmUKXnnlFbz99tuIiorCoEGDjOfINRoNNmzYgPT0dLRs2RIDBgxA165d8cUXX5jso379+mjZsiWOHj1qvFqmXGxsLLZt24YzZ86gQ4cOaNq0KaZMmQJ/f/875qrK4xYuXIigoCB06tQJ/fv3x+jRo+Hr63vH/Xp4eGD37t0YOnQopk2bhqZNm6JDhw5YtmwZPvzwQ+NCXAsXLkTz5s3x8MMPo23btpAkCX/88UeFofzaEBUVhblz52LOnDlo3Lgx9u7dWyO/yH/++We0bNkSTz75JKKjo/Haa68ZRzimTJmCZs2aoUePHujcuTN0Oh369etn8vhJkyZBoVAgOjoaPj4+uHjxYoVjNGvWDCtWrMCPP/6IRo0a4e2338Z7772HkSNH3nd+c9NoNPj7778RHByM/v37IyoqCk8//TQKCgo4QkJENoevNUNERETC8LVmiIiISBgWESIiIhKGRYSIiIiEYREhIiIiYVhEiIiISBgWESIiIhKGRYSIiIiEYREhIiIiYVhEiIiISBgWESIiIhKGRYSIiIiEYREhIiIiYVhEiIiISBgWESIiIhKGRYSIiIiEYREhIiIiYVhEiIiISBgWESIiIhKGRYSIiIiEYREhIiIiYVhEiIiISBgWESIiIhKGRYSIiIiEYREhIiIiYVhEiIiISBgWESIiIhKGRYSIiIiEYREhIiIiYVhEiIiISBgWESIiIhKGRYSIiIiEYREhIiIiYVhEiIiISBgWESIiIhKGRYSIiIiEYREhIiIiYVhEiIiISBgWESIiIhKGRYSIiIiEYREhIiIiYVhEiIiISBgWESIiIhLm/wHI3SQD12oObQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The Model E2E Time is 9.352s.\n", - " --Computing Time is 6.273s\n", - " --Uncovered Communication Time is 0.464s\n", - " --Free Time is 2.615s\n" - ] - } - ], - "source": [ - "# 饼图展示计算、通信、空闲耗时的占比\n", - "overall_data = data.get(\"overall_data\", {})\n", - "plt.figure(figsize=(6, 6)) #设置饼图大小\n", - "plt.pie(x=overall_data.values(), labels=overall_data.keys(), explode=[0.01]*len(overall_data), autopct=\"%1.1f%%\")\n", - "plt.title(\"Model Profiling Time Distribution\")\n", - "plt.show()\n", - "print(bottleneck.get(\"overall_data\", \"\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "6a1d82fb-a31b-49ab-a859-6d4bb898c512", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Computing Time Subtype Duration(s) Duration Ratio Kernel Number\n", - "0 Cube Time 3.956 63.06% 584\n", - "1 Vector Time 1.994 31.79% 5224\n", - "\n", - "Computing Time is 6.273s\n", - " if you want more detailed advice please go to compute_perf_analysis.ipynb\n" - ] - } - ], - "source": [ - "# 展示计算细分耗时,NPU开启level1或level2,aic_metric设为PipeUtilization\n", - "compute_time = data.get(\"computing\", {})\n", - "print(pd.DataFrame(compute_time))\n", - "print(\"\\n\", bottleneck.get(\"computing\", \"\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "35df1f13", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []\n", - "\n" - ] - } - ], - "source": [ - "# 展示通信细分耗时,通信耗时受profiling性能膨胀的影响,以L0 + NPU采集的profiling为准\n", - "communication_time = data.get(\"communication\", {})\n", - "print(pd.DataFrame(communication_time))\n", - "print(\"\\n\", bottleneck.get(\"communication\", \"\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "c5e6034e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Free Time Subtype Duration(s) Duration Ratio Kernel Number\n", - "0 SDMA Time 0.073 2.79% 852\n", - "\n", - "Free Time is 2.615s\n", - " if you want more detailed advice please go to timeline_perf_analysis.ipynb\n" - ] - } - ], - "source": [ - "# 展示空闲细分耗时,该耗时受profiling性能膨胀的影响,以L0 + NPU采集的profiling为准\n", - "free_time = data.get(\"free\", {})\n", - "print(pd.DataFrame(free_time))\n", - "print(\"\\n\", bottleneck.get(\"free\", \"\"))" - ] - }, - { - "cell_type": "markdown", - "id": "3511befaff513e8e", - "metadata": { - "jupyter": { - "outputs_hidden": false - } - }, - "source": [ - "## 2)有对标的GPU数据" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "2a1e617d2a117125", - "metadata": { - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------------------------------------------------------------------------------------------------------------+\n", - "| Model Profiling Time Distribution |\n", - "+-----+----------------+------------------+----------------+------------------------------+-----------+----------+\n", - "| | Cube Time(Num) | Vector Time(Num) | Computing Time | Uncovered Communication Time | Free Time | E2E Time |\n", - "+-----+----------------+------------------+----------------+------------------------------+-----------+----------+\n", - "| GPU | 3.149s(582) | 1.346s(3433) | 4.748s | 0.024s | 0.051s | 4.840s |\n", - "| NPU | 
3.956s(584) | 1.994s(5224) | 6.273s | 0.464s | 2.615s | 9.352s |\n", - "+-----+----------------+------------------+----------------+------------------------------+-----------+----------+\n" - ] - } - ], - "source": [ - "# 有可对比的GPU数据情况下,展示比对结果\n", - "from prettytable import PrettyTable\n", - "comparison_result = data.get(\"comparison_result\", {})\n", - "if not comparison_result:\n", - " print(\"Invalid comparison data, you need to set the gpu_profiling_path.\")\n", - "if comparison_result:\n", - " for sheet_name, data in comparison_result.items():\n", - " if data.get(\"rows\", []):\n", - " table = PrettyTable()\n", - " table.title = sheet_name\n", - " table.field_names = data.get(\"headers\", [])\n", - " for row in data.get(\"rows\", []):\n", - " table.add_row(row)\n", - " print(table)\n", - " print(bottleneck.get(\"comparison_result\", \"\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d968851", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/profiler/advisor/result/__init__.py b/profiler/advisor/result/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/result/item.py b/profiler/advisor/result/item.py new file mode 100644 index 0000000000000000000000000000000000000000..fa0ffb5b1c769dd5e7a0d69523d0c94a65ffaf19 --- /dev/null +++ b/profiler/advisor/result/item.py @@ -0,0 +1,61 @@ +class OptimizeItem: + + def __init__(self, problem, description, suggestion): + self.problem = problem + self.description = description + self.suggestion = suggestion + + @property + def data(self): + format_suggestions = [] + for index, suggesion in enumerate(self.suggestion): + format_suggestions.append(f"{index + 1}. 
{suggesion}") + suggestion_str = "\n".join(format_suggestions) + return [self.problem, self.description, suggestion_str] + + @property + def headers(self): + return ["problem", "description", "suggestion"] + + +class StatisticsItem: + def __init__(self, total_task_duration, task_duration, count, income=None): + self.total_task_duration = total_task_duration + self.task_duration = task_duration + self.count = count + self.income = income + if not isinstance(task_duration, str): + self.task_duration_ratio = round(task_duration / total_task_duration, 4) if total_task_duration != 0 else 0 + else: + self.task_duration_ratio = "" + + @property + def data(self): + + def _cal_ratio(divisor, dividend): + if divisor and dividend != 0: + return divisor, round(divisor / dividend, 4) + else: + return "", "" + + income, income_ratio = _cal_ratio(self.income, self.total_task_duration) + return [self.count, self.total_task_duration, self.task_duration_ratio, income, income_ratio] + + @property + def headers(self): + return ["problem count", "total_time(us)", "time ratio", "income(us)", "income ratio"] + + +class OptimizeRecord: + + def __init__(self, optimization_item, statistics_item=None) -> None: + self.optimization_item = optimization_item + self.statistics_item = statistics_item or StatisticsItem("", "", "") + + @property + def data(self): + return self.optimization_item.data + self.statistics_item.data + + @property + def headers(self): + return self.optimization_item.headers + self.statistics_item.headers diff --git a/profiler/advisor/result/result.py b/profiler/advisor/result/result.py new file mode 100644 index 0000000000000000000000000000000000000000..c7d7da8663c8f0105734ec211e2a55a988030465 --- /dev/null +++ b/profiler/advisor/result/result.py @@ -0,0 +1,210 @@ +import json +import os +import stat +from textwrap import fill +from collections import OrderedDict + +import click +import xlsxwriter +from prettytable import ALL, PrettyTable + +from profiler.advisor.common import constant as const +from profiler.advisor.utils.utils import singleton, logger +from profiler.advisor.config.config import Config + + +class ResultWriter: + def __init__(self, result_path=None): + self.result_path = result_path + self.workbook = xlsxwriter.Workbook(result_path) + + self.header_format = None + self.data_cell_format = None + self._init_header_format() + self._init_data_cell_format() + + def _init_header_format(self): + self.header_format = self.workbook.add_format({ + "bold": True, + "color": "#FFFFFF", + "bg_color": "#187498", + "align": "center", + "border": 1, + "font_name": "Arial", + }) + + def _init_data_cell_format(self): + self.data_cell_format = self.workbook.add_format({ + "bold": False, + "align": "left", + "valign": "top", + "border": 1, + "font_name": "Arial", + 'text_wrap': True + }) + + def add_data(self, sheet_name, headers, data_list): + sheet = self.workbook.add_worksheet(sheet_name) + + if headers: + for col_index, header in enumerate(headers): + sheet.write(0, col_index, header, self.header_format) + + if data_list: + for i, row_data in enumerate(data_list): + row_index = i + 1 + for col_index, value in enumerate(row_data): + sheet.write(row_index, col_index, value, self.data_cell_format) + + sheet.autofit() + + def save(self): + try: + self.workbook.close() + except Exception as e: + logger.error("Failed to save analysis results, reason is %s", e) + + +@singleton +class SheetRecoder: + + def __init__(self): + self._sheet_data = OrderedDict() + + @property + def sheet_data(self): + return 
self._sheet_data + + def _init_sheet_name(self, sheet_name): + if sheet_name not in self._sheet_data: + self._sheet_data[sheet_name] = {} + + def add_headers(self, sheet_name, headers): + self._init_sheet_name(sheet_name) + + if self._sheet_data[sheet_name].get("headers") is None: + self._sheet_data[sheet_name]["headers"] = headers + + def add_data(self, sheet_name, data): + self._init_sheet_name(sheet_name) + + if not isinstance(self._sheet_data[sheet_name].get("data"), list): + self._sheet_data[sheet_name]["data"] = [] + if data not in self._sheet_data[sheet_name]["data"]: + self._sheet_data[sheet_name]["data"].append(data) + + +@singleton +class OptimizeResult: + + def __init__(self): + self.result_writer = ResultWriter(Config().analysis_result_file) + self.sheet_recorder = SheetRecoder() + self.page_dict = False + self._tune_op_list = [] + + @property + def data(self): + return self.sheet_recorder.sheet_data + + def add_tune_op_list(self, tune_op_list) -> None: + """ + add tune op name to tune op list + :param tune_op_list: tune op name list to be added + :return: None + """ + for op_name in tune_op_list: + if op_name not in self._tune_op_list: + self._tune_op_list.append(op_name) + + def add(self, overview_item): + sheet_name = "problems" + + headers = overview_item.headers + data = overview_item.data + self.sheet_recorder.add_headers(sheet_name, headers) + self.sheet_recorder.add_data(sheet_name, data) + + TerminalResult().add(overview_item.optimization_item.data) + self.page_dict = True + + def add_detail(self, sheet_name, headers=None, detail=None): + if headers: + self.sheet_recorder.add_headers(sheet_name, headers) + if detail: + self.sheet_recorder.add_data(sheet_name, detail) + self.page_dict = True + + def show(self): + for sheet_name, sheet_data in self.sheet_recorder.sheet_data.items(): + self.result_writer.add_data(sheet_name, sheet_data.get("headers"), sheet_data.get("data")) + + terminal_result = TerminalResult() + terminal_result.print() + if not terminal_result.result_list: + Config().remove_log() + return + self.result_writer.save() + logger.info("Save problems details file to %s", Config().analysis_result_file) + self._save_op_file_list() + + def _save_op_file_list(self) -> None: + if not self._tune_op_list: + return + tune_op_dict = {"tune_ops_name": self._tune_op_list} + tune_ops_file = Config().tune_ops_file + try: + + with os.fdopen(os.open(tune_ops_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR), + 'w', encoding="utf-8") as op_tune_file: + json.dump(tune_op_dict, op_tune_file) + except OSError as error: + logger.error("Dump op_list to %s failed, %s", tune_ops_file, error) + return + logger.info("Save tune op name list to %s", tune_ops_file) + + +@singleton +class TerminalResult: + """ + Result output to screen + """ + + def __init__(self): + self.width, _ = self.get_terminal_size() + if self.width is None: + self.table = PrettyTable(["No.", "Problem", "Description", "Suggestion"]) + else: + self.table = PrettyTable(["No.", "Problem", "Description", "Suggestion"], + max_table_width=max(self.width - 20, 180)) + self.table.hrules = ALL + self.result_list = [] + + @staticmethod + def get_terminal_size(): + try: + width, height = os.get_terminal_size() + except OSError: + width, height = None, None + return width, height + + def add(self, result_str): + """ + add a result str + """ + self.result_list.append(result_str) + + def print(self): + """ + print screen result with format table + """ + table_row_cnt = 0 + for result in 
self.result_list: + table_row_cnt += 1 + self.table.add_row([table_row_cnt] + result) + self.table.align = "l" + + if table_row_cnt > 0: + click.echo(self.table) + else: + click.echo(click.style(const.SKIP_ANALYZE_PROMPT, fg='red')) diff --git a/profiler/advisor/rules/__init__.py b/profiler/advisor/rules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/rules/aicpu_rules.yaml b/profiler/advisor/rules/aicpu_rules.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9313700c800d337eaea18f5a634521710f09e465 --- /dev/null +++ b/profiler/advisor/rules/aicpu_rules.yaml @@ -0,0 +1,103 @@ +DataTypeSuggeation: &DataTypeSuggeation "Data type {} in {} operator may cause AICPU issues, Try to convert to {} if possible." +AICPU_DOC_URL: &AICPU_DOC_URL "https://support.huaweicloud.com/bestpractice-modelarts/modelarts_10_2517.html" + +CommonChecker: + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ __ALL__ ] + ignore_type: [ cast, tensorequal, equal, nonzero, mul ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, int16, uint16, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, int16, uint16, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ cast ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ tensorequal ] + input: [ float, float32, float16, bool, int32, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ equal ] + input: [ float, float32, float16, bool, int32, int64, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ nonzero ] + input: [ float16, bool, dt_bf16 ] + output: [ int64 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [7.0.RC1] + op_type: [ mul ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, int8, uint8, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.0, 7.0.0] + op_type: [ __ALL__ ] + ignore_type: [ cast, tensorequal, equal, nonzero, mul ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, int16, complex64, complex128 ] + output: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, int16, complex64, complex128 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.0, 7.0.0] + op_type: [ cast ] + input: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + output: [ float, float32, float16, bool, int32, uint32, int64, uint64, uint8, dt_bf16 ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.0, 7.0.0] + op_type: [ tensorequal ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int8, uint8 ] + output: [ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.0, 7.0.0] + op_type: [ equal ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8 ] + output: 
[ bool ] + suggestion: *DataTypeSuggeation + + - DataTypeChecker: + cann_version: [8.0.0, 7.0.0] + op_type: [ mul ] + input: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, complex64 ] + output: [ float, float32, float16, dt_bf16, float64, bool, int32, int64, int8, uint8, complex64 ] + suggestion: *DataTypeSuggeation + +ExampleGuideChecker: + - IndexPutChecker: + op_type: [index] + url: *AICPU_DOC_URL + suggestion: 'Please modify source code followed by this LINK, try to replace index operator with equivalent operator.' + + - NonzeroChecker: + op_type: [ indexput, indexputv2 ] + url: *AICPU_DOC_URL + suggestion: 'Please modify source code followed by this LINK, try to replace indexput operator with equivalent operator.' + + - CastChecker: + op_type: [ argmin ] + url: *AICPU_DOC_URL + suggestion: 'Please update your cann-tookit to at least 7.0.RC1 version by this LINK.' + + - CastChecker: + op_type: [ nonzero ] + url: *AICPU_DOC_URL + suggestion: 'Please modify source code followed by this LINK, try to replace nonzero operator with equivalent operator.' \ No newline at end of file diff --git a/profiler/advisor/rules/op_fusion_pass.yaml b/profiler/advisor/rules/op_fusion_pass.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ff69a578285ba15d075f2acbb852499d56021a2 --- /dev/null +++ b/profiler/advisor/rules/op_fusion_pass.yaml @@ -0,0 +1,491 @@ +Elementwise: &Elementwise [ Relu, Pow, Add, Sub, Mul, Div, Abs, Ceil, Log, Sqrt, Exp, LeakyRelu ] + +GraphFusion: + - FlashAttentionFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [ Mul ] + - node_3: [ Softmax, SoftmaxV2 ] + - node_4: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + - [ node_3, node_4 ] + + - FlashAttentionFusionPass_V2: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [ Mul ] + - node_3: [ TransData ] + - node_4: [ Softmax, SoftmaxV2 ] + - node_5: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + - [ node_3, node_4 ] + - [ node_4, node_5 ] + + - BMMStridedSliceDGeluFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [StridedSliceD] + - node_3: [Relu] + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + + - BMMConfusionTransposeDFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [ ConfusionTransposeD ] + - node_3: [ Relu ] + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + + - BMMConfusionTransposeDFusionPass_V2: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [ ConfusionTransposeD ] + edges: + - [ node_1, node_2 ] + + - Conv2DAddGroupNormFusionPass: + version: 0 + struct: [ Conv2D, Add, GroupNorm ] + + - RMSnormAddFusionPass: + version: 0 + struct: [ RMSnorm, Add ] + + - ConvToFullyConnectionFusionPass: + version: 0 + struct: [ Conv ] + + - ZConcatv2dFusionPass: + version: 0 + struct: [ ConcatV2d, ConcatV2d ] + + - BatchMatMulReduceMeanFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMulV2, BatchMatMul, MatMul, MatMulV2 ] + - node_2: [ Add ] + - node_3: [ Relu ] + - node_4: [ ReduceMean ] + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + - [ node_3, node_4 ] + + - PadDepthwiseConv2dFusionPass: + version: 0 + struct: [ PadD, DepthwiseConv2D ] + + - ConvBatchnormFusionPass: + version: 
1 + nodes: + - node_1: [ Conv2d, Conv3d, DepthwiseConv2d ] + - node_2: [ Batchnorm ] + + edges: + - [ node_1, node_2 ] + + - AConv2dMulFusion: + version: 1 + nodes: + - node_1: [ Conv2d, Conv3d ] + - node_2: [ Mul ] + + edges: + - [ node_1, node_2 ] + + - TBEConvAddFusion: + version: 1 + nodes: + - node_1: [ Conv2d, Conv3d ] + - node_2: [ Add ] + + edges: + - [ node_1, node_2 ] + + - ZBNupdateReluV2Conv2DBNreducePass: + version: 0 + struct: [ BNTrainingUpdate, ReluV2, Conv2D, BNTrainingReduce ] + + - ASplitConv2dConcatPass: + version: 1 + nodes: + - node_1: [ MatMul, MatMulV2, BatchMatMul, BatchMatMulV2 ] + - node_2: [ Cast ] + + edges: + - [ node_1, node_2 ] + + - MatMulBiasAddFusionPass: + version: 1 + nodes: + - node_1: [ MatMul, MatMulV2, BatchMatMul, BatchMatMulV2 ] + - node_2: [ BiasAdd, Add ] + + edges: + - [ node_1, node_2 ] + + - Conv2DbpInputBiasAddFusionPass: + version: 0 + struct: [ Conv2DBackpropInput, BiasAdd ] + + - BatchMatmulV2ReduceFusionPass: + version: 0 + struct: [ BatchMatMulV2, ReduceSumD ] + + - BatchMatmulV2ReduceFusionPass_V2: + version: 0 + struct: [ BatchMatMulV2, Cast, ReduceSumD ] + + - Conv3DbpInputBiasAddFusionPass: + version: 0 + struct: [ Conv3DBackpropInputD, BiasAdd ] + + - AFullyConnectionReshapePass: + version: 0 + struct: [ FullyConnection, Reshape ] + + - GemmTransFusionPass: + version: 0 + struct: [ Transpose, Gemm ] + + - Resnet50DbnDwFusionPass: + version: 0 + struct: [ BNTrainingReduceGrad, Conv2DBackpropFilterD ] + + - CastReluCastFusionPass: + version: 0 + struct: [ Cast, Relu, Cast ] + + - PadConv2dFusionPass: + version: 1 + nodes: + - node_1: [ PadD, PadDV3 ] + - node_2: [ Conv2D ] + + edges: + - [ node_1, node_2 ] + + - Conv2DTransposeBatchnormFusionPass: + version: 1 + nodes: + - node_1: [ Conv2dTranspose ] + - node_2: [ BatchNorm, BNInference ] + + edges: + - [ node_1, node_2 ] + + - AvgPoolV2GradFusionPass: + version: 0 + struct: [ AvgPooV2lGrad ] + + - DropOutDoMaskFusionPass: + version: 0 + struct: [ DropOutDoMaskV3D ] + + - ConvCastFusionPass: + version: 0 + struct: [ Conv2D, Cast ] + + - ConvCastFusionPass_V2: + version: 0 + struct: [ Conv2D, TransData, Cast ] + + - StridedSliceConcatFusionPass: + version: 1 + nodes: + - node_1: [ StridedSliceD ] + - node_2: [ StridedSliceD ] + - node_3: [ ConcatD ] + + edges: + - [ node_1, node_3 ] + - [ node_2, node_3 ] + + - ConvCastFusionPass: + version: 0 + struct: [ SplitV ] + + - AInplaceAddFusionPass: + version: 0 + struct: [ InplaceAdd ] + + - AInplaceSubFusionPass: + version: 0 + struct: [ InplaceSub ] + + - AInplaceUpdateFusionPass: + version: 0 + struct: [ InplaceUpdate ] + +UBFusion: + - TbeConv3dElemwisePass: + version: 1 + nodes: + - node_1: [ Conv3D ] + - node_2: *Elementwise + edges: + - [ node_1, node_2 ] + + - TbeConv3dDxElemwisePass: + version: 0 + struct: [ Conv3dBackpropInput, AddN, LeakyReluGrad ] + + - TbeConv3dDxElemwisePass_V2: + version: 0 + struct: [ Conv3dBackpropInput, LeakyReluGrad ] + + - MatMulDropoutDoMaskV3dFusionPass: + version: 0 + struct: [ MatMul, Dropout_do_mask_v3_d, Add ] + + - BatchMatMulDropoutDoMaskV3dFusionPass: + version: 0 + struct: [ BatchMatMul, Dropout_do_mask_v3_d, Add ] + + - MatmulReduceSumUbFusion: + version: 0 + struct: [ BatchMatMul, ReduceSum ] + + - TbeBatchMatMulElementWiseFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMul, GEMM ] + - node_2: *Elementwise + + edges: + - [ node_1, node_2 ] + + - ATbeMatMulElemwiseFusionPass: + version: 1 + nodes: + - node_1: [ MatMul, GEMM ] + - node_2: *Elementwise + + edges: + - [ node_1, node_2 ] + 
+ - MatmulConfusiontransposeUbFusion: + version: 0 + struct: [ MatMul, matmul_transpose ] + + - TbeFullyconnectionElemwiseDequantFusionPass: + version: 1 + nodes: + - node_1: [ BatchMatMul, MatMul, FullyConnection ] + - node_2: *Elementwise + + edges: + - [ node_1, node_2 ] + + - BatchMatmulConfusiontransposeUbFusion: + version: 0 + struct: [ BatchMatMul, batchmatmul_transpose ] + + - TbeConvSigmoidMulQuantFusionPass: + version: 1 + nodes: + - node_1: [ Conv ] + - node_2: [ Sigmoid ] + - node_3: [ Mul ] + - node_4: [ Quant ] + + edges: + - [ node_1, node_2 ] + - [ node_1, node_3 ] + - [ node_2, node_3 ] + - [ node_3, node_4 ] + + - TbeConv2DReluv2Pass: + version: 0 + struct: [ Conv2D, ReluV2 ] + + - TbeConvDoubleInFusionPass: + version: 1 + nodes: + - node_1: [ Conv2D ] + - node_2: *Elementwise + - node_3: *Elementwise + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + + - TbeConv2dAddClipMulDivFusionPass: + version: 0 + struct: [ Conv2D, Add, Clip, Mul, Div ] + + - TbeConv2dAddClipMulDivFusionPass_V2: + version: 0 + struct: [ Conv2D, Add, Clip, Mul ] + + - TbeConv2dAddRelu6MulMulFusionPass: + version: 1 + nodes: + - node_1: [ Conv2D, DepthwiseConv2D ] + - node_2: [ Add ] + - node_3: [ Relu6 ] + - node_4: [ Mul ] + - node_5: [ Mul ] + + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + - [ node_3, node_4 ] + - [ node_4, node_5 ] + + - ConvClipByValueFusionPass: + version: 1 + nodes: + - node_1: [ Conv2D ] + - node_2: *Elementwise + edges: + - [ node_1, node_2 ] + + - TbeAippConvReluMaxpoolingFusion: + version: 1 + nodes: + - node_1: [ Conv2D ] + - node_2: *Elementwise + - node_3: [ MaxPool, MaxPoolv3 ] + + edges: + - [ node_1, node_2 ] + - [ node_2, node_3 ] + + - TbeReduceElemwiseFusionPass: + version: 1 + nodes: + - node_1: *Elementwise + - node_2: [ CommReduce ] + edges: + - [ node_1, node_2 ] + + - TbeReadSelectEltwiseFusionPass: + version: 1 + nodes: + - node_1: [ ReadSelect ] + - node_2: *Elementwise + + edges: + - [ node_1, node_2 ] + + - TbeEltwiseWriteSelectFusionPass: + version: 1 + nodes: + - node_1: *Elementwise + - node_2: [ write_select ] + + edges: + - [ node_1, node_2 ] + + - TbeEltwiseFusionPass: + version: 1 + nodes: + - node_1: *Elementwise + - node_2: *Elementwise + + edges: + - [ node_1, node_2 ] + + - TbeConvBnreduceFusionPass: + version: 0 + struct: [ Convolution, bn_reduce ] + + - TbeBnupdateEltwiseFusionPass: + version: 1 + nodes: + - node_1: [ bn_update ] + - node_2: *Elementwise + edges: + - [ node_1, node_2 ] + + - TbeConv2DBackpropElemwiseFusionPass: + version: 1 + nodes: + - node_1: [ Conv2DBackpropInputD, Conv2DTransposeD, Deconvolution ] + - node_2: [ Add, ReluGradV2 ] + + edges: + - [ node_1, node_2 ] + + - TbeDxElemwisePass: + version: 1 + nodes: + - node_1: [ Conv2DBackpropInputD, Conv2DTransposeD, Deconvolution ] + - node_2: [ LeakyRelu, Prelu ] + + edges: + - [ node_1, node_2 ] + + - TbeConv2dBackpropRequantFusionPass: + version: 1 + nodes: + - node_1: [ Conv2DBackpropInputD, Conv2DTransposeD, Deconvolution ] + - node_2: [ AscendRequant ] + + edges: + - [ node_1, node_2 ] + + - TbeDwTransdataFusionPass: + version: 1 + nodes: + - node_1: [ Transdate ] + - node_2: [ Transdate ] + - node_3: [ Conv2DBackpropFilter ] + + edges: + - [ node_1, node_3 ] + - [ node_2, node_3 ] + + - TbeDxTransdataFusionPass: + version: 1 + nodes: + - node_1: [ Transdate ] + - node_2: [ Transdate ] + - node_3: [ Conv2DBackpropInput ] + + edges: + - [ node_1, node_3 ] + - [ node_2, node_3 ] + + - TbeEltwiseCastFusionPass: + version: 1 + nodes: + - node_1: [ Relu, Add, 
Mul, Sqrt ] + - node_2: [ Cast ] + + edges: + - [ node_1, node_2 ] + + - TbeEltwiseCastFusionPass_V2: + version: 1 + nodes: + - node_1: [ Cast ] + - node_2: [ Relu, Add, Mul, Sqrt ] + + + edges: + - [ node_1, node_2 ] + + - TbeConv2DBackpropDequantFusionPass: + version: 1 + nodes: + - node_1: [ Conv2DBackpropInputD, Conv2DTransposeD, Deconvolution ] + - node_2: [ AscendDequant ] + + + edges: + - [ node_1, node_2 ] diff --git a/profiler/advisor/rules/timeline_fusion_ops.yaml b/profiler/advisor/rules/timeline_fusion_ops.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10c12ff18dd8792e24a89c6d5fbb7ed87f643a9d --- /dev/null +++ b/profiler/advisor/rules/timeline_fusion_ops.yaml @@ -0,0 +1,59 @@ +- cann_version: 6.3.RC2 + torch_version: 1.11.0 + unique_id: 0 + operator_rules: + aten: + add: + torch_npu.npu_confusion_transpose: ["(permute|transpose)-(contiguous){0,1}-(reshape|view)", + "(reshape|view)-(contiguous){0,1}-(permute|transpose)"] + torch_npu.fast_gelu: [gelu] + torch_npu.npu_linear: [linear] + torch_npu.npu_mish: [mish] + torch_npu.contrib.module.Mish: [mish] + torch_npu.npu_scaled_masked_softmax: [ "softmax-(mul){0,1}-(masked_fill_|add)" ] + torch_npu.npu_silu: [ silu, mul-sigmoid, sigmoid-mul ] + torch_npu.contrib.module.SiLU: [ silu, mul-sigmoid, sigmoid-mul ] + optimizer.clip_grad_norm_fused_: [add-reciprocal-mul] + Optimizer: + add: + torch_npu.optim.NpuFusedAdamW: [AdamW.step] + torch_npu.optim.NpuFusedSGD: [SGD.step] + torch_npu.optim.NpuFusedAdadelta: [Adadelta.step] + torch_npu.optim.NpuFusedLamb: [Lamb.step] + torch_npu.optim.NpuFusedAdamP: [AdamP.step] + torch_npu.optim.NpuFusedBertAdam: [BertAdam.step] + torch_npu.optim.NpuFusedRMSprop: [RMSprop.step] + torch_npu.optim.NpuFusedRMSpropTF: [RMSpropTF.step] + torch_npu.optim.NpuFusedAdam: [Adam.step] + + +- cann_version: 7.0.RC1 + torch_version: [1.11.0,2.1.0] + unique_id: 1 + inherit_unique_id: 0 + operator_rules: + aten: + add: + torch_npu.npu_fusion_attention: ["matmul-(add){0,1}-(mul){0,1}-(masked_fill_|add){0,1}-softmax-(dropout){0,1}-matmul"] + torch_npu.npu_rotary_mul: ["(chunk|slice)-neg-cat-(mul){0,2}-add"] + +- cann_version: 7.0.0 + torch_version: [1.11.0, 2.1.0] + unique_id: 2 + inherit_unique_id: 1 + operator_rules: + aten: + add: + torch_npu.npu_rms_norm: ["(pow){0,1}-(mean){0,1}-(add){0,1}-rsqrt-mul-(type_as){0,1}"] + torch_npu.npu_swiglu: [ "(slice|chunk)-silu-mul", "(slice|chunk)-mul-silu", + "(slice|chunk)-sigmoid-mul-mul", "(slice|chunk)-mul-sigmoid-mul", + "(slice|chunk)-mul-mul-sigmoid" ] + +- cann_version: 8.0.0 + torch_version: [1.11.0, 2.1.0] + unique_id: 3 + inherit_unique_id: 2 + operator_rules: + aten: + add: + torch_npu.npu_geglu: ["(slice|chunk)-gelu-mul", "(slice|chunk)-mul-gelu"] \ No newline at end of file diff --git a/profiler/advisor/timeline_perf_analysis.ipynb b/profiler/advisor/timeline_perf_analysis.ipynb deleted file mode 100644 index 34233db6fe10f6cec0e708e3d829a6be73436d6b..0000000000000000000000000000000000000000 --- a/profiler/advisor/timeline_perf_analysis.ipynb +++ /dev/null @@ -1,163 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from advisor_backend.interface import Interface\n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Timeline调优分析\n", - "\n", - "## 1. Timeline分析的数据准备\n", - "我们当前支持Ascend PyTorch Profiler方式采集后的ascend_pt目录,并支持单独分析ascend_pt/ASCEND_PROFILER_OUTPUT目录下的trace_view.json文件。\n", - "\n", - "## 2. 
Timeline分析解决的问题\n", - "当前支持的功能:\n", - "1) 识别当前可选择的NPU亲和优化器。\n", - "2) 分析算子调度瓶颈。" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# EDIT THE PROFILING DATA PATH\n", - "timeline_path = \"[YOUR PATH]\"\n", - "interface = Interface(timeline_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1)亲和优化器识别" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[INFO] Start to analyse the target file: [YOUR PATH]\n", - "['Optimizer.step#AdamW.step']\n", - "You can choose torch_npu.optim.NpuFusedAdamW to replace the current Optimizer: Optimizer.step#AdamW.step.\n" - ] - } - ], - "source": [ - "dataset = interface.get_data('timeline', 'optimizer')\n", - "# 打印当前使用的优化器\n", - "data = dataset.get('data')\n", - "print(data)\n", - "\n", - "# 如果使用了原生优化器,则打印优化建议\n", - "advice = dataset.get('advice')\n", - "print(advice)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2)算子调度分析\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[INFO] Start to analyse the target file: [YOUR PATH]\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAABMQAAAK9CAYAAADLzbDJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydd7wcVd3/P1tuS09IAgmE3iMdfVRAwOfhJyiKyKOPijyooIAFLKAgKlgwdAULTaSIID5SpEdagCBVCKGkkkp6bm5uv9tmfn/sntlzzpxz5szu7N25u9/365VX7r07O3N2dnd25rOfz+ckXNd1QRAEQRAEQRAEQRAEQRBNQrLeAyAIgiAIgiAIgiAIgiCI4YQEMYIgCIIgCIIgCIIgCKKpIEGMIAiCIAiCIAiCIAiCaCpIECMIgiAIgiAIgiAIgiCaChLECIIgCIIgCIIgCIIgiKaCBDGCIAiCIAiCIAiCIAiiqSBBjCAIgiAIgiAIgiAIgmgqSBAjCIIgCIIgCIIgCIIgmgoSxAiCIAiCIAiCIAiCIIimggQxgiAIgiCIEcqKFSuQSCRw5ZVX1nxbt956KxKJBFasWBH6vnPmzEEikcCcOXMiHxdBEARBEEQlkCBGEARBEETT8/bbb+NLX/oStt9+e7S1tWH69Ok4+eST8fbbb9dkew8++CCOPPJITJ06FaNGjcKuu+6Kz33uc3jsscdqsj2CIAiCIAhChAQxgiAIgiCamnvvvRcHH3wwnnzySXzlK1/BH/7wB5x22ml4+umncfDBB+O+++6LdHtXXnklPvWpTyGRSOCCCy7Ar3/9a5x00klYsmQJ/vrXv0a6LYIgCIIgCEJNut4DIAiCIAiCqBfvvvsuTjnlFOy666549tlnMWXKFO+2c845B0cccQROOeUUzJ8/H7vuumvV28vn8/jFL36BY445Bv/85z99t2/cuLHqbRAEQRAEQRDBkEOMIAiCIIim5YorrsDAwABuvPFGQQwDgMmTJ+OGG25Af38/Lr/8cu/vF198MRKJBBYuXIjPfe5zGDduHLbZZhucc845GBoaMm5v8+bN6OnpwWGHHaa8ferUqcLvQ0NDuPjii7Hnnnuivb0d06ZNw2c+8xm8++67vvveeOON2G233dDW1ob3v//9eOWVV3zLLFy4EP/93/+NSZMmob29HYceeigeeOAB33Jvv/02PvrRj6KjowM77LADfvnLX8JxHN9yiUQCF198se/vO++8M7785S9r9kKZl156CcceeyzGjx+PUaNG4cgjj8Tzzz8feD+CIAiCIIhqIYcYQRAEQRBNy4MPPoidd94ZRxxxhPL2j3zkI9h5553x8MMP+2773Oc+h5133hmzZs3Ciy++iGuvvRZdXV24/fbbtdubOnUqOjo68OCDD+Lb3/42Jk2apF22UCjg+OOPx5NPPonPf/7zOOecc9Db24vHH38cb731FnbbbTdv2TvvvBO9vb0444wzkEgkcPnll+Mzn/kMli1bhpaWFgBFkeuwww7D9ttvj/PPPx+jR4/G3/72N3z605/GPffcgxNPPBEAsH79ehx99NHI5/PecjfeeCM6Ojqs9qktTz31FI477jgccsghuOiii5BMJnHLLbfgox/9KJ577jl84AMfiHR7BEEQBEEQPCSIEQRBEATRlHR3d2Pt2rU44YQTjMvtv//+eOCBB9Db24uxY8d6f99ll13wj3/8AwDwzW9+E+PGjcMf/vAHnHvuudh///2V60omkzjvvPPw85//HDvuuCM+8pGP4PDDD8exxx6Lgw8+WFj29ttvx5NPPomrr74a3/3ud72/n3/++XBdV1h21apVWLJkCSZOnAgA2GuvvXDCCSdg9uzZOP744wEUI6A77rgjXnnlFbS1tQEAvvGNb+Dwww/HD3/4Q08Qu+yyy7Bp0ya89NJLnih16qmnYo899jDv0BC4roszzzwTRx99NB599FEkEgkAwBlnnIGZM2fixz/+sTJSShAEQRAEERUUmSQIgiAIoinp7e0FAEHkUsFu7+npEf7+zW9+U/j929/+NgDgkUceMa7vZz/7Ge68804cdNBBmD17Ni688EIccsghOPjgg7FgwQJvuXvuuQeTJ0/21svDBCTG//zP/3hiGADP8bZs2TIAwJYtW/DUU0/hc5/7HHp7e7F582
Zs3rwZnZ2d+NjHPoYlS5ZgzZo13vg/+MEPCg6tKVOm4OSTTzY+rjDMmzcPS5YswRe/+EV0dnZ64+nv78d//ud/4tlnn1VGNAmCIAiCIKKCHGIEQRAEQTQlTOhiwpgOnXAmO6Z22203JJNJrFixInDbX/jCF/CFL3wBPT09eOmll3DrrbfizjvvxCc/+Um89dZbaG9vx7vvvou99toL6XTw6dqOO+4o/M7Esa6uLgDA0qVL4boufvKTn+AnP/mJch0bN27E9ttvj5UrV+I//uM/fLfvtddegeOwZcmSJQCKzjMd3d3dgshHEARBEAQRJSSIEQRBEATRlIwfPx7Tpk3D/PnzjcvNnz8f22+/PcaNG2dcTnZt2TBu3Dgcc8wxOOaYY9DS0oLbbrsNL730Eo488shQ60mlUsq/s2glc1ude+65+NjHPqZcdvfddw+1TROFQsF4OxvPFVdcgQMPPFC5zJgxYyIbD0EQBEEQhAwJYgRBEARBNC3HH388brrpJsydOxeHH3647/bnnnsOK1aswBlnnOG7bcmSJdhll12835cuXQrHcbDzzjtXNJZDDz0Ut912G9atWweg6Dh76aWXkMvlvGL8Stl1110BAC0tLfiv//ov47I77bST5+DiWbRoke9vEydOxNatW4W/ZbNZ7zHoYBMCjBs3LnA8BEEQBEEQtYA6xAiCIAiCaFrOO+88dHR04IwzzkBnZ6dw25YtW3DmmWdi1KhROO+883z3/f3vfy/8/tvf/hYAcNxxx2m3NzAwgBdeeEF526OPPgqgHE086aSTsHnzZvzud7/zLSuX6gcxdepUHHXUUbjhhhuUYtWmTZu8nz/+8Y/jxRdfxMsvvyzc/pe//MV3v9122w3PPvus8Lcbb7wx0CF2yCGHYLfddsOVV16Jvr4+43gIgiAIgiBqATnECIIgCIJoWvbYYw/cdtttOPnkk7HffvvhtNNOwy677IIVK1bg5ptvxubNm3HXXXd5jiae5cuX41Of+hSOPfZYvPDCC7jjjjvwxS9+EQcccIB2ewMDA/jwhz+MD37wgzj22GMxY8YMbN26Fffffz+ee+45fPrTn8ZBBx0EAPjf//1f3H777fje976Hl19+GUcccQT6+/vxxBNP4Bvf+Ebg7Jgyv//973H44Ydjv/32w9e+9jXsuuuu2LBhA1544QW89957eOONNwAAP/jBD/DnP/8Zxx57LM455xyMHj0aN954I3baaSdfvPT000/HmWeeiZNOOgnHHHMM3njjDcyePRuTJ082jiWZTOKPf/wjjjvuOMycORNf+cpXsP3222PNmjV4+umnMW7cODz44IOhHh9BEARBEEQYSBAjCIIgCKKp+exnP4u9994bs2bN8kSwbbbZBkcffTR+9KMf4X3ve5/yfnfffTd++tOf4vzzz0c6nca3vvUtXHHFFcZtTZgwATfddBMefvhh3HLLLVi/fj1SqRT22msvXHHFFTj77LO9ZVOpFB555BFccskluPPOO3HPPfdgm2228UStsOy777549dVX8bOf/Qy33norOjs7MXXqVBx00EH46U9/6i03bdo0PP300/j2t7+NSy+9FNtssw3OPPNMTJ8+Haeddpqwzq997WtYvnw5br75Zjz22GM44ogj8Pjjj+M///M/A8dz1FFH4YUXXsAvfvEL/O53v0NfXx+22247/Md//IcyokoQBEEQBBElCTes554gCIIgCKKJufjii/Gzn/0MmzZtCnRCEQRBEARBEPGEOsQIgiAIgiAIgiAIgiCIpoIEMYIgCIIgCIIgCIIgCKKpIEGMIAiCIAiCIAiCIAiCaCqoQ4wgCIIgCIIgCIIgCIJoKurqECsUCvjJT36CXXbZBR0dHdhtt93wi1/8AqTREQRBEARBEARBEARBELUiXc+NX3bZZbjuuutw2223YebMmXj11Vfxla98BePHjxemHScIgiAIgiAIgiAIgiCIqKhrZPL444/Htttui5tvvtn720knnYSOjg7ccccdgfd3HAdr167F2LFjkUgkajlUgiAIgiAIgiAIgiAIIua4rove3l5Mnz4dyaQ+GFlXh9iHP/xh3HjjjVi8eDH23HNPvPHGG5g7dy6uvvpq5fKZTAaZTMb7fc2aNdh3332Ha7gEQRAEQRAEQRAEQRDECGD16tXYYYcdtLfXVRA7//zz0dPTg7333hupVAqFQgGXXHIJTj75ZOXys2bNws9+9jPf31evXo1x48bVergEQRAEQRAEQRAEQRBEjOnp6cGMGTMwduxY43J1jUz+9a9/xXnnnYcrrrgCM2fOxLx58/Cd73wHV199NU499VTf8rJDjD3I7u5uEsQIgiAIgiAIgiAIgiCanJ6eHowfPz5QK6qrIDZjxgycf/75+OY3v+n97Ze//CXuuOMOLFy4MPD+tg+SIAiCIAiCIAiCIAiCaHxstSJ9u9gwMDAw4Cs4S6VScBynTiMiCIIgCIIgCIIgCIIgGp26doh98pOfxCWXXIIdd9wRM2fOxOuvv46rr74aX/3qV+s5LIIgCIIgCIIgCIIgCKKBqWtksre3Fz/5yU9w3333YePGjZg+fTq+8IUv4Kc//SlaW1sD70+RSYIgCIIgCIIgCIJoLlzXRT6fR6FQqPdQiDqQSqWQTqeRSCSUt4+IDrFqIUGMIAiCIAiCIAiCIJqHbDaLdevWYWBgoN5DIerIqFGjMG3aNKWZylYrqmtkkiAIgiAIgiAIgiAIwgbHcbB8+XKkUilMnz4dra2tWpcQ0Zi4rotsNotNmzZh+fLl2GOPPXzd9LaQIEYQBEEQBEEQBEEQROzJZrNwHAczZszAqFGj6j0cok50dHSgpaUFK1euRDabRXt7e0XrqesskwRBEARBEARBEARBEGGo1BFENA5RvAboVUQQBEEQBEEQBEEQBEE0FSSIEQRBEARBEARBEARBEE0FCWIEQRAEQRAEQRAEQRANQiKRwP3331+37V988cU48MAD67Z9W0gQIwiCIAiCIAiCIAiCIEKjEt/OPfdcPPnkk/UZUAholkmCIAiCIAiCIAiCIAgCAFAoFJBIJCourh8zZgzGjBkT8aiihxxiBEEQBEEQBEEQBEGMTFwX6O+vzz/XtR5mJpPB2WefjalTp6K9vR2HH344XnnlFe/2OXPmIJFI4OGHH8b++++P9vZ2fPCDH8Rbb71lXO+SJUvwkY98BO3t7dh3333x+OOPC7ez9W7dutX727x585BIJLBixQoAwK233ooJEybggQcewL777ou2tjasWrUKr7zyCo455hhMnjwZ48ePx5FHHonXXnvNW8/OO+8MADjxxBORSCS83+XIpOM4+PnPf44ddtgBbW1tOPDAA/HYY495t69YsQKJRAL33nsvjj76aIwaNQoHHHAAXnjhBev9WwkkiBEEQRAEQRAEQRAEMTIZGADGjKnPv4EB62H+4Ac/wD333IPbbrsNr732GnbffXd87GMfw5YtW4TlzjvvPFx11VV45ZVXMGXKFHzyk59ELpdTrtNxHHzmM59Ba2srXnrpJVx//fX44Q9/WOFuHMBll
12GP/7xj3j77bcxdepU9Pb24tRTT8XcuXPx4osvYo899sDHP/5x9Pb2AoAn6N1yyy1Yt26dIPDxXHPNNbjqqqtw5ZVXYv78+fjYxz6GT33qU1iyZImw3IUXXohzzz0X8+bNw5577okvfOELyOfzFT0eG0gQIwiCIAiCIAiCIAiCqBH9/f247rrrcMUVV+C4447Dvvvui5tuugkdHR24+eabhWUvuugiHHPMMdhvv/1w2223YcOGDbjvvvuU633iiSewcOFC3H777TjggAPwkY98BL/61a8qGmMul8Mf/vAHfPjDH8Zee+2FUaNG4aMf/Si+9KUvYe+998Y+++yDG2+8EQMDA3jmmWcAAFOmTAEATJgwAdttt533u8yVV16JH/7wh/j85z+PvfbaC5dddhkOPPBA/OY3vxGWO/fcc/GJT3wCe+65J372s59h5cqVWLp0aUWPxwbqECMIgiAIgiAIgiAIYmQyahTQ11e/bVvw7rvvIpfL4bDDDvP+1tLSgg984ANYsGCBsOyHPvQh7+dJkyZhr7328i3DWLBgAWbMmIHp06cr7x+G1tZW7L///sLfNmzYgB//+MeYM2cONm7ciEKhgIGBAaxatcp6vT09PVi7dq3w2AHgsMMOwxtvvCH8jd/+tGnTAAAbN27E3nvvHfbhWEGCGEEQBEEQBEEQBEEQI5NEAhg9ut6jiC2sGN/l+s5UEcyOjg4kEgnhb6eeeio6OztxzTXXYKeddkJbWxs+9KEPIZvN1mSsLS0t3s9sLI7j1GRbAEUmCYIgCIIgCIIgCIIgasZuu+2G1tZWPP/8897fcrkcXnnlFey7777Csi+++KL3c1dXFxYvXox99tlHud599tkHq1evxrp165T3B8qxRn6ZefPmWY37+eefx9lnn42Pf/zjmDlzJtra2rB582ZhmZaWFhQKBe06xo0bh+nTpwuPna1bfuzDDTnECIIgCIIgCIIgCIIgasTo0aNx1lln4bzzzsOkSZOw44474vLLL8fAwABOO+00Ydmf//zn2GabbbDtttviwgsvxOTJk/HpT39aud7/+q//wp577olTTz0VV1xxBXp6enDhhRcKy+y+++6YMWMGLr74YlxyySVYvHgxrrrqKqtx77HHHvjzn/+MQw89FD09PTjvvPPQ0dEhLLPzzjvjySefxGGHHYa2tjZMnDjRt57zzjsPF110EXbbbTcceOCBuOWWWzBv3jz85S9/sRpHrSCHGEEQBEEQBEEQBEEQRA259NJLcdJJJ+GUU07BwQcfjKVLl2L27Nk+AenSSy/FOeecg0MOOQTr16/Hgw8+iNbWVuU6k8kk7rvvPgwODuIDH/gATj/9dFxyySXCMi0tLbjrrruwcOFC7L///rjsssvwy1/+0mrMN998M7q6unDwwQfjlFNOwdlnn42pU6cKy1x11VV4/PHHMWPGDBx00EHK9Zx99tn43ve+h+9///vYb7/98Nhjj+GBBx7AHnvsYTWOWpFw+SDpCKOnpwfjx49Hd3c3xo0bV+/hEARBEARBEARBEARRI4aGhrB8+XLssssuaG9vr/dwImXOnDk4+uij0dXVhQkTJtR7OLHH9Fqw1YrIIUYQBEEQBEEQBEEQBEE0FSSIEQRhx+uvA1u31nsUBEEQBEEQBEEQBFE1JIgRBGPLFuDSS4HVq+s9kuh54w1g1arK7z9/PnDwwcD//m90YyIIgiAIgiAIgiAAAEcddRRc16W45DBCghhBMG69FbjgAuDXvxb/PjQErF1blyFFQmcn8P73A//v/1W+DiamNaJYSBAEQRAEQRAEQTQdJIgRBKOnp/h/X5/49099Cthpp5Erim3cCORywLp1la+jUCj+P3Ln4CAIgiAIgiAIgiAIDxLECIKRzxf/dxzx70uWFG9buXL4xxQFTMySH1cY2L4hQYwgCIIgCIIgCIJoAEgQIwiGThBjv49UMUgnZi1aBHzxi8DbbwevIwpRjSAIgiAIgiAIgiBiAgliBMHQxQKZCDRSxSCdmHXHHcBddxW702zXMVJFQYIgCIIgCIIgCILgIEGMIBiN7hCTH1c2K/5vs46Rug8IgiAIgiAIgiAIgoMEMYJg6JxUjeoQCyP0kUOMIAiCIAiCIAii4UkkErj//vvrPYxhgQQxgmAEOcRGqiCmc3eFeVy6fUMQBEEQBEEQBEE0DOvWrcNxxx0HAFixYgUSiQTmzZtX30HViHS9B0AQsUHnghrp7qgonG8jfR8QBEEQBEEQBEEQgWy33Xb1HsKwQQ4xgmA0ukNMHn8YkYs6xAiCIAiCiAsLFhQnBbr0UmDZMvG22bOB004DFi6sy9AIghh+XNdFf7a/Lv/cENdHmUwGZ599NqZOnYr29nYcfvjheOWVV7zb58yZg0QigYcffhj7778/2tvb8cEPfhBvvfWWdp3nnnsujj/+eO/33/zmN0gkEnjssce8v+2+++744x//CAB45ZVXcMwxx2Dy5MkYP348jjzySLz22mvCOvnI5C677AIAOOigg5BIJHDUUUdZP96RADnECILRqKX6TPgCio8hkSj+XIlDbKSKggRBEARBNAaLFwMzZ5bPy15/Hbj77vLtv/oV8Oyzxdm0f/Yz4Ic/LJ/7EATRkAzkBjBm1pi6bLvvgj6Mbh1ttewPfvAD3HPPPbjtttuw00474fLLL8fHPvYxLF26FJMmTfKWO++883DNNddgu+22w49+9CN88pOfxOLFi9HS0uJb55FHHok//vGPKBQKSKVSeOaZZzB58mTMmTMHxx57LNasWYN3333XE7J6e3tx6qmn4re//S1c18VVV12Fj3/841iyZAnGjh3rW//LL7+MD3zgA3jiiScwc+ZMtLa2VrajYgo5xAiCoXNMNYpDDBAfWxihjxxiBEEQBEHEgeXLxfOR5cvF2wcGiv9ns8AFFwCLFg3f2AiCIDT09/fjuuuuwxVXXIHjjjsO++67L2666SZ0dHTg5ptvFpa96KKLcMwxx2C//fbDbbfdhg0bNuC+++5TrveII45Ab28vXn/9dbiui2effRbf//73MWfOHABF19n222+P3XffHQDw0Y9+FF/60pew9957Y5999sGNN96IgYEBPPPMM8r1T5kyBQCwzTbbYLvtthOEu0aAHGIEwWhUhxgviDkOkEyWf+b/N9HsHWI9PcUIxsc/Doy2+waIIAiCIIgKuf124Oabgb//HShdjHnw5zUAsH69+HsuJ/7OBDKCIBqWUS2j0HdBX922bcO7776LXC6Hww47zPtbS0sLPvCBD2DBggXCsh/60Ie8nydNmoS99trLtwxjwoQJOOCAAzBnzhy0traitbUVX//613HRRRehr68PzzzzDI488khv+Q0bNuDHP/4x5syZg40bN6JQKGBgYACrVq0K87AbBhLECIIRRfl8HOEjk/xjqGSWyWYVxH79a+Dii4v/f+c79R4NQTQU3UPdaEm1WJ9QEgTRBPzpT8XY4zPPAP/93+JtTPDaYQfgvfeADRvESghZEGvWcxeCaCISiYR1bLEROeqoozBnzhy0tbXhyCOPxKRJk7DP
Pvtg7ty5eOaZZ/D973/fW/bUU09FZ2cnrrnmGuy0005oa2vDhz70IWSz2To+gvpBkUmCYOhEn5EuiEURmWz2DrFNm4r/b9xY33EQRIORLWSx1+/2wiE3HlLvoRAEESfYeYcsbvF/22GH4v/ZLLB1q/92BgliBEHEgN122w2tra14/vnnvb/lcjm88sor2HfffYVlX3zxRe/nrq4uLF68GPvss4923UceeSTmzp2LJ5980usKO+qoo3DXXXdh8eLFQhH+888/j7PPPhsf//jHMXPmTLS1tWHz5s3adbPOsAJvsmggyCFGEIwgh9hIPaEih1j1sMfdoB8EBFEvtgxuwYb+DdjQvwGu6yJBxdcEQQDlz105HgmUBa8xY4AJE4pi2Pr1wMSJ4u2MZv0yjyCIWDF69GicddZZOO+88zBp0iTsuOOOuPzyyzEwMIDTTjtNWPbnP/85ttlmG2y77ba48MILMXnyZHz605/WrvsjH/kIent78dBDD+HSSy8FUBTE/vu//xvTpk3Dnnvu6S27xx574M9//jMOPfRQ9PT04LzzzkNHR4d23VOnTkVHRwcee+wx7LDDDmhvb8f48eOr2xkxghxiBMEI6hAbqSdUcoeY/HMYh1izC2KqE3OCICrGccvHJBdNenwhCMKPjSCWTgPbbVf8me8Rk+/TrOcuBEHEjksvvRQnnXQSTjnlFBx88MFYunQpZs+ejYlM0OeWO+ecc3DIIYdg/fr1ePDBB42zO06cOBH77bcfpkyZgr333htAUSRzHEfoDwOAm2++GV1dXTj44INxyimn4Oyzz8bUqVO1606n07j22mtxww03YPr06TjhhBOq2APxgxxiBMEImmVypJ5Q8a4mVWSSHGLBkEOMIGoCL4gVnAKSKfqejiAI2AliLS1FQWzhQlEQo8gkQRAxpb29Hddeey2uvfZa43KHH3443nrrrVDrnjdvnvD7pEmT4Ciu8w466CC88sorwt/+W+pqdKXj5umnn47TTz891HhGCnTmSRAMlUPMdcsnUo3mEAvTC9bsHWIkiBFETSg45fcUL44RBNHkmD53eUFs222LP2/Y4L9dXhdBEARBSJAgRhAMnSCm+nkkEdQhZvO4yCFW/J8ikwQRKbwIRoIYQRAeYRxigNohxjoJm/XchSAIggiEBDGCYKhcUDoxaSQRNMtkGIdYs55UkkOMIGoCCWIEQSgxCWLsb0GCGOvbGannbzHnnU3v4I75d/iiVQRBVM5RRx0F13UxYcKEeg+laaAOMYJgqFxQKkfVSINmmaweEsQIoiaQIEYQhBJbh5gpMtnaCmQyzXvuUmO+/uDX8fzq57H35L1x6PRD6z0cgiCIiiCHGEEwVA4xXXxyJBHlLJMjVRSsFopMEkRNEEr1XRKcCYIoUWlk0nXL5yzMITZSz99izpbBLQCArsGuOo+EaFbInUhE8RogQYwYHhYtAq6+GhgcrPdI9Kg6xMI6xFwXWLcu2nFVCznEqoccYgRRE3gRjBxiBEF4VCqI8YX6bW3iuohIYcdsOnYTw01LSwsAYGBgoM4jIeoNew2w10QlUGSSGB5+8hPg//4P2HFHQJrWNTaoerLCOsQuuwy44ALg/vuBE06IdHgVE9QhFsYh1qwnlSSIEURNoMgkQRBKbASxdLocmdy0qfgZzQti7AKpWc9dagwJYkS9SKVSmDBhAjZu3AgAGDVqFBJsEg2iKXBdFwMDA9i4cSMmTJiAVCpV8bpIECOGh66Snbq3t77jMBGFQ+ydd4r/L1wYT0GsWocYRSbrOw6CaDBIECMIQomtQ2zKlOJskoUC0NlZjkkCVKpfY5jDl+LuRD3YruQOZaIY0ZxMmDDBey1UCglixPAwEgSVIEHM5htGdpIWp28jo4hMkkOs+D85xAgiUkgQIwhCia0g1tICTJ5cdIitXw9Mm1ZejhxiNYUcYkQ9SSQSmDZtGqZOnYoc7wwlmoaWlpaqnGEMEsSI4YEdqOIsiAVFJkeqk0oXmQwjclGHWPF/EsQIIlKEUn2H3l8EQZSwFcSAYo8YE8QmTy7+LZkE2IVSs5671JhqBLGCU0BPpgcTOyZGPSyiyUilUpGIIkTzQqX6xPAQR+eUTBQOsTgKYuQQqx6KTBJETeBFMHIZEAThEUYQYz1iGzaUl29pKYpi/LqISGHH70qO3V+894uYdtU0vNfzXtTDIgiCCAUJYsTwEEehSIaJPtV0iMXRSUUdYtVDDjGCqAkUmSQIQon8uXv++cBZZxV/5kUvQJxpkhfLWMl2nM7JGgh2zK7E3fvG+jeQKWSwpHNJ1MMiCIIIBQlixPAwEiKTUZTqx/Fx8iIOzTJZGeQQI4iaQIIYQRBK+M/dfL44i/f11xeL83UOMZ0gFqdzsgaiqshkqYjfRZOeVxIEERtIECOGhzg6p2RUoo9OTNIRRydVlA6xOD9/tYQcYgRRE4QOMZqpjCAIBi+I8YXZQ0Pl39OlKuSxY4v/DwyQQ2wYqbZDDABcem4IgqgzJIgRw0McnVMyUTjE4igcBXWIkUMsGBLECKImkEOMIAglskOMkc36HWKtrerbSBCrKexLjEq+zCCHGEEQcYEEMWJ4aARBzOaEKo6PkzrEqocikwRRE0gQIwhCic4hFkYQo1L9mkIOMYIgGgESxIjhYSQIKioXVKUOsTg9zqAOMZplMhhyiBFETeCdBSSIEQThQQ6x2FONIJZ3is8pOcQIgqg3JIgRwwM7QYnzSUkUDrE4RiaDHGIj9XENJySIEURNIIcYQRBKwjjE2tqK/2cyVKo/jDCXV1Wl+s16XkkQRGwgQYwYHuIYJZRhYkcUHWJxepy6DjHV4w1aR5we13BCkUmCqAlCqb5DgjNBECXIIRZ72PG7kmO3F5kkhxhBEHWGBDFieIijUMTjumrHVFhBLI5OOP5EstLIJDnEiv+TQ4wgIoUcYgRBKImiQ4wEsZpSVYcYOcQIgogJJIgRw0MchSKeoJkYgXDRwjgJf1FEJqlDrPg/CWIEESkkiBEEocTkEGO/p9PF/1WCWDpNpfo1JpJSfXKIEQRRZ0gQI4aHuEcmg0Qj+eeg9cTpcQaJfWEeF9CcJ5YUmSSImsBHbUgQIwjCg/8iyrZDjBxiwwpzeZFDjCCIkQwJYkTtcd34d1AFzcQo/11HHKOFUTrEbJdvNMghRhA1gRxiBEEoqaRDjEr1hxWvQ8ylDjGCIEYuJIgRtWckuIuicojF0QkXJPaRQywYEsQIoiYIpfoVXFQRBNGgVNohxs5XyCFWU3hnVyVfZuSdvG89BEEQ9YAEMaL26MSmOGEjiDVqh1iYWSaB5jyxpMgkQdQEcogRBKGESvVjDf8FRthjt+u6njOMHGIEQdSbugpiO++8MxKJhO/fN7/5zXoOi4ga/kQmTkIRj65nS/d3HXGMTAZ1iIUR+myXbzTIIUYQNYEEMYIglISJTOo6xKhUv2YI7l4n3LkRL6aRQ4wgiHqTrufGX3nlFRS4C8y33noLxxxzDD772c/WcVRE5PCCWEQffK7r4tN3fxrTxkzD9cdfX/0KdYJPoznEKo1MhhU
GGw0SxAiiJlTjMiAIooGpxCGm6xAj0SVyqvkygxfQyCFGEES9qasgNmXKFOH3Sy+9FLvtthuOPPJI5fKZTAaZTMb7vaenp6bjIyKiBpHJdX3r8MCiB5BAIhpBLMhFJf+sI+4dYir3GznEgqHIJEHUhGpcBgRBNDCVlOpTZHLYqEoQI4cYQRAxIjYdYtlsFnfccQe++tWvIsE+wCRmzZqF8ePHe/9mzJgxzKMkKqIGkUl+dppIPkyjKtWPY2SSOsSqhxxiBFETKDJJEIQSk0OMndekS9/rBwlicfqSskHgv8AghxhBECOZ2Ahi999/P7Zu3Yovf/nL2mUuuOACdHd3e/9Wr149fAMkKqcGkcnIYzZBMzHKf1fhuuX1xOnki2aZrB5yiBFETSBBjCAIJTqHmByLBPQdYuQQqxnVzBBMDjGCIOJEXSOTPDfffDOOO+44TJ8+XbtMW1sb2tiHHjFyqEFkkv92qeAWkEKquhVG4RCLq4sq6LHZjJU6xIr/k0OMICKFBDGCIJRU0iGWyxVvZ7dRqX7NqObYnXfK56XkECMIot7EQhBbuXIlnnjiCdx77731HgpRC2oQmYz8IspGEAs6oYrrbJpB/WjkEAuGBDGCqAnVxG4IgmhgTA4x9rssiAFAf3/5NnKI1YzISvXpuSEIos7EIjJ5yy23YOrUqfjEJz5R76GMaP73vv/F1x/8er2H4acWHWKc3TqSImabyGTQ2GvghIsE6hCrHopMEkRNqCZ2QxBEA8N/EcWfRw4MlH8mQaxuVFNdIkQmySFGEESdqbsg5jgObrnlFpx66qlIp2NhWBuRbB3aij/P/zNueu0m9GX76j0ckRq4i2LpEIuri0o3rjCRybiKfcOFTiQlCKIqKDJJEIQSnUOMCV6AWRBLp6lUv4ZUM0MwOcQIgogTdRfEnnjiCaxatQpf/epX6z2UEQ2fx986tLV+A1FRw1kmgYhcBUGxQvlnFXEVjaqNTDqOKAg148kL/5gpNkkQkUGCGEEQSnQdYipBLJUq/uNvJ4dYTakqMkkOMYIgYkTdLVn/7//9P/p2IAJ4gWjr0FbsMG6HOo5Gohal+lHPMhnkopJ/VhHXDrFqS/VlAagZ36/8Y87nyyfhBEFUBQliBEEosXGI8cmS1lZgcBDoK6UkqFS/plTT/0gOMYIg4kTdHWJENPACUfdQdx1HooAXimoQmYy8Q4wXjcJ0Z8U1Mql7DOzvYWbPlNfRLJBDjCBqQuRfbhAE0RgEOcQSibIrDCjHJlnHGDnEago5xAiCaBRIEGsQZIdYrBgJkUmbDrGRGpmstlRfLpKP02MbLkgQI4iaEPmXGwRBNAY6hxgvePEwQYwik8NCNROikEOMIIg4QYJYg8B/GMVOEKuBUKT7Zqon04M/vPIHrO9bH26FUZfqx0k0CuoQo8hkMHJkkiCISKDIJEEQSoIcYrIg1tbmv51K9WtGNcduvveYHGIEQdQbEsQahBHjEItITOEFQP6x3zrvVnzzkW9i1nOzQq5QEyustEMsTqJR1A6xOD224YIcYgRRE0gQIwhCCfvcdRwgmy3/XSeIkUNsWKkm7i5EJum5IQiizpAg1iAIHWKZGHeI1SAyyX8Qb+jbAABY27c23AqbxSGmEvuoQywYEsQIoiaQIEYQhBL+c3doqPxzkCDGl+qTIFYzwsTdc4Wc8LsQmSSHGEEQdYYEsQYh1g6xGkcmeTGwP1c8UQo9sUBQrFD+WUVcBbFqZ5mkDjGKTBJEjaimh4YgiCZhcLD8cxiHGM0yWTNsv8x4evnTGHfpONzw6g3e38ghRhBEnCBBrEGIdYfYMEYm+7LFbwZDu+R0M0RW6hCL0wd8kNhHDrFgyCFGEDVB5/YlCKLJ0TnEdKX6rEOMfUaTQ6ym2B67566ai6H8EOaunqu8LznECIKoNySINQixdojVIDKp+2YqEoeY64rdFd6GQnSIxclFFST2Ual+MOQQI4iaQJFJgiCU6AQxRjot/s4cYgwq1a8ptsfunkwPADE2SQ4xgiDiBAliDUKsO8RqECXkBUAhMpktCWLVOMSAygSxODrEXDc4Jkml+sGQQ4wgagIJYgRBKAkSxHSRSf52cojVDNu4OxPE+JklySFGEEScIEGsQRgxDrEaRCYjcYjZCGIjsVRfFm/YuHQxyjDraCZIECOImkCCGEEQSvjPXb5DjEGCWF2xdohlSw4xp3wtwItj5BAjCKLekCDWIMS6Q6zWpfqKDrHB/CCyhazvflp0os9IL9WPWuizWb4RocgkQdSEMDOVEQTRRIR1iLEOMUY6TaX6NUT3xbSM0iHmkkOMIIj4QIJYg8BfSIR2R9WaGnRr6co8WWQSCLkfdDMphhGOauCEq5oohD7qECOHGEHUCNuLKoIgmgvXddDVXvqFHGKxo6oOMYc6xAiCiA8kiDUIsXaI1UAo0nUXsMgkELJHTCf6hIkWjgSHWLXON5vlGxESxAiiJlBkkiAIFd/6ryym/AB4cyqq7xBrxvOWGlNVhxh9EUIQRIwgQaxB4D9QMoUMhvKKk4d6UYtSfe7DlP+mqa4OsWEQxFzXxV/m/wVvbXzL7g42DjGaZTIYikwSRE0gQYwgCBVvTHFQSAILpoAcYjFEl9SQ6c30AhA7xKhUnyCIOEGCWIMgd6/EyiWWy2H5BOCmg4GMmwtc3AZtZDIqh1i1TqoanXz96rlf4Uv3fQmf/b/P2t0h6tkz+XU0E+QQI4iaQIIYQRAqCqUrlEICaodYOi3+LneIkSBWU8JGJrUdYvTcEA2G4zr43uzv4a9v/bXeQyEsIUGsQZDtyrHqEcvlcO7/A77+KeDh0WsjWaXKqu24DgZyA97fI3eIBQlHNehKE4bkOvjx0z8GACzcvNDuTuQQiwYSxAiiJtjGbgiCaC4KJS2rkETlDjEq1a8ZNhOiuK4b3CFGDjGiwXht3Wv49Yu/xo+e/FG9h0JYQoJYgxBrh1g+j6WTij92JUPM/GhA1T/Ai2FARB1iMYpMPrT4Ie/nHcbtYHcn6hCLBopMEkRNsI3dEATRXBQSxc9dR+cQo8hkXbFxiGUKGS8qyTvE+J/JIUY0Gp0DnQAQr/oiwggJYg2C/M16rASxXA5rxxZ/LCCaCx7VN1N8fxgQch9ELRxF/AHvui5mzZ3l/T6ubZzdHW2EPpplMhhyiBFETaDIJEEQKjyHWBSCWDN+kVdjbIrxmTsMkDrEXL1DLFfIkUhGjGiYIYMXfol4Q4JYgxBnh1gmN4TNo4s/RxWJUbkK+P4woLFK9d/reQ8vvvciNxTLk4WoH5fN8o0ICWIEURNIECMIQoXXIWYbmaQOsWHF5tjNC2JCh5ij7hDbMrgF21+9Pf73/v+NcqgEMaywa3ASxEYOJIg1CL4OsTBxwRqzzi1/IBYi6goQZpl01Q6xyCOTYTrEIj75kuOg1p0Lug4x/u/kEAuGIpMEURNsemgIgmg+BIeY6ryDIpN1xab/UXCIFYIdYvM3zMemgU14ZsUzUQ6VIIYVZsggQWzkQIJYgxBnh9hatyxMscik4zrCh2NYVBdRfdk+YZlGco
jJJxuROsSKKwy/jmaCHGIEURPIIUYQhAqhVF8FlerXFZv+x7AOsa7BLgBAthBN3/Bw4rgOHl3yKDYPbK73UIg6wwwZfEyYiDckiDUIce4QW4uyUMU+NI/58zHY47d7YDCnsMFbYBWZrMYhVm2HWNSCmCR4WjvEdHHHMIIYOcRIECOIGmHTQ0MQRPPhRSYTmgXSafF3cogNK2EjkzYdYuzaZSQKYo8tfQwfv/Pj+O7s79Z7KESdsY1MrulZQ27ImECCWIMQZ4fYmkSv9zOLTM5dNRcru1di6ZalFa1TZdWuKjJpIxyFcYhFfPJVsUPMRuhT/c5DHWIUmSSIGkEOMYIgVIR2iJk6xJrR2V5jatEh1jU0ch1iK7auAACs611X34HUmWVdy/C3t//W1J/nfKm+6Xrti/d+EUfddhTe2fTOMI2M0EGCWIMQ5w6xtcmyUMUik+zDcEP/horWqXIVVFWqH0XXFt8hFleHWCWRSd2+aSbIIUYQNYEEMYIgfLhusEMsKDKZTpNDrIbY9D+G7RCLMjI5mBvEXW/ehS2DW6pelw2stqXZY3JnPHQG/ufv/4N/rf5X1eta37ceG/s3RjCq4YU3pZjOa5Z3LQcArO1dW+shEQGQINYgxNkhtjZVLoQvlA4M7MNwfd/6itbJP172M3OITeqYBCDkPrARjuoZmYzKIaaLTJJDzAw5xAiiJtgUMxME0WS4bjQdYiSI1QybuLvOIcb/rHKI5Zyc/XmuhptfvxlfvPeLmPXcrKrWY0tvppiGafYideaU29S/qar1rNy6EtOumoaDbjjIuE9ffO9FfO2Br1W9PQD4/cu/x9mPnm187f1j4T8w7appeGr5U9pleEOGaey9WXrNxAUSxBqEOHeIrUmXe8IKcIUPzg19lTnEVBdR7NuZ6WOnA2iwyGStHWImQYw6xMghRhA1ghxiBEH44BxiTiUOsWSy/K+0PiJaquoQc8wdYvLyMhc8cQFm/mGmMQmyuHMxAGDTQPVCiQ1M3KhmwrB60ZvpxR3z7wiXrNHAHHmm5++9nvfwiTs/gSeWPaFd5qyHzwJQdE8N5Aa0y13+/OX44+t/xD8W/aPCEZf54RM/xG9f/i2WdS3TLnP/ovuxvm89nlz2pHYZ/nWsE7tc1/VE1CheM9lCloS1KiBBrEFgHy5jW8cCiJcgtlYQxBzhg7AWkUlPEIsiMtloDrEoZplsxhNLEsQIoiaQIEYQhA/eIWYriPEdYuw2cojVDBt3r7ZDjI9MKhxigDk2edNrN+GdTe/g9fWva5dZ11fs8hquCKMnbgRs78FFD+LON+80LvP6utdx+gOnD1uU7g+v/AGn3HcKfvPib6paj+M6XuzVJPLc/dbdeGTJI7jx3zcqb39o8UN4dOmj3u+mdTHB0ySa5Qo5/Oq5X+HVta9qlxnIDXjXkUP5Ie1yzMhhep55Q4ZOpBrKD3nvg2pfo5l8Bnv+dk8c9qfDqlpPM0OCWIPA3lTbjNoGQEgxqMasbc14PxfgCB+ElUYmVd0FLDLJBLFMIYNMPuO/swobJ1XQCdVI6BCrJDJJHWIUmSSIGkGzTBIE4YPvEOOvVDo6yj+bHGIkiNUc1WzvMrwg5riOt5zOIcbEFEAvgmwe2IzOwU4A5qgZK7cPct/MWz+v4gm+eGzibwWngP/5+//glPtOMV6nXf3i1bj59Zvxt7f/VvW4bGAxx6C+rt+8+Bv8Y6HeidU91O09n6b9sKp7FQC98HThUxcKv5sEIxsB7vFlj+PCpy7EDx7/gXaZzQObvZ9NY2dGDtP2bCKTOrG4Et7tehcru1fi5TUvVx01blZIEGsQ2IfL+LbxAMrxwXrTm+lFb5rr+4Ib6BDrzfTidy//zvjNiOqDmCn708ZM826zjk1G7RCL+yyTYWKQ5BAjhxhB1AhyiBEE4UPnEBs1qvxzGEGsGb/IqzFhI5NA+cK/GofYos2LvJ9NooSNQ2zr0FZ86OYP4ahbj9IuY4tNZLIn04PB/CAc1zFep7HrH5PrCSg6g6IQQLYMBcccF3cuxndnfxdnPnymfj3cBAamda3uWW1c5r2e94TfTW5Bm4jmmp41AMzXhJ0Dnd7PpnUxI4duGcd1rMQu9noBqo9Mssdn2h5hhgSxBoF9uIxrGwfAP+NivZBFLdkhpuoQu2XeLfj2o982FmHy62A/M4fY2Nax3n6wdspF3SEWV4cYdYhVBgliBFETbGYqIwiiydA5xEaPLv+cTov3IYfYsFKJIMYu/G06xHQiyMLNC72fTf1MzCFmEghWbl2JofwQ1vSuMQpLeSfvE2lkbCKTvCBjWo45tUxCyeru1ZhyxRSc/sDpxnHZwEQl075a3V0UsdjjNK0HCBg7E8Q0y8jPhWld3kQMhmWYo9C0jI1DzHGdwOemN9MrvKa1ghi3H6uNTPLX2qbncEnnEvz+5d/bp6eaCBLEGgT24cKEoLyTj2Ta4mrxC2KiQ0wVmWQfOqZ+MdUHcV+u+G3L6NbRnlPOukut2TvEaJZJMxSZJIiaQA4xgiB86BxivCBm0yFGpfo1w+bLDN4FA1g4xAYtHGKdnENMIyQwJxZgFkH4iKBpluNvPPwNzPj1DLy85mXtMjaRSWHSAItxmdY1Z8Uc9GZ78cJ7L2iXAYB3Nr2Dhxc/bFzGxmUV5Izi1xO0HBPXdM+x/KW/bl1D+SEvdmnaHhO7TMsw0QzQPzddg13ec6Jbl3zdORyRyTW9ZYeY6TF+Z/Z38K1HvyX0sxFFSBBrEGSHGFB2TNUT/k0KlAQx7kNn08Am3wcp+0A0udz4+8gdYmNax2B8e1EQs45M2jipgkQuvkOsRrNMJlA8M4y8Q8w0XuoQI4cYQdQIEsQIgvBh4xCjDrG6YtP/6HOIlS7WeQGAnc8O5gaRKZSdKzYOMZ1wweKS/DZV8IKYSZR4ac1LAMS4pgyLQNp2S+nGVXAKVgIO2w9BYsqJd5+I4+86Hsu7lmuX8QQxw9g9QczCiWUaVyafKfdwaR6frUPMpnOOH5fJKMI7xHTj4o0aumXk687hjkya1vX2xrcBiPuNKEKCWIPABJP2dDvSyaKNPA6xSWVkUur/4g+gQNn+yj5c/vT6n7Dfdft5pY/sft46WWSy9HhHt5QdYg0TmSw9RvbckkNsmCFBjCBqgk0xM0EQTUbUHWLNeN5SY6rqEHP8DjHZWWPjENOJDSwuCdg7xEzLseSKSaCyiUzaOMQ6Bzu9/Wka08LOhYHbG8gNYHHnYgDl2RhVhHGIuVLSR7UeQD92wc2ki0xKX/rrXgt855zR/TUQHJnkO8R0rys+1aRbl3zdGUVkck2POdJr4xDLFrKB3W3NDAliDQITTFKJFEa3FL9Bi4NDjP9QAvwOMcAfm5QFsdMeOA1vbXwL5z9xfnk9im+m2OMd3To6vEMs7qX6jiSIUYfY8EKRSYKoCaovNwiCaG5cx4ETpUOsGZ3tNaaqDjE+Mlk6n+XFDUAtgmQLWby75d3y+jQX9rxDzOSgEgQxzboGc4NWDiqbyKRNh5jNmABgwaYFgWPiZ8/UiUq5Qs57nowOsf7ytZpuX
TaRSRaXNK3H5xDTrMu2s8zGcSc4xDTr4nuvq41M8g4x02vmnnfuwQ6/3gGXPX+ZdhnefKIb+4qtK6yE1maFBLE4kckAv/hF8V/Ii24mmKSSKYxuLQliMXCIsWx3kk1uKDnEAH+xvheZlAQ9oUhfEZlkAlokDjGVcDQSHWJRRCbJIUYOMYKoERSZJAhCxuGdo7YOMVWHGDnEaoZwHq74MiPv5H2zJJocYnKMSyWWLOtaJmzLyiFWZWTSxn2Td/LlLiuD2GDjELNxreUKOU/sMj2+JZ1LAtcljMnCIWZazkagYi4l03rYuUAqkTKuS4hMWvSDmSKTQoeYRWRS93qxjUzyYrHpNfPsymcBFLvgdNi8RnkhOQ4d43GDBLE4kckAP/1p8V8unHobV4cY++YnzfSlChxijDGtY7yfVRdRXmSydbS3D4KmLPbQuaD4v4fpEKvRLJOpZCrkHSOITFKHGAliBFEjSBAjCEKmwF1EWpfqU2RyWAk6dvORsI50B4DyxXqlDjG+Pwyw7BAzRSYHgsUnfnZJ08yC3jK2kUmd6GLhQlq+dbl3m+nxsbgkYOnqsugQMy3Hr0snBPEOsaDIZGuq+J7W7QchMmnjEKtylkmbfWDtELN8zbzbVRSydM9f3skL49Jtj60naHvNCglicSLJPR0hP7xVDjFZUKoH7JsfJogVEv7suTybpK5Un4lcgOQWYx1iXKl+IlFl+Xy1DrGoI5OyQyzqyCQ5xMxQZJIgagIJYgRByAiCGH+lwjvE0mnxTrwgxm6jWSZrRtCxmzlg2tPtGNVSfN6q7RCTS+21DrEKSvV1ywmF5ZplbONvQqm+jUNMsz1eGDRtb8kWziFm4+qK0iGmi0z22Ecm29JtxuVsHGIFp+AtV+0sk1al+rYdYpavGSZkmSK2/PtPN3Y+PkuRST/p4EWIYSPBfQ0W0oWjdIjFIDIpO8QKcH1vfP7bkIJT8Oymfdk+IRqoc4h5s0xypfrebIxRls+H6RCrkUMsslL9MK4v6hAjhxhB1AibmcoIgmguCgWFQyyZBNrbywuRQ6yuqM7DeZggNq5tHJKJojBp7BCziEyyInmGtkOsN3yHmG65sA4xx3XguI73mHlsHGI2kUnWH2ZaDyAKYtU4xHKFnFXHVlSRSfaaaEu1GddlU6rfNdTlra/qWSZ5956uVN92lslM8CyTjut4s4NqJyngBFvAEJkkh5gRcojFCV4Qq8IhxoSjWEQmZYeYKjLJFTXyB5K8kxemYOYFMXlmsoJT8PL7o1tHV+8Qi/ssk1E+LtXvpnU0e2SSHGIEERlUqk8QhIzSIZZOq0UvBpXqDytBX2bwghg7d2XiAC8SeB1iFpFJ5lJiX3pbOcSqnGWSF8R0goqcyLERS6pxIfHCoOnx8R1i1RTh8/vJtC6bHi6ryKRrGZnkHWK6WTu52SOZWBm0nFVksspS/Z5sT+Aya3rWeNfBWgdjrySIafaDMBkFOcR8kCAWJ6qJTPIOsRiV6rMDjzEyySnu8jdEm/rL0wQLgpgUmeQfa90cYnyHWK1nmYzycRVXaL+OZvymlRxiBFETKDJJEIRMXuUQa2kxC2KpVPEffxs5xGqGbWRyXNs4tCSLz4exQ8zCIca24wklug4xi1L9/my/0DNsIzjYRCYBvcAR1iGmWw8fmSy4BeU1QU+mRxTXqnB1yV3PNpFJbYeYTWRS7hDTjX0oWMzjRTrduobyQ8J1pJVYGaVDzMbVpdkeP8Okbl2O62BZ1zLvdyrV90OCWJyoIjLpzciRjGepfgs/y6TkAuAPMPI3RPxBmJVyAv4PYvZYE0igPd1emw6xekYma90hFsYh1ownliSIEURNIEGMIAiZihxiQPl2EsRqTtCxm5/5XXaIKTvEMluF+1sJYoqL/8HcoJUTS3Y9RVWqrxsXYOcQC3Ktua4rRCYBtejCu8OA6hxiPkFMMS7HdQLFtYHcgNX2ZIeYVYeYZn/yUUjdunh3GKDen47rWPW7WZfqZ4Mjk4Kry6LjTreutb1rhcQVRSb9kCAWJ6KITMatQ6z0OFKeQ8zfNcAfaOVviOSDMEOY7tkpO8RYoX5oh1jUpfpx7RCrJDJJDjGKTBJEjSBBjCAIGeUsk0EOMcAviFGpfs0QzsMVcXcmPLSmWtGSKjnEquwQk8vWVWIDH5fULQP4BTGrDjFLh5jNDIRWHWKKZTb2b/S5kFTL8f1hQHUdYjYOsd5Mr1jurliGj0uatsfWw55nq1kmdQ4xSexSLSeLZqpluga7hNeI1iE2FPzcAKKIajUzpK5DTIpMqtbFC2umdTUzJIjFiSgik9wskyaHmOM6vm80aoHLnE0KhxizUG8e2KztEOA/2HhXlAO1Q4w9duYQs77AshGOwjjEGnmWyWbs4iCHGEHUBPnLDYIgCOG4wE6NbQSxtjbxNnKI1YygLzPYhXlLqsXKIcbO/9kX2kpBjKVOkqLAxsPikmw9tr1YquVyhZzYG2XpENMJHDazTAbF8pj4xtxTuuVkh5g25mgRO5QFMdVzwwtrgHofMHFxyqgpAIrXNqrXjm1kshKHmGo5m2VsY6O2kUkWKTaty6YI3yYyyc8waVpXM0OCWJyoZpbJkA6xL9//ZUy9cipWbl0ZfpwhcB19hxjrBHNcx3tzyt8Q8T0AvCtK/maKn2ESKH8QRiocBZ1Q8R1icXWIVRKZJIcYCWIEUSPIIUYQhAzvEHPYqXE6XRa82O8yushkM36RV2NUx+6CUxY42Hl9S7LFqkOMuae2GbUNALNDjAklKrGBCRfTxk4rbrOKyOT6vvXCdYS1Q0yxnOu6gQ4xm14zeR/olqupQ0yxnCyIKeOs+UEAwKSOScZ1eU7AVAQOMalDTBmZlJZRva54oRIIdgGObR2rXRcgRSZ1gphFEb5NqT4vrJm218yQIBYnqolMqhxiBkHs1bWvYig/hHc2vRN+nCFwC7JDrDzWUS2jvOUy+WK22dQhppuNrOAUyl0FkkNsWEv1R4JDTBeZDOMQa3ZBjCKTBBEZJIgRBCETWWSSHGI1Qz52F5wCDrj+ABz+p8Phuq53YW7tECt9Ib7t6G0BBDjEUqLAxsNmnB/fNl67DGAXmfSJDRbxN0AtSgzmB4X7q5axEenYPkglUsbZNld2Fw0PzChQVYdYf7A7yieIGYSuIDFPdojpxFEbMc8mDmmzDJsAblzbOO0yQNkFOHnUZAB2pfpWkckAhxgTGU2Pb2L7xOIyFJn0QYJYnIhCEEvYleqzcj2+ZK8WOKU3ucoh1tFSLsln4/A5xHSRSU2pPnvsyUTSdx8jNsJRI3SIkUOsMsghRhA1gQQxgiBkCgVFZJJK9WOF/MV052An3t70Nl547wXknbzoELPpECt9IT519FQAlTvEZDEl7+SV58w2kUm+P0w3JsBulkmbbimfC8ngEEskEt41gUrgYH8b2zZWuwwQnUPMN5ujhdAFmGc89DrENAIjf99qZpn09Ywpltk0sAkAMH3sdADq5zhbyHouOJMgli1kxZJ7jeOOdxTq9hO7/vUEMc2EB0B5f9Isk35IEIsTUUQmLR1i
7BsU5syqFSwyWZ5l0vU+CFtTrUglUsI4TA4xU2SSHYCY6yx0qb5OOOL/3kizTOoeb5h1NBMkiBFETSBBjCAImYodYnKHGJXq1wyVQ4yRc3JGhxgvEjA3GUt6bDtmW28dum2auqVULiRV6f/GgWA3liyI6QQjNnZvOcXY5dkHq3WIJRNJo1OOLcdihzYOMZ1TKaoOMfbcsHED/sfIX7eZZhOVDRS650YWu1Rjlx1iqrGz9TPjhbLfjXN9TeyYqF2XzayktkX4vr41g4gaFEFtZkgQixsVfpsV2iGWHx6HmMucTQqHWCqR8qnVRkHM4BDjyzsBLjIZZYdYkBDEd4hFHZms1CEWRak+OcQoMhljnl7+NLa/ens8tPiheg+FqADBZaC4aCEIovlQluqTQyxW6M7DgeLFO7voTifT/g4xR3SI8QXjrHDdFJn03F+u3iFmEl0Ay8hkTzEy6YkgVcwyaTMzJBuTqX+K7fcEEsbJBeQZOXUCo1BMH1Cqz54bU2SS9UObnErJRNIbu/w889dtJuFTvl7Ujd0mDslcZMykYeNuMwm2QFl8UgpiFo7CVd2rAAAd6Q7t9gBOZCztTxvXJEUm/ZAgFjcqFcRCOsS8yGStHWIFsVTfSbhC3xk7YMiRSXYA0DrEJKs2OwCwg1lohxgTOeRvE2MSmWQH2dAOMZvZM1W/81CHGDnEYswTy57A2t61eGzpY/UeClEB5BAjCEIm8g6xZnS21xifQ8wVHWLeF9VJRYcYH5l0XeFi39R5JV/8m0SXoJ4qm8gkE3mCCvptZpkM4xBjsbygyCQT/UzCmckh1j3ULU4aoBhTtpD1HHDeuAyl+qwDziQqJZDQCkv868rUIWbtELOITDLRzHMnWjgPTe49AMbnxqZzju2XoMirjdhlI+Y1OySIxY0K7d2xdYi5oiBWgDgjJntzypHJHcbt4Bsff6CRI5OegyqbB77xDSQ6O333McJEDnYSVW2pfq0ik/3FaGhooY/NxkQOscogQSy2qEp6TfRl++zfP0TNIUGMIAiZqh1i7JyHHGI1Qz4P9znEWGRS1SEmOcQ8kYcTSmwcYjbxMEAtSjDxyXM9qcQ1iKJSNbNM2nSIeRMCtI/XjokXlWTnnWo5U2+UHHMsuAXf+RH/XLHuZ5N4aCMq8WKeKTJp2u9se0ZXl+t6kUmT0MpEs+3GbAcgQGAMcNwBxedGFoF5bGclBYJFLJvlfG5BzYyqN7x6A/761l+V22l0SBCLGxV+mxXGIea4jveGqX2HmCIyyTvE0mqH2IzxM/zr4g6SOqt2+r21wHXXITFvnu8+RphwZBLEwjjEoo5M9hcPnulXXiuuPkqhT/U7D3WIUWQyxqg6SXSs6l6FqVdMxWkPnFbrYRGWkCBGEIRMZB1iJIjVDGNk0ilHJpWzTEoOMU/kSQQIYiFK9dk2gQBHU0nAMXVemYQEwM7xY+MQCyNuBJXq+3qjDPuARTRV2+SvN0zrGsgNACjP7ml6bngxzxSZNG2PGSimjNYLmt2Zbu+1xsQu1T5l18JsP9hEJlWTNaieG9V+4CPCumVsY46y8GnjEFO9t5ZuWYozHz4TP3n6J8rtNDokiMWNYegQ498Ite8QkxxicocYi0xKDrEZ4xSCGO8Q00UmC6WDUS7vu48RWThSRQvDdIhF7RAbKD6X6aHifqpY6KskMkkOMXKIxRh2LFB1ici8s+kdDOYH8eraV2s9LMISEsQIgpCxcoil0/AhRyapVL9myMdro0MsoEOM75YyXbSHKdVPJMwOKm/Ge0NHk42QAFjOMmnRIRYm/pZAwqpU3xu7o3eIMVFQtU2hF8vgNvMJMwFink70C1uqz2YlVXaDce4wU/QwjPOQj+LKzzP/3BgdYhYCqhx5NY0paOxh1sUqh5oNEsTiRqWRyRAOMd4VVmuHmONziEHrEHNcx7MUs8gkj5VDrPRGTjiu7z5GmHDETqaqdYhFLYix/VgS/CJ3iJkeG3WIkSAWY8JEJsO4yUYsCxYATz0FZEfGtNr8sdw29koQRGPDO8ScKDrEmvG8pcbIk6DwIkloh1iFkckgF5LRQWUjPlnOzscEDpMoEZVDTFVMb1Oqb4pMst4v1TZtRRebfjflDJlyZFJRqq/sECsZKLzOMsX2WPfZ2LaxWkcaP3ab15Wpm05wiCWCI5NMfLKKTAZ0loWZeTXouWlGmvNRx5lKI5OlD5dkIinkpFVvRN4VNlyzTLaUPvtMDjG+3DHQISZ3iLGOLZcJYo7vPkaq7RBz3dpGJpngl6+B0Kf6Xdi4dJFKkcn6jYPwEUbkYseNhp3NMJMBDjsM+M//BKZOBa68Urx9/Xrgu98FPvxh4MADgRUr6jFKAf5YTg4xgiAAKQXABDHeIZZKlc+XeUaPFv+nUv2aIR+veaEhW8h6F93pZNonfvCf1y7CRyZNzihBMLJYzuhosoytMYFjUsck7XJMEGOCg5WTx7KHy6ZUX7Uuedyq5axFlxCzMBojk3yHmOG5YWLXxI6JgdsLei2EEUcFQUyzr5KJpJVDbEL7BO0y8phcuL4vDfl9FWqWSdNzozquNgEkiMWNameZTJQdYoA6NjmcDjF/ZJIriE+mhW8umNrfke4QDs4MwUngihdRXmRSFsSi7BAznVCFEZgqoCA57UILfdUIYuQQI4dYjGHvDRtBrOEdYn19QFfxOIrubuDSS8Xbr7kG+M1vgBdeAN54A3j88WEfogxFJgmCkFFGJnmHmModBgDf+Q5w1lnA5z5X/J0cYjXDJIjlCtIsk5JbRohMRu0QU0QmTZ/5pmV8EU2FkJAtZL2xTmyfqF0Xi0yaRDOr6J7KAVdhqT7bV+lkWltOb1tyb+WysolMhnQ9GcVD1ayWFjOTGl8LSb1DzLZUn3WIGV8LCgHO1LcWJu5pFFopMknEgmpnmUwWXVfsGwhVbHJ4HWIlQawkVBUSblm8SnIOsULGy4NP7JiIMa1j/OvSRCb52W3KDrEQ0ULHKe9vU9eW6TnJKb41ivAEzBMRmSBWbYdYmF4w6hAjh1iMYd1hNq6vhhfE5ONQZ6f4t26xwyQOrgkSxAiCkFGW6qfTRedrSwuwg79WA0DR+fqHPwDbliJgJIjVDPl4zV9ky5FJ2Z0jf6nNO2usHGIWsTyhY8swk6GV6GIoLGdOJcDsVmIOMW9WywhEJavIpEHE4l1B1UYYrdxtFs8N/7oK24tlKrm3iUzauMhSyVTZ5ad5XQWV6svOvEojmqp+N9N+t3FDkkOMiAfVzjKZSCGRSBiL9YfVIeaV/RdfaoWEOFb2Rs/kM55DbGK7RhDTRSadQtlBVYlDjBd8TE6qMC6q4saDt22JF5ms1CEW5SyTzXhiSQ6x2BJG5GoaQaylpRgpAoCNG/23M2LwXtY5fwmCaF60DrFJk4CXXwaeeMJuRVSqXzPk4zX/BXuuwAliSUWHmKZUPzAyGdIhpnNQWTtrLBxbLP7Wnm4vF/SrZjwsdSRPHjU5cHtMmHFcxyc8Vlyqb9iffITRKLpYiF3GMVkIVCpHms1rAfC/JkPvq6Sdg0rnKgxbqm8T92Q
ilmpcgrCbDBYPbcRmcogR8aDaWSaTxYsgFpvkv7lgDK9DrGTHBRPEXG2pPvv2ZGLHRCH26a3LolTfi0wWQnSI8YKPSTgKUzwv37dKfJHJKKOgxRUaNk4dYiSIxZdQHWKlY0/Dlrczwau1teikAIANG8q3y8epGLyXySFGEISMIIjxDjGg6ALbaSe7FZFDrGYYI5MON8tkqsXnZKq0VN8mwqgUeSxcSCZxzeQiY26fsa1jjZ1enkNs9BTtupSxPM3Y+VJ909jb0+0AAkQQk0MsZKm+TTE97wQ0iZVG15NFp5dq9lKbdRknazAJrRU6xExjSifT2vL9WsRLqVSfiAfVzjJZyoAzh5UyMpkfPkHMm2WyNC7ZIcaX6jM32+iW0cEOMb5w1RSZDOsQM0UL6+oQq7JDzBQFVf3Owx4bc5w044klRSZjC3WIcfAOMRYZ4gWxGDrE5OgMQRCEEJnkHWJhoVL9mmGMTIZ0iIUt1bcRsfgidZ+TxzYyKTmHVGNibp8xrWOMnV7semxC2wTtMnJEU7Wc0gFnENesI4wah1jFpfqGMYWOs1o44JRjt41M2nS3KV5XFZfqlwQxU+dcxdHfCp2OVKpPxIsqZ5n0HGKmyGRh+COTLbwgpnGIDeYHAQAdLR2BHWLyzGTlWSaLfws1yyQvcJgik2E7xKJ0iEXVIVbNLJPs29kYXEQPO+QQiy2qE+ygZUkQKxGD9zI5xAiCkNHOMhmWODnEMhng+9+3j3vGHPkz1+gQM3SIVVOqbxRdEiFFngpnomTjbEu3GTu92LqYY8tGVFItF2VkUjkLo0F0Me0HK/eeTWTSUoCzihRa7ivf2IPEvIB4aVBkkj1mdq1uu69sXseVOh0pMknEiwhmmQTKkcl6O8S8yKQniLlKh1i2kMVQfghAcZZJdpAQ1sW98YXeGafgj0xW2iFW6SyTwx2ZtHWIsXEFzTJpEweVXWbNBAlisYU6xDhUgtj69f7bGTFwTZAgRhCEjLZDLCxxEsSefRa4+mrgpz+t90giwXaWyXQy7RMH+M9g3iHGCzNWpfpBkckQzhqTkBC2KN40S6Gx2Fx6fKrlQpfqW8wyaYoBquKlNu4961kfDbNa2mwvlUjpI4WqWS0jiEwGva6CIpM2+8pGrLSdgMAr1TfNEkql+kSsiGCWSSB+DjE25XIh6Z8Rk41jMFdyiKU7AjvE5JiNLzJZaYeY7IKqpkMs0lkmxVJ9+ztGEJnUFfM3ExSZjC3sWECCGEa8Q6xhu90IggiFcI7Hrs8qEcTiVKo/VPzSF4OD9R1HRAR2iHGRSV+HmCM6xGJRqm8TAwxw8pgik6EEI0Mc0laACyWCGPrW+H1lM7sne3wFt2Ce9dFiVktbAccmwmgTmbSaEMAiXhrkELN6HdvEWS2jv1SqH0zdBbE1a9bgS1/6ErbZZht0dHRgv/32w6uvvlrvYdWPCGaZBOLkECs+jnRJqHOgmWWSi0y2p9uRTqY9W7G3Lp1DTOgQK/6tIodYKlU+earUIdZatjnX0iEGWD62KCOT5BArQg6xWOFFJi1mKGTvo4adzTBIEKNSfYIgRgDGUv0wxMkhpuqmHcH4OsS4i/VcQYxMeuKA6/+8dlFFqb5lL5YsOFg7a0KIWLzoUqm4pirM14kgfE+VTR+Zdam+zmVlcHXxy/FxT9MsjDqBSvncWM76aIowGl2FIcXKMA4xq+0FRSYtxEObdbHXgmn20mYt1a/g0yU6urq6cNhhh+Hoo4/Go48+iilTpmDJkiWYOHFiPYdVX6KaZTI2DjEpMik7xNJlhxh7c3a0FKctHt0y2otR8usCpJMlp8AJRlU4xNLpygUxfna3bDZ4+ZAUSvtGEMTgBiv5OncXRSbDQYJYbKk0Mum6buNZw20dYolE8TUdg/ey3AdJEATRkJFJdt7VIC5z+Ysl/nrC5xBLmR1itS7Vj6qMvOAW4LiOIBqoRJcoepxaUi0YzA9qY3JBkUm5VN/kjDLNWKkUgkKU3LP78GMyzjJp+dz4IoU5s0AVxkEV+LoKcIgFlepXKuz6+tbCTg4hddPx/WvNXqpfV0Hssssuw4wZM3DLLbd4f9tll13qOKIYEFWHGBPEYuMQK76ZtbNMFjLeQaMjXRTExrSOQedgp7cu2RXG/73cIVb8WyiHGC+Iyfs/bGSSd4jVcJbJ4updBDpbdR1isqhDkUkzFJmMLUGzTOYKObhw0ZpqFZZxXMc7XjYM/LFsu+2KP6sEsba2YnwnBheJ5BAjCEIm8lL9OJy3NLhDjL9YzxayaoeYwtEdxiFmE20LW6pvErF0xe3877auIKvSeV58ChKoQpbqVzvrY1DsUH583ro4HTt0ZNLkblPsK1OE0dhNF8K9Z9pXtqX68vZcuD6h1cq9F9IhJouVbVC8jikyOfw88MADOPTQQ/HZz34WU6dOxUEHHYSbbrpJu3wmk0FPT4/wr+GIqkOMRSbr4RDjxu6wWSaTig6xhOgQ4yOTAHwzTRojk17HFivVd8v3CdqXUUYm+W8wazjLJAC4NhePth1iNmIfzTJZpEFOZhsFk0PMdV0ccuMhmPmHmcg7eWGZhuwRs3WIyQJ5HREEMacQizERBFFfGtoh1iDnEIGRSVWHGJtlUuMQMzmH2LKAffeStofL1lkjleqrxmVT4s8vZyzodxVjH6ZSfavYoenxRRSZ5J8bm8dnI+aZHp9q7IGl+gFxVttSfd49pxPXTFFcVbzUpk9OtS4q1a8jy5Ytw3XXXYc99tgDs2fPxllnnYWzzz4bt912m3L5WbNmYfz48d6/GTNmDPOIh4GoOsTq5RBbvhyYPh2YNQsAF5lkglgCKDA3F1eqn3W4WSZZZFIq1tdFJh3XKTuomEOMRSbffBOYMQNYulQ/ZtvIZFiHWK07xHbaCbj+evMdqUMsEvJw8Or00jfVDXIy2yh43zgrCtkzhQze3Pgmlm5Ziu6hbtF10Ig9YipBrLOzfByQj1MxeC8LX25s3AB89rN1HA1BEHGATSQENFCHWKNFJqXPXF+pfumCWzXLpOwQC1uqbyWUcM6aijvENFEz1fZMsUN+uVAxQNX2LDrS+OWsS/WDtpcI6BArLRd61kcbgdHSTacT8wSh1SBWVi3AVViqrxp7pTNymsYuCHCGbrpmpK6P2nEcHHzwwfjVr36Fgw46CF//+tfxta99DddrLvIvuOACdHd3e/9Wr149zCMeBqLqEDOV6tfSIfbCC8D69cAjjwDgIpOlN2EhCRQKJUFM4xDjI5M8WoeYUy7VT5X+7EUmV60E1qwBXnxRP2beISYLkpV0iHkDjjAyqXKIrV8HPPxwwB0L4rgqEcTIIYarPgS8/+vAHw9Gw5zMNgqmWSZlR1hTOcS22aYo8LsusGmTeHtMHGK8MwAAHLjAv/9dxxERBBEHInOIxWmWySaKTOYKOe8ztiUV0CFmiEzqZimsenY+RdTM2smjc1AFFalbRBgrdSFVW6pvU+JvmolSHlcYMc/K1RU0mYHN9kyRSZuSe5s4q6VDTO53C1pXtaX6vDDIxqWbzIAik3Vg2rRp2HfffYW/7b
337Ft6PzKISxtFB93SiTMbosQSCrJQhi67ev5w/qqWVpmWRh8cvmLcNQbQjP3+/5WDJnCXZM7sAd6+8gt8Mq8lAVWiaVofohIQa0txknxFRh5NzCGM0QE4vZrWt+DSfue2JCB5uoixliTBCT7GciIfbI5kcw0ZjAUHWI5/UAAiEmCmIqQoximRQJMYVlsl5pjzIJAM+0guWRMsQUmWWcWptoj0Spuj/h4pqm76LIMrs2Ozb6oTiNHEwvVsWv8Ol4CLyFZXKoOhQltjQZYuwzMcMyqe/yNmD7rSg8TTYm+WipuuXpCDHWd3GgmXqlHhOoAES2tXyuE4VBtj2VduRqe8TRyCiTAokmzisSYp7n4fY33o6733R3lBALBTiWd8ZePsyeQnDfGF4LbC2T7L7zkR1r+d/Yy8K95+yNodoQDpgfEGJ5jjTJCLEBcTWqVeCAYNlYsSI+k3j/ILxcBhAQYikZYr9/6vcAwEeVZaKoM8ukX7EXxFSEmG04uILG0hJUFNGFGtxukulFtUxqxC5SDhfBMknZDqRw9xQ7KynfjSDMWGWIpYiVaZZCEiGmyCyzHdWSQpKZ7Hti36kDHpSh+mUVo4iEGMsRAxwH6gOpGWL8poSNMpmSIeaxg8uWENsreJBht/0N4UKcmCEWIcTigpg8MqQYRJ+WtfVwcK+GZ+/1bADAUzufiobBu7ZMUgkxRYYYoKDfNILYjfX1/AJy7UPXRtt3KIgxgoIJYrVKDc9f+nwAwE1rbkqdnxNi9VFg+/Y2XTLgUBhGQoZY1lB9g+3YbDU5ycgewpltMrccMcEyycUtUfQRRS5U4sJQuK0GGl7UItmqSPOq6LLwb00PNUHwqba8REKMZ4gJ5wJ52rQMMTEYXxamAODVh7waz13y3GhfUwgxncilzBDTiWm+lCHmeTFBDJBGldWVpWWSj56424FhMx7fLrevu53cDl+0kEMjE2KtVitGiAHt71ufIRas34TXSswQ2232bvjiGV/ElX91pVKYAoJ+cMtkqx2qn0yIta2rqxauigbvi6NMNpIJMZJlUhxlMsEyyZYHAM8gEF2yjjKpo8hUfdcRRkDwkK66F5CFtaR+MfHFlBATt4NMbGkzxOSgeM0ImZwQk0R/cZRJQBCDNJZJTohpMsRYn1gfuGVSaFcM1pfPdaaWSTFDTEWIyeIh688+o/vwfnBCLFw/9j28+7nvxnvHjsS5f4I1ITY+NY4ndwRE2NFPB9/B6p3r+N+ZIMYEok5YJifZdVXMu6jVgE98AnjgAeD00+MzsfsH8b5DRYglZIgxQuzwPQ4H0N4HHt70cKaICTG6RDyGjUL1FQKH1vplmFNFyvSyFBIo5Bo1p0p+trLNeqKsX2Q6TZ6VuH5NZBQrTbYVlRAzFSsJ+5VOXIvQe1kIsRRroul3k7Y8se+2Ax70S5WCWNFKyBDTqbQiIeY0PwxoP6hnCdWvVtuDAwhvtdhDD+miuS64eWGWSXYPoQrzrPiSZTKSIaYQxELR6MCFwYPd6i2r2+uVIhw9FApiR+xxBHafvTsA6SbK1DJ5yy3A3nsDP/6x8s8mhNhUY4oLJXsM7wFAEMhU6yWdaK+vtqmj6x65jl9MrNZLU395JiqIAcAp+50CAPj56p+nzs/eOo/WR4FNm/jpO/ZdZyz+Fkq0TMoZYlTLJDtODAixx7c/jonGBCpehRMuLFi/E4RYrRFaPETRRxBiIsH4Uuh7rRlcVpmYWGtFRa4BVOOEFp8XqIg0mWyZ1BFilRoqfoVf1HWZYrG+VwYi86oe/LWjTCYE4ct/V2aI6UaoFOblgpiwfViWHtk2aUmI3bfxPgDAQQvb2VjH7n0sADtBTKR92cuShg+g2cT2ye08Jy9CiIXbhQkOKuFigAtiDW6ZlDPEAODNR78Z5xx4TnIHm02+Tz5ZGefnHfH6CwhEWgW4N/wOxBEmAWCAjTJZdRCqz0bRrAK/2XA3p5rETKcBQXze2EoW4EijTBL6ZNoWoLZLAsmjWspFIZpU5Xs+3xaiQDXdnNYKrTHLpGaEzKQMsaHaUKRtRkEqLZOVaKi+LkOM7ZuAOkMMiFomZZu2SK7piDseqt+Y4PlnESKtFiXFmFCnFDRlQiycZ+XClbh053FYOAZrQoxdH2fXZuPojcGyH9m5nv+dC2Lzgm3ChKL7n7k/et/jqlqtdhSBLIhVq8D+cVISgDpfTJEhprJMPjP2DD//HbboMADBCLlVv4rx6fFMsQsioSM+QNsSYkkWOFPbIXUkwzSrWaRPKaILJRdLG2wu9EvVjri8yOiDCSJPKiFGEEFIohJBmElqS0fvUQY8IImV1JEaDcRKXb8ogzWI29xWoCIRYgSxOdL3FOtoP1QpiBWtxFEmiZZJ54IYe1CXbgpib+mIlkmfiV+iZZJCiIVB6d5gcKPFCTHEswuMCLFqlb/pP3m/k1H1q5hsTLZvDmRCrFKJEmLzgp/L5y1vv1UU8XNTy+QNNwDr1wPXXKP8c2yUSU6ISRNWKnhs62NotBqoV+p43j7PAyDkiKUIfS0A1/ntXLV129Zxkisyfw6EGACcsjQQxG597NZE/J9VxDK5aVObLskpVN9vtUWxZlWyTC5YACxaFNgddBkgFoQYo/v2nbsvv9HMXRATMsSYZVIUjqrhZcNrARV4cWJKELXQbEZHjhRuAGpeJU5KCSMoeq0W/73aij4Qi3afCCUq9CMpZN/3/PYgAXLfZQun4iFclTvGfiZlhsl/z0yI+dEBCs5ceSYAg2D9jISYGBbPBLFfr/u18YNk1DIZfhYSYkwEGKwO8gd1oP29MsFBN8rk5to0p11kyySpWi1umbyvFpxzFg8vjvQHEAS4qnqESQCoo011bW8kZ3pRCDFGB/7fAcBzfnwmXvLNl6DRbLQJsWodHsD7vpFAiPG2KYSY4rgQ56XQZkCyICZTLxWvorxniBFNRMsk0BYjh2pDEdFIR2yxz5jNUSf4sYcZmUKdVZsF3/P5Mtk2JRFimgwxts09BDStKkNMtEzKBKtIyrHtqbNxAuCC9WB1ECftdxKWzl2Ks1edHZlON+Io65tomeTFziWWhBizS+43dz8s2xF8B4+MxQkxRnsysXDLri18ezutZrNNXjeFe1LdQEaAWhATXk7qLJNX/ulKAMH5mu3vVb/KHSZZaLiIICbclxqF6lMIMYWIRbV+UcS1JFEJSB75UvxMZzVTCUa6IHVKULxOUKFkpEWm0/VdReZltLOSBSqCZZIqVqaO1EjM2NLlu5kMUkAOuafsx4RMNiD74BD9UqUgVrQiWiYjgthIToJYFkIsLVSfQoiF4e9e2H6bEGufwBMtk5EMMWlZAiF2wIIDuAUtZi1MGGWSWSZXzF+hHrHHVDhi23FKjbxzi2i4P7RHmZT2j2o1kjfF+sbtoSqhT1ivBxcAj3pbMVAZwPFLjgcAXPuwYJt0ZJncPrGdv7VcuWAl//yg3Q7C7rN3x/j0OO5YdwfWbFkTsYGIFQnVFwQx1292RUGMX0Bly+TAAPCXvwC/+12MuAOQKUOMW7SEzCJumRSGlHdaEctksD1
FgWgYwf9nTyLyhrptewwfBJte8HdOiPkYaEUtk7FMLW6ZDNtutOdlQoiH4AFJJsRU4frsd13eWKTvCT/FitFdwrTieU0WyMQsMxNCjK+vSIj5Ps962t2fgxP3CTKwrAQxiwwxURA7Yo8jMFAZwMaxjVHrOaF0GWIquyQQF2NUQgITqB6eHQhEc+pzlDRWegfbhNi99eBcJNslgeioj38Ot5EsiDFCrOEDWxqBeKG0TBIEqnozeut2y6O34NJbL22H6lfqkfyzZ8JQfV2GGCvVPs/EDPZ9pRFi2r5XCYQYYRsAbUJMNwBBUnFBrDqUKvwkfaazTLISCVIAmFUN+izvt7rlsfUbriUTYuLyPc9TEmLL5y/n1zFZsBcJMfY960bRlPuwYv4KPHLxI3jrMW+NLRdQbyt5mojQLApiFoQYC9Tfd+6+XBBbPdYmoti9EhPCZtVmcVt1LrbJViuwOEPKf1RR5WItXQqMjgKHH97+TLjHTLJMNpoN/Osd/woAeOvRb438ja3zHzf80WwdhBKdGkD8xRSlVMRPoshDDCMnj7yXIvIAUYI5qe/kIHWCYEQRLrQ5VWKfTGk6gthFyRBLtbOabCuCQKUV4Aj2WSohpst3U2WW2Y5qSSIdbWk6zXGTZrPth3IbuFNW9hIsk7qdcsnoElS8ChqtRn6EGDFU/8G5TdS2rIkECCtD9X3fjBAL2/fCCyxrrykQYuxhacHQAmA8gRDTWCaXz1uOZfOW4eHND+ORzY8E4coakqqFtmVy+fzlXHRiwaXhCocdJZIXbDmTaiqKWybDEx4XZgTL5PoR4IcD92Db40HnVsxfwR/cHtqsIcSEfey68DnvhH1OwOkrTsev1v4K1z18Hd5x7Dv4eu0YAL598ARO2fSw8sFQrt898Tts2bUF+4zug2seugZXP3A1Dl8U3Ngtmr2Ij+AXNO/h+fs9H9/583fw/p+/H7etvQ2LZi/C7W+8nVNRrVYLN66+ETesvgFAaJl8ahM8TojRbpgbzQZpH5QJsYYHNCuKm9m5c5Xz75zciVn1AXgA1g3swp33/QAvbQ2iFqwMn8b3fOWD/e+e/B2AqADeWctk8PPi51yMG1bfgCMXH4l6ZT0+dT2weDuA1zTNCDExFwzxQP4BNnBt0wvmZaH+zWQyS2WZjPTHr8WmSZpWFuhUD+I6QszzPNT8GqaaU6hVavwmhf9NI7wx4rXZCrapH4oesREpmWVy2QrgyQ04cM/DuD2Papm8ds0NuPEFwCW/BvYgCmLPjD2Dp3Y+BSBKdtardRy5+Ejcvu523L7udtJ5gVV0lElJEFME6gPxBy4dIfZAKIjJ+WHkarWCYG8Avxx6GkDcLgm0BbixWpt+FUVDAKgLt1vbWIaYhhDjv6tGMhSOo1nVIYxNj+PDN38YKxeubLfRamFwGtiGYHCBpOXJ2y+N6gI0GWJSX3UZVPVKnUcWxNpJEJXkkkk9qmUSiBJinudhqDrELYBJbckZXjrLpPw7E7DZeX5WbRYnowC9HZIvX0OI8T6F34GKEBusDmK/ufth9ZbVGB4YjmeI1dJFrHqlDg8ef3hS9UHZL8W2ktcnsnxBEBMJmt8+/lt850/fwX5z98Nbj3lr4n0yE8T2G90PS3f8HgDw0Ng6rN26FnvP2TuWIQYEtsm129bi1sduxXFLjuPLfGrnU5g3OA9DtSFsHNuIf/rFP2HH5A6smL8CC2ctxMJZC/HC5S/k++PWXVvxuyd/h5GBERy151FB4+LozSLen0aIzZ0LPPYYMCRsG+HlJLv+vPv6d+Nzt38Ohy86HM/b53mYVZuFhzY9hHmD83DBsy6INHnEHkfgmoeuwSXXXoI/PvVHTkm+/Tlvx15z9gIQXEs+estHsXFsIxbMWoAz9j8D5x92Pj//MiKNfTcRtwaxlISYxnZIIms01koTG2CkT2kiDyVInSiCqPpNbYtiq5T7nkbmUUSsVBugQqizCdVn7TdaDa1YSemX6YAAkVEmsxBimmwwebq0/Zgi+AF68ZAifPZLlYJY0cr3SYRY1a9i37n74pHNj7gnxFIyxPiJYXoaXzsceONLHkblslX477P+G6899LXB30RCjK2H55kRYqFw41ejgpiYsXbgbgfiqlddFTygfemH4R/0o0zubE1ykmH5/OX8AWf15tVotVr43WLgt/OBZ2MnDkd4Egnb2zwEbA3v85bNW4YT9jkBAPD9+76Py35zGTaNb8KPH/gG3nQUcGGzgaS1nGxM4so/XYkr/3Ql9q2tw1krgOdNjyN+uytYJplVjb+NaK/jP50IfHHgeuDn1wfrNW85v9FjlFHL9/DUMPB/y7bgl4cCRy1YjTdM7wTjJm4In/NeuOyFeOHyFwLXAzeuvhH//Kt/xluOfgue3rkOL78Q+OOiCcz+4uH42PM/hrueuAvXPHQNxqbGsHh4MX594a+xaHgRntzxJN7xs3fgO3/+Tmx9rnv4OgDRh2pWpyw9Bd/583dw62O3AgCe2PEEzvjWGfiH4/8BN6y+Adc/fD3Wbw9yQEYGRnDyficD99wes0xu3bUVWye2Yq+RvSLC14PPPIh3XPsO3PjIjbjgWRfgXce9CwtmLeC2OvZPfivUJsRaaNWq+O2ewDUrgGcv2IxVWx7F1omtGKoOYZ/RfVCv1rFh5wa894b34orfX4Gj916Ik18A/OfglzH23S/gyNn74/I9gUX1XfjaL/4Jn/rVp1DxKrjo6IuwY3IHrnnoGqxauAoHLDgA//6bfwcAHlwOtAWxR7c+ig07N2BWbRZm12ZH9vmpxhTWbVuHp3Y+hdH6KOYPzecPP2NTY5F/Wye24qbVN+GXj/0SB+12EP56jxdjGxsFPiTELj72Ylx87MXBh60W3vOrcEHNJs/R2232bgAkQUwkxJrtUSZrDQSWSEnsmRcALpg7WYkSYmhbjNhDB5uHnUtYZh77qbJMJhFiTFhno5uNDo5i867NSqpIR4ixn1PNKdR8QRCTw/s1Fs2JxgRqfps2i2WVhYNrDIzMBZ4EDtrjUC6+PLHjCbzmqtdg//n7c5sZ+8msonc+fie+/advA8cDP1wF3Lh1LZYkCLpisfywJXOWxISVY/c6Frevux2/XvdrnHvIudxGk1bijeho+LS4Zi5w9fg92D4+CkBBiElijCp7ad+J4CS9ejhQs1T5YaRqNvHaPwLXLweeqAU7pzzCJNAW4P6yEHyEyaXzlkamGVBcDXQZYnw+lRgkXJv/+ch/wK1jf8G3//RtTvANVgeBZhOHPQXcIOgsKtHlpQe8FJf+6lLt8qjEFokQC9s6dNGhicKaPF8S+SULOKp9IalGB0cj87xk/5fg+/d9X9vW2avOxn/c+R+J/QSCAY+qfpXbCdm5hq0DE9VEMe8dz3mHUrSVBT/V97d4ZDGWzl2K1VuCuAN2vWMvKPcb3S8y/X+e8Z/41WO/wlF7HoX124Jrqe8FtC5FYPQ8D39zxN/gK7/7SvA7PKUIQhHXjl9yPA7Z/RD8acOf4uursEy++JsvjuRe3bD6Bnz05I9isDqIsakx7JwMrKwTjQnctv
Y2vh0YIbZ5ahv2+dd9cOjuh/KRO8Xj+SUrXoKfr/453v/z96PqV/GTB3+CW9bcgkargdH6KD5y8kfwX3f9Fz8XirXv6L542zFvw08e/Al+8egv+OevPfS1+PRpn0a9AXwthLzE0ZZTCTEAmCOdJ4SXrqsWrsLVD1yNDTs3YMPODfj9k7/H1+75Gp/0zUe9OSY8vuf49+CBTQ/gf+/9X3z5d1/mn19252V4zSGvwY7JHbjqvqv4PgwA/3vv/+LSWy/F0XsejXXb1uGWR28B0N5H2LXOKFSfID6ZWtu0mV5CW2kjUQJ0y2RS301FEIplUkebKUUzTd8pAg51e5JtgGlinsbOyttvEXOxCN9NmlgpfzeqdYx8N4T8Omq+W9p2p1BkAI10TDu++qFKQaxoRSTEAODvjvk7fO/e7+H5+z3fbR+SMsQkQuzf112Ft58d/G26MYHzvn8eNo1vwt89++/cEGLcMhncMLCDtSkp/jwUufl9vpwkQuz8c4BvXncS/33p3KVcELvszsvw9T98HWveED6VYz12OwXYa+0/4qip3fBFH1woGKoEuTbP2/d5+OCJH8THfvEx/P01f8/bffPLgH/Z8QcsvuKkYAQ1z8f2ycAqODY1honpdiAt6sAXzweqrauw6guHYmxqDNsmtvETIltvLogpCLFH50Y33fL5y/mN3sObH8Z+/7oftu31JDb/PwAIyIuv4U/4x68/Cz9cCpyyGngivNc+aLeDcMjuh+D0/U/HTx/8Kd5zw3vwnhveE/xxEVBtADuxE5dcd0lkmau3rMZv1v8Gp+9/Op7z5efgsa2PoeJVsO/cffHolkdx8O4H46jFR+Grv/8qgKhdkhXLEQOAVxz4Cty29jb8acOfcP4PzuefD1YH8aYj34R/PPEfAzFm00/5t91qNbF5fDP2+dd9sGNyB6p+FYPVQXjw4Hkedk7u5BeHy++6HJffdXmsD0BgMfiPM/5DEsTCZVSr+PuXALcvAYDfAp/fj8/nwUO9WsdUY4ov57cDG/Hb4wFgClW/irt3Pohj3gQADwA3fZDPKz6UPrz5Yfzfg/8HAHjv8e/F6w57Hf8bE8R2TO7Aon9ZxLdJ1a9iqjGF6ea08fD0rH697tf4Cr4CHBH8XlM1IwrlrRZesuIl+OY53wzoSgBnP1jBHYum8cINcyRCzMeoF9w8z92FqN0y/Pm+O2pY8dQUzqssiBFi+4zug/ef8H5+vDIxgT0kfuTkj+D5+z0fL9n/JUGbgk2Shc6znyxAme2DHzzpg3jWHs/Cqw9+NQDgv14aPPSwjECxYsSWJFgNVAYwNjUWI8QiPxMos4HKACYaExioDPDvUDnwgOcFdCSAQ3c/FHPqc3Ds3oEoxXJjdOV7PubvaOLBBcAx3z0Nz192CvYd3ZfbpoaqQ9g1vQtbJ7ai6lcxuzYbf9jwBwBx8gkIc8TuAL5+z9fx5bu/jAWzFuDVB78aBy48EPVqHZvHN2PT+CY+it+u6V0YqAzwMPiKX8GK7XWcfw/wP4cDf7X9K1jxy18CiBNisoikEgnOf3oxvtG6Bz8PRf4shNgbfg987yDgp+GuoBLEhhrB2eHPgTaMA3c7MHbzWUMFwxPADkEXUPV94ayFkd+VokujLUi+cdlf4cLly3HCPifg+keux5M7ngxynB7ehMt/Ahz9puBFDqC2aB6/z/G45NhL8NnbPwsA3HYpFhOK+bokZIjJQqmq78/e69kYrA7ilQe9UtkGEBViKl4Flxx7iXI6UUA5aLeDcNJ+JymnU9UJS07ATatv4gTPN8/5Jt5/4/vxuds/h/lD85Vi0KnLTsU/Pf+f8IGbPsD7JtfikcX46llfxet+EJyz2SinrzvsdXhw04N49SHBOeZNR74JP/jLD/DxUz6e2O9XHvxK/OTBn+C2tbeh2Wri0N0PjU1T9av4wat/gGdd/iwA7Tyu1x32Ouw/f/82oRTWi1e8GC9e8WIAgVD8yoNeib3n7A3P83DK0lOwauEqbBzbiDn1OXjVwa9S9uu/XvZfOGDBAXj/je/H0nlLlfeq5x96Pm545AYuYIkjXLKqV+u4/cLb8Y6fvQNf/t2XozZjQRBj+9vj2x/HUHUIL1j+AvzsoZ/hh3/5IX74lx8q+8hq+fzlGGlUcOn1wP+cvQz37XyUWwV3n717ZJ+95LhL8Lsnf4dv/fFbsfubrRNb8c5r3wkA2GtkL1x4xIVYvWU1tuzagrufuBuPbn20fZ+E4Jyzfvt6fOuP38K3/vit4MNQEPurDbsBeDT4JY0QU5VgmfzkqZ/EKw96JSYaE3hm7Bnc9cRd+OYfv4lHNj+CeqUe3I9LNTo4iu+98nv42UM/w4/+8iPMHZyLXzz2C9y29jYudALAWSvPwqsOfhUefOZB/Ntv/g33P3M/7n/m/qDbfhXnHXoeXr7q5UGXmCBmmSGWJD6J51FyhpgB0ZQUyt5CyzinShukrrMwGlomKQKcTnShbCul7TBheWwdk5anEuB0ofqU/DNT6slWzOMWTU2ovko8pFh/dXQbSTwk5IwBFiNylpbJsgpRnsdD9dOwxXce906887h3uu9DSoYYO7g+uu6bAIB3PbIHpl/zanz+js/jc7d/jgtifHAAdhD6Pg7dAKye70XCXROLWSYr0QyxxJNlYoZYezt++5Dg58JZC/E3z/obDNWGeCj0pvFN2DS+CYNTwNGPA3ftCTw9G3h61xr8Hmvw5j2ABePRbQEAHz35o3h066P4+j1fxz6j++DcoWNw+Zqr8ODwBB4U3hTKtcfwHnjLUW/BE9d8Fz+duhdrR1v8TalcSzcD8yYlQkxY/WHJbXnc3sdh8chiHLHHEfjdk78L8qbCLh/5pI/nP9LEVccMYw124McrA0GsEf694lfgeR6ufs3V+Po9X8cHfv4BTmUduxb4ztV1fO27/4h/+82/4cUrXoyLjr4Ib/vp2/D7J3+PFloYnx7ndr7bLrwNz97r2Wi2mnybveyAl+GyOy/D3x71t7H1XDF/BT544gexa3oXPnHqJ/CHp/6Ac75zDkYHRzm5dsI+J0TfPouh+s0m1mxZwwOGp5vTkZHKgOCB4I1HvBGX3XkZbl5zs3J7//D+H8YEMZ/d3FQrnBKc26hhbCAQJ3ZO7eQP/EBgS7j0tEvx6wdvwnX3XIU3nfz/cNqBZ+CtX38VrtlwG1oesGy3A/CRkz6CodoQvnz3lzF3cC7OOfAc/OqxX+EnD/4Ef3fM3+Hvn/P3kb4N1YbwigNfgavuu4p/pnqIrVfqWDS8CNsmtvHMNSB4mGXCB/t3yO6H4NSlp+IXj/0CP7z3+9g2tQOzJoETHlPY6UShPAz15WQogLPv93H27wEcPQtotXDymsCOu8qfh3mVUXzj+8CiHQBe38RRex6Fweogz6xbss3DJb8G8BwfaLXw4ocCcnFlfT48z8PHT/04X85ZK8/CB573Abzy4ODBeu7gXJy16iz+91OXnoqb19yMAxYcgJH6CL7zV9/BniN7AgBOW3YaVl+8mgsle8/ZGxcdcxGf9wXLX4AXLH9BfN3RFmTYTyZMib9v2
bUFo/VRToUweo6PwBbSJ7JQNzo4iu2T2zGnPofve4xk4dPuAuD7+NBJH8Kqhavw+sNfDwC46Q034VeP/Qq3rb0NT+54Es1WM7AWNBvcYtBoNjBYHcRbjnozFh98LE57PfDAwg0kEY2VnI0FtAlGFqD9+PbH8bnbP0dus+bX4LWAr/4I2DkA/ODAaU47PWev50Smvewll+Fr93wNT489jcMXHc4tPmJVWx6++z3g2W+r4ZHhKSMbZ6RaAYf7pauBQy6uYXN1CocuiosSp29dhBPXAL/YL/hdtY28VgtXfxt4x4uBe/YIwvlV1/iPnvxRjAyM4Mo/X4kNOzcol3f41AL89H+AQzYAA6+tAdU63nrMW3l+EwDgwY1Ythn45lXAGecBtepAxKIu1sdP/TgXxFSW0CWjS3DFWVfgXde9C8+MP8P3Sbk+ddqn8LFffAw/X/1zDA8MK9s6bslx2PYP2xLpMCA4lv7mWX+DLRNb8LHnf0wpwgLtkPhXHvRKfOXMrxhZJj940gfx/577//gxOVgdxGdf9Fm8/vDX89FmVfX+570fD21+CN/647fwnL2fo5zm/MPOR7PVxJuufhNec8hr+Hpf/7rr+TTveu678K7nvkvbx2XzluGXf/1LbN21FU/ueFIpKgHA4XscjitfcSXOvepcnLbsNADBdfz4fY7Xtu95Hr77yu/y3/cZ3Qf3vS1OP8nlez7ec/x7cO4h58aoMlYvWP4CrHvnOjy8+WGMT40r92MgsE1+6cwv4SMnfyTqdhAEsXcc+w7cuPpGvOLAV+CvDvorzKnPwZ3r78Rbf/pWPLrlUUw0JjCrNgvDA8PwEFgs9527L47c40icsf8ZgPcevPdXwHs//Q0886yV+PDNH8YXfvuFyAs4tl5XnHUFxqfG8ZMHfoI3HvlGvPPYd2LfufviP37zH/jATR/A3nP2xrXnXxuJCNk5uROf+OUn8JMHf4KzV52NC4+4EEtGl+COdXfgTT95E/7wVPAyYeVG4As/AZ6/31JwQYxCiMklEGIVv4Jj9jqG/+msVWfhwyd9GNc/cj0WzlqoPD+yEsXRVquF79/3fdz9xN2YU5+DY/Y6JrJ93nHsO3DVfVdhy64t8D0fZ686G/vO3Zf//bxDz8O9T98bjU5JWw0KIaawVaZaClMIqtScKs/TBsCL81Fsa2n5ZyRCjCAYUW2HYt/lPsi/G4fcU8U83baiZIhpxEpbu2eaYCS3z38nZJZRrb8keynBPqs8tixJx36pUhArWnm0UP1cK8UyyQ6anc0gn+Xv1+2JZw5/PT5/x+fbI92oQvU9D1f+L7D1pOdgwUcJb+y5INa+MW0KmyR2UUwaZVIgxFgW8Z/f+mdu9Tpx3xNx39vuw/pt69FCC88+9hWY8/Q2jNd9/GlBE2f/3UKsn9yIqUqQISUv2/M8fPWsr+Kioy/C4YsOx9CNt+Ddl1yFX568DNOfvpS/NRiqDWGvkb0wpz4Hnudhv7n7BW/Qv7Ue+NK9eOxlJ+Lez76PB0BX/WpAl61bh72POgXVoWA7tEeZFE544X8/+8LP4uWrXs7tOr/5299g7da1eHLHkxj+9Oew/PLvYdZ0C2gBQ4etxD/V7uLzykKs7/m44FkX4A2HvwE7JndgbPX92P3AY+DVg4eJD57UppvEMF7xwsHywsTtdfaBZ+PsA89GUv1/z///+P+PXHwk1rxjTeK0AGKh+mz5e43shTveeAd2Te/iF+Kh6hB/G/6Kg16BVit4CzjdnMZ0cxr3PHkPTvjqCbwNNSFW4dvqR2tPwIlf/Tlf9jPjz2BsagwVr4I9R/aE53l44fIX4sMv/iTv7o8O/QRw8snAqlXAfe2HD/amFQiox8+86DOJq/y/r/pfvsydUzvx9M6n0Wg1eBj7QGUA84fmR24ymP1B9yB64ZEXAod8EI0D9kfTa4fcR0oixGLVarV/Npv4+g+AaR+ovSsI+j7/D+12jlx8JLa8d0vbEiXSqc0m/vtHwX7pvzve55H6CD52yscS1+UbZ38DzVaTP9jKtIPJjbtY5xx4DlZvXs1Jj/c/7/1YPm85b/+/XvZfeOCZB7gI86Nzf8SFt7NXnY3b192ONx/1ZgDAh078EA7Z7RC84qBXAAC+/LIv46FND/Hj9wtnfIFTaq86+FXYtO4BvOLznwY8D6sWrsKHTvoQ79dgdRCnLjsVpy47lbYi24DfXQ78/NZv4N6px/HE9icwPj2OsakxjE+Po16pY7Q+ikarEdiRpnai5tfwtme/LdbUktEl+PrLv46129bijP3PwKNbH8WP/vIjPD32NCYaE5g7OBcLhhZw+mywOogdkzvwp6f/hP3n7x9YGlstVJvAt/8X+O9PvhLDL3wZjlh8BA7Z/ZDIso7f5/jUB300m1gwDtx449745r9eiDcd9SbaNlG0AwB7bgd+/cDz8MD/9w4ctuiw2GQLpqq4+Qrg6pXAtz7yCrzzWMWLqlAcvvty4LorPog9XniOcpELZi3AJ0/7JD5x6icw1ZxKtEy+5KFoH1XLA4CXPARcN/lqNM7/ayWRBgT7zuOXPI4f3/9jTknK9YZnvQEvW/kyfOXur+D5S9VU+nOXPBfXnHcNdk7ujIx0KJfuHMTqK2d9JXWaf37BP+PCIy7EIbsfYvVWW5Xb+Kw9nqWdh13vv3DGF7QC3OsPfz1eceArEgUjkxodHE0UIVm9+pBX47lLnhsjKvMsRisnled5kYwuXcWEG0EQu+S4S3DJcVFi65i9jsGdf3snraMCUbVg1gJcdvpl+NjzP6a2xFdquOpVgV1Q3E/fedw78eaj3xwbAAUIRL2Pn/rxyAsbAHjO3s/BPW+5B5ONSYxt3oDR3ZYE9xH7C/NnJMRUVfErXOiiFrsnYtciuUYHR/E3R/xN4vz/9bL/MloeWyarNLIGoJEuxiJIgoDTbDX58qg5VdogdaIIkrY83ciQlGnEz8g2wJTlpfWdZPETpqFsB/J3QxgQgDTggWZbUfYFCuEnT2fSlva4IZCOkeWVhFhZhSjfJ1smc6skyyQb+jo8uHi4faUaOXkCiFgm+YHp+6g2g4cHUjFBTLhhaAibRCuIKQgxnZi2auGqdqZVOMTe0EQTxzwOzK7OAibDQHUmGilOUIw0g+9j4Rhw9uNzgIOTbSHyeu6zo4J9VDcxmz0gHFkOEAmxuCA2VItm11T9KpbOWxp81vwfYApgKWx++GaSbVPWhmwD8TwPI/URjMwO7HmqmzAVMix+nmttiobqs+VX/ar27SgQrFvVa2dMsYcOlSDGCbGK394PqrVIW7LdSVkGo0ymled5GB4YTnzQFaejPIQCAFotVFrgeX9otdpvpNnvrFTrIBKmIWHDAvZVYlrkoVkU49mNTAuJN/668jzPKM+EWnPqc/DR53+U/37AggMi4vALl4cZfGGdufJM/v9Fw4twxcuv4L8vn78c7z7+3fz3F614EV6EF/Hf33L0W/j/R+ojePeqC4HNnwZGHRxXnodZUy28dN8X4KWLFmVu7nWHt229h+9xeGS9ScX2hwZwUe25gNCecYVt7be9gn88
8R8ztwMAK3cNY+XKlyVO5wE4837gzJdeAQwrjsdm+1zy4uFnAQThJSmrK3I8JB0bwuenNfYFVrxIPV1Yi0cW481Hv1k7zfyh+ZH9NalU4e95VK1SSySP8i4Kjdap7cBqyailNbiIJQhimYu1IRwTSbRkMLn6emkrbg5UBjAwIIhvIhWWkRCbySU+oCcRRmTShWBPVGWIUULu00bnS6We0pbnSctLWz+CAEdZHjUoPi3fDaDRbRSLX5owQ9pWBt8NVYDTEmIK8dCa3qPYSw0oMrHvtsdNv5SxILZ69Wr88pe/xKOPPoqxsTHstttuOOKII3DcccdhcJCOq5eVUEUgxFIsk+zgaoQ/K0JWDj9pqEL1TYUAKUMMiIpase2TQIg1w6wtcV7tg7I0KqUfCoFNQRDT5iSY3qywUR+nptR/F9YLEAmx9iQqci2xX2FVqsHDFlun1DYUN5Wsui2IiaH6uotbWsn7cYQQY+CTYAf2bd7uarZjIUrulyyIGRJikc+pYpo87wy/8XdWrh8S5e3czUrbr2zactVOWltp+7U8jct+JS2vPH7KmsmVhyDWzeNAXLYogsmjoFMqhRCbKaV8aE8QZgAi6dJKFlSoI+rJA4jZjs6nClsnLS+N5MkoKilFniR7qaYt03w3Y5pOsx1M892yDtYQEQ+zEGJEeo8ktBJyxgAaTZdm6+2HIj/NffOb38TnP/95/Pa3v8WiRYuw5557YmhoCJs2bcLDDz+MwcFBnHfeeXjve9+LfffdN73BstTl0UP1c6sEQUwO1Wfh9pWqQhBThepbCkURQkw4nk0JMS1dJpZ0g1IJM8xEQYw0P/VmhQlik5Pqv8uCGLtQCaH6SXSXsl/s19oA0Gxv01Sxr8CCmDeLdU0/hHJaaQUxdgEVLJO+wRDj7YW4I8RyKZUgJlbag7ZEiEU+p4pp8rwz/MbfWUnngkzl+/Ht3M3KQzDKeoxRhC75b0l9dylQGRJihfmOyyqLWjkTYh0vcdlMEKvV7NavCAKfg1ISRraki0QOye2L0+hEM/Ez0qiPGkGFKvJw0YUwqiWF2CLncAk2QG2GWEJbygyxtG1lkremIZq024pAUFHD5E3y3Sijl6ZllpmSgGl5cgDtuylD9YmC2BFHHIGBgQFccMEFuOqqq7BkSRTLnpiYwK9//WtceeWVOProo/Gf//mfeOUrCVaxsuLl+5z86Rq2mJIh1mg2om88KtX4xUhHiBkKRSxUH5AIMfmgTcoQCzPIRDGNRHiBNdeen7Xh64Qn05uVkIQjC2L84iI0wftlIPTV6sCEQIiltaFar7Vrgauvhj8U9EYWxDpyYt28GX4YBcOCxAH3gpgYqq+yTJKrCDfnupL322YTEHL8UkUCHSFGFdNKQkxdbNu6EMSK9lBFFZ9M2ioSIZYHAUddXlG+47LKolavCWIqQsyGMAd6mhBLoqwAGumSleRRtWU7kqEpsUUSlagEnMHyVMs0WR5AH/UxLfMqVTCS7J62OVxKcZSyPBf7AkGAI5OACWIl9btRiXklIaapSy+9FC96UXL+RL1ex8knn4yTTz4ZH//4x7FmzRpX/eu/8rz46IydrpQMMVF0AKKWST5qiTjKJDsITckYhSBmlSGmsEyaCEe+LSGWt2VSRYiZCH21AWAiPUOsPYPiJuzDHwa++lX4lwajqjUb02juCobi7Jigu2kTvMWsa24IMXYBUYbqVwRBrOI+ELfrZUKIqdZBR4hRxbSSEFMX22a98pAoVh6CUZEIsTwEv04tr6yyOl0uBbEiXHOTCDGbKtq520FltUzKuV+q6SikkqotXVC8bnQ+VVA8JUNMa20T+p4YFK+ZRpyOHBSfYgkFoH0RTRGfqKSS/NJatzxd35UZaYTlkcVKB/tCks2WMgABxY4sr1+/h+qTnhh1YphcCxYswFFHHWXdob6vAlsmxbcKEVtcpaq3TMqEmHGGmIVlUiTE/Lhl0sRaqLJMaoWnTlkmxROeiVAXVmWgHpnXKkNsw4ZgnnAdmp/7LJpHH5XeF1c1Pg6Mj3ckQ8w5IVbUh9Q0QUz3oC0/qOsIMZ2YVhJi6nJJiBXhIVEslwJoAuVsXL1CiBXlOy6rLGr1AyGWVRDrgetiLMDe0jIpv8wUP5N/T7tPpFgmVUJC0vLScqNMLZOZRReKeEggh5QZYmnElglNp+k7hUjT5XCpxFHb74bvCyAQYsQ8OZ3t1dheqqHbKKOJ9ks5W+vp6Wk89thjrprr3xIsk10nxDSh+uJFR5khJlgm+cFlelMSiix+rT3SFjlUP0KI+bF5TcLnjQkxy6y03EP1VRliiGeIpQpiQPs73LkzmCfsTPOhB9Hc9Ex6X1zV5s3h8lm3cgzVD6dpJYwySV9IwYQIuVSWyaTf08QyU0IsadqibqtOVx6EWFEeqvLIEHMlrIlt6panW2ZJiJVVFr3yEMSKEKrvee0Igj63TALpFjil9UtD1kw3p2Ofyb+n2eRiVFCK1SxtpD9TC2NmUSlFKLG2l2rESnKmFyEXi0LKkXOxUkLnje2zmn1PS4gR9pfEvuv2K11WnEQe2g5A0C/l7In1z3/+M5YuXeqquf6tIhBiCW/XxVB98c1CpVKLqMwA1KH61pbJNo1FtUxGVoeF6gsfkwgvABgcbJ8QPUF4osxvKog5IMS05Ju0P1UGBiPzsu2TGqoPtPeRsbGga6wfaNFEQ1e1aVPQNbZ8Yd/UfscJFRs4Ii1U38YyWYSbc111ixDTzVvUbdXpch2qDxTnocqlgOPKMpkXIeayXyUhVlYvVq8RYiLdy+5pS0IsnRATHs5NrG3yvPI0FFEi60iGlKwnsQ9aAc5QVDJZnthXeXkUUgkwGAGUIubpvhuZ3rPM4YqIWIRtRcmvo2SksemSlmeyH5Oz6TR9V5KApWWyrEKU5xWHEJNuHsSDS3yz4Nf0hJi1ZZKPMtm+aaBaJsXlqjLEtNtWfNgcGmqvG5UQM33Q7FKovj8QEmLs3ipt3cT5ZUGs1b7AdkMQS7RM3n9/YKskFskyKWSIVaoD8UZSF1IwIUKuLBli8oO6LBToHuR18xZ1W3W6eo2aEMulgMPmLxIhlsf66doqCbGyZnL1miAmEmKMDCsJsdT8LNMMMR31RBkZUlymVkgQBKNUWx6V2CJYNHXLsxVKVMukWEKpAx6oxDxdLpYJvZeaIZZiZ6XSexSxUivAidtKZ2FUiJVJbZFHLzUdEKBPCTHy2fjII4/U/n3c4KGzLE35Phc6uubjTXi7HgnVFy2TlQF9hhg7CE1vSlQZYqKoJR/cCpKq1Wq1CTFBqNEq4KLwM2tWmxoSBDFSeH1elkmm6JuG6ssZYnWJEEujzHSWSdaPThNioWXS48+/Qqj++C5g1Srg3HOBb3+b1BzJMikSYjZveIsmRMhlIoilEWK6303osqJuq06XS0KsCA+JYuVhmSwSIZYHAadrqyTEyprJ5VIQK4KAVBJiyooRYgnUE0CndOS2VdNQ7JCmlsmkYHPTTC/boHiVUKLNxdKJhwSLn5J6StlWaXlkVEshlUhLG/AgbXRF17QgYD5aqm0
WHkVoVYm2/ZohRhbE7r33Xpx77rmJtsgnnngCDzzwgLOO9W15wiiTBbNMJobqV6txVbtaLQ4hJhBeyvnkErd7JwgxU8skX6/2JFYZYmGovlWGGOuTTIh1yTLJCTG0xVp/MhQYV68mNydfjNSEmJ/NMlmEm3NdpWWI6YQL+UFd97sJXVbUbdXp6rWHRLFcft8lIVYSYmXN7OpVQsz3S0JMqDTxyZSs0YXqq2xytjlVtpZJ0vLSbIBpy9MIQZHlUQmxBHIoIlZSt5VJ3hqh76n2UgOLJkVo1Qp+GrsnddRHOcRfNZ0ys8xS2KXaiPuhyGfjQw45BM95znNw0UUXKf/++9//Hl/60pecdaxvq0iWSSC44IYHh5ivFFGvhVB9dnBCsJXllSGmC9Xny23FM8S0OVtiP4GIINbwDa2JeRNi4iRpdFcwY+TXSn0oMm8qZaazTLJ+oLuWyQghxvaP6WnVnMoS+xxpqyUIkb7fFiBtLJNFf7tbEmLFrTwIsaJs2zwytkpCLL7sssqaCdVr9nBVqH5JiMUEgKwjGRoHxROILerofFprG0G4oJA8xqIS0ZanzRBLCndXWCZTtxVBzCNliFEHIKDkrWX8bowJMaKFUe6Dcnkam6PRgAApNuJ+KPJd9fHHH4/7778/8e8jIyM48cQTnXSqr8srQKh+wo20ihDzmwHBpbVMsoPL9C2dghCL5IClWSbZSbhiSIhJghi3igqEGEl4MhXEpqfV80gjy8nZaKxvgB0h1gw/ThX7ZMtkqxUfZbJVnFB9Log1GooZ1SXffCktk76fzTJZ9Le7aYKY7sG+JMTyLdF2k7WKth8W0TKZFyHmsl8lIVZWL1avEWLiubskxHilEmIqssYyF4uS9aRqK00ESbUBiiKPrQCnooIotkMX4mEnCDGVeJjSlqod8TOdddR2W2nz5AT7JSVvTRvi3yLuxzoxTxKbdQScLpuuX4p8Nv785z+v/fvy5ctx0003Ze5Q35fvF4sQazb5xVc8EfEDrAWgWtWG6vOD0JSc4hliccukUmyRBDF+oLMTTBoBxarTlklRsJmaAur16N8JhJhNqD4jxBqeBwhWR3KG2OQk7xtrudFlQkwpiBkQYuK+IQtiSltopQczxNIsk7qH/zRCzIQuKwmxeEnieKYq2n5YRMtkXoRYJ/pVEmJlzeTqNUEsD0KsB45rUn4WPLTQIhNU/DNbQkwSJXSZXqIoQcoQ0wlwmuWJAk6avTSVejKx5QlPGkn9Ssuyomwr1fqlBfTr+gTQ6DbTbWVrO1SJh5T1U7VFJcRiNB3VXloSYmUVoopAiCXc3EdC9cMDrBIKYiISC0Adqm9rmRRuGpiopTxgk4QjiRAzskzOmtW+WHmGWV2mhBigtk0mZojFCTFS2D/rJgvVZ4RY2rrJGWIhHQYIglSXMsSUofoWgpiWEGP3oOI69iMhZjJSpI76Spu3JMTi5ZIQK9pD1UwmxCh9z4sQ64RFs6yyOl29JoiJ524miNkSYkV7mZGhYoSY5YO9knpKEnlaNNElqwiisu6lBear+i1+phMuVPZSX/GYb2rLo1hHqdtKnk81TVZ6T9xWFHqPIq5R9k8KvRdpi0g66rLwsvaduo/2Q/XnWhe5PK+zgoKq0iyTzUZEKEBNYZkUCLGslkkxuFy7bZKEo/B3EkUlzA/AjhCztUwC6mD9JKFPOLcZh+p7Hio8VD+8eKSJanKGWJgfBnRfEBND9dk+WGEbyJFlsm0LFSi4Wh9miOke/k0IsbR5S0IsXi4JsaIJs3kIRp0ixDqd6VUSYmX1erkUxIpwrlOF6tsSYkVYH0dFsQvKIoFOVNIJCaaiC0Wg0okgVPqGEhSvop6ShJJI3wk5XLq+p9k9STlVBOqJGqpvIo7qrLEqW2XW5VEsoUB2y6SSdLQUK5XboU8tk6UgVrQqomUyrEiofniAVZqIWCb5CcglIWZpmeTCkS8RYoaWSb7eHrEN21EmAZogxgkxYRIK/RYj39rrBVhkiCkFMaJo6Kq2bAm6xrvljhCLZOW1pLcybB1tLJNFv5l1RYipRK2SEMtWvRyqX3RCjGqZ7ESmV0mIldXrlQchVrRQ/ZIQI4Wkx0QzgjAjti23kxYgbjo6X1r+WepogCBsA4JwEbl/Jeafye2rpjEKnbfMd6OG6pMIMYK4pgzez0ikUUYABYikY5O4HxPEyqzCbr9UKYgVrYpgmUx44FWF6jPLpHiwtxAIYnyUSXZw2WaI1eKh+sptk0iIhSdsqlDTTUJMZZmUbFJtQkx440Ch3xIGC2DzGmeIRSyTbcqso4JYuL20GWKOQ/V7nhBLyxCjEmIqUaskxLJVr1ETYhVdEHMZqt8JQqwUxMqaydXLlsmSEONlEihPzUKSP2NlbMujjs7XUhNbytyvjAKcduRE4XcjmyNx/SiiC8USquoXNVTfOEMs4buh0nuyWJlmTUwM1Ref13Sko+l+TBErDfteEmJlFaM8r/uEmEGGGAvVFw+gVqsVtUx6/D+xNrXFCDHBMqkVtVIIMSb8GGWICYJYwzcUnvKyTPK3A8IkFHJNPMkJ2WhNdtJME7LkDDEVIdZpQawRPdHH9k3AWYaYj/YNAstdsxLEin4zmychZpI/VhJi8eplQmwmWyY7TWxR2iotk2XN5Oo1QawkxJQliyVZs5C0ZI0ie0krghBG59MJOCrRRdcWVShJEi6UFBLF5phmA9Tln1HoNmkaVVsUmyO17ybbKm10xdi+pxGedHSbMkPMlnRULY9wTGj7niJ89kOVgljRyveLRYglZIglWSaB8GCfPZuvB/+bpZVQJMSmTSyTEiFGFmok4YgLgVSxJ+sok3IRMsSM+gXEhD4AbZEnqQ1KhlinBbFw23h+/E2ce0Is/ubGr/bgKJOuMsRUopbuQb4kxNKr1x4SxeonQixrv0pCrKxer14715UZYsqi2CFNKDJTskYrUBFEF4oIQrVMksLPNSIImUKSbY4pVrqsghHFMmmcIUawhGppOsXybLeViTgKpOxXhll4pMEhCOJo2nboh8q01m9961uxceNGV30pCygGIZZimZSFAqUg9prXoHX6SwAI62FCTokPxNV4qL52lMnw7VuMEKNQVGI/gUTLpJYyy2KZtMwQMw7VnzVLmY0GaLaPzjLJutolQcz3g32kmTFDTPxekwix6Wa7vZ4kxNIsk64IMVMxrSy3hFjR9sM8BLEiEWIuia2SECur16vX7OGiZfJ5zwMWLwZOP92urSIIfI7KxC7IRR7NyInkDDENhUQZ9ZFih6QGt5MEOIKAo6SQCAJcKgGnsxRSxDVVUDyB2LIWKwl5a9Y5XDpxVLNfmea7kQY8IOafkfqekt3WD5Xprvp//ud/sG3bNld9KQsAvAKMMplkmfTilslKC0CtFjlgm60msPvuaL3udQCEg8tEKBKpnmq1rXR3ghBLIqlMCbG8BDENIUYS6oAYISYKYjPKMskIsVAEjRBibDc2IMTkUXpUhJj45qay+yLzPs90QkwniGUhxNLEtLKiD1VZq2j7oUtBjM1fJELMJbFVEmJl9XrlQY
gVJVT/8MOB9euBCy+0a8v0HrPAZZJnRSG2spI84nTk0fkSRAmlaEbJSNMRW7qRE1UUUkYBjhw6T7Rfqvoq/p6W72ZM06WIh+SRGg0pKy29p8n0oozI6VKspB4T/VCZ7qpbPfB2onBVNMtkQqg+f1OjIMTYQRU7uEze0okiUaXSPrB1YkvOGWJNz5DEMrSGAqBZJvmFsV3GofoJVlBAs30KbJn0hXw7PuCDBSEGRG+GVKH6EULssMPN+1yEt9W6ymKZLAmxfEt8qMpaRdsP87AUloSYm+WVVVanq9csk/LLjCzrVYT1cVRGljtXIk8KDWMigpCD2wk5XOSgeMJIhiQBjiqUaPLPKHlrJiJP2gigRnlrSBaMbHO4dLbKNLqNYv01sZemWXGN8s9SRNt+qMxPrP264XKrolkmhQtuRSU6JGWIQXEQmrzVEkUMgRCzGmWy0ha0AIJQI5FUKmshKbw+b0JMWBapX4kZYh4X+gAiISZbJgsiiEUJsWyCWJJlUhcMSqoivK3WlU7kkn83JcSy2C3LKgkx07ZKQszN8soqq9PVa4KYy5cZRTt3ZyhKqL5NhliayEMZFZEq8qRRQS7Dz3VB8aoMMWvxULTSEQQq4wEPErZVKiFmYLHV5Z+Z0nvUHDVK3ynZbU4IMemZJYs1th/KaIiTpUuXRr6Y8fFxnHTSSagKGU+PPPKIu971Y3le8QmxZiNqmUwQxGIHl8lFXBbEPA9oGVomubUwSpcZZYhFRmM0tExSb75MQ/WFCxUqFWB62jxU3yZDTKyCEWIsVD9yo8OmMbBMAsmCmMoyabWORSNz5NKJXPLvpoSY7iG9JMTSK4+HqqJs2zwyxFhbtturJMTKKqs71auCWC/mP2Yo0Qoo/h6ZxoBo0opKou3QROSxJGuU1JOlZTLSVoJwEcmpMrDlpW0DL3y5TOo7YRpVvygEnDid1l6qsrMmWDTZdKo+kZdnsF+JkUMUGlLX9wgAoOs7cfCEfg/VNxLErrjiCv7/VquF008/HZdeein22msv1/3q3/L97hNihAwxOVRfPLDZyTp2ATC5iEsZYmKOF0AUxGwJMc1ojEbWxNwJsbYgxsU+mwwxr71eQMq6eV5b6CgCIdZgF/1w3+wAIRaxTPYDIWaSIaYTwGSBzFRMK8stIVa0hyqXAo4rQawkxMoqqzvlUhArwrmul+neDKWjnFiZEE3krCdHOVU6skZsx4SgSutTknARyRDTtGVkAySGzhsHxVOoJ0pWnI4QI4bcZx3h1LrvBPFX1/fU/DOCWEntez+UkSB20kknRX6vVCo49thjsWzZMqed6usSCLGuqbQGGWI6y2TMd25rmfT9uGVSJRamEWLh4lMzxGTL5EScpEoVjcT+6ErOVTIZZbLV4p9ZEWKmGWKsjUajeIQYC9VvtYSRiMJi4grxRjRJ+GXbPbMgVoSbc11lyRBLs0iWhFi26mXbjUsBR27L9iE0L0KsE1bOkhArayZXHoRYUUL1s1bR7yEMSkfuyJ+Z0DBpWU9GFBLRUphEPbHp0toiB8UntCUPCpXUlqlQwv5PESLJGWIa8VBHKsnfDdXOGhMPRXsphdgiiFim9lJrIs3QikvZF9Lotn6oTE+s/aoi5lqep8/J6kQl3EiLBxc/Geksk/LBZWOZDO24niRq+aoDNpEQi4pptoRYJEMsTTQCzEk4gGaZlAkxWITqy+SbsDlJYp8siDWDde1ahlgl2E+UhBhgZJtMtkwG5YwQA4p5Q6ujvuTfS0Kss9XLtps8LZO25ZIQy5OAU1VJiJU1k6vXLJMlIaYsHbkjf0YRJagkT9ZRHylkjUqg0maW6UQegg1QZZm0FnlEwU+GGxRtmYT4q6ZTZnpRvucU8TBpusi2MhjhNG0bmHzPJHE0jeqijJBJoek0eWv9UpnOzC5Hmbz00kvheR7e8Y53OGtzRpZXsFB90TKpCtVvAajVIn2NZYjZWCZlQYzROWETVoQYE7NMMsQka6FzQky285kSYpU25QUQw/4BbYYYWRArgmVSQYjFMsQAJ4KYs1B9ebTOopWJZdKUEMsSyF9W71ETYrn8vl2JQS4JsTwJOFWVhFhZM7l6TRDr5ZcZGSopXF31mVYkAIGsYffOoIk8VEshSXQhimtJ66fqe0yAUzyHubABkmgz4oiVqr6K7aRZ90j2UkJbEXspxYpLzaYz6buOTqSIlVQ7K4Wmg3400X4oI8ukXNu3b3fSiTvvvBOXX345DjvsMCftzejy/e6H6ifcbEdC9SXLZCRDLFyD2FsME8skEzAkQoyLLSaEmImtMFhY+/9SqL42w4yVzXqyMsgQEz8z6heQmCHmwdPvd+JDdIEsk2yUySZEsVZYD4McMfEiEiHEVG+5bERrcftmsXPlVSaWSVNCTCd6lIRYerl8qCrCQ6JYeRFUun2n0Qi2ZdI5jypiUcS8khArqyx69aog1ivr46h0QgUr0yB1VbtiOzrKSvwsK6VjLbpYUk8qGyCF6krNENNQSDb5bqplUkP1TbLidDZAlauJRMCl0YImfSfQiU6INAd2z36orj+J7dixA+eddx6+9KUvYd68edppJyYmsG3btsi/nquiEWLCBVcXqg/ETxyxi4lNhlhI/vATie+FyzIgxHw2b3Q9EksSjlRZW2ThKe2GRRZqCJZJ3rRgmTSycgKJGWJkeq5gGWK+H1omRUJM3PQOCTFmmUwVDxMXUHBCLItlsiTE8i2XtpuiUQYuBRyKYDQ1BRx6KPCiF2VrR56uiIRYKYiVNdPKpSBWhHNdHufuHjiuSYSYgUBFfvinkDwZaTNqphdJVFJZJjMSTcaW0IwClc4yGcn9ykqIKdoiZYjpCDjCd0Pdr4wEuLTlEcRKFwMC9EN1XRB729vehjPOOAOnnXZa6rSf/OQnMTo6yv8tWbKkAz3scHle9wkxSqh+eHKoaASx2EnU5K1WQoZYsxYKZKomiISYrWWSLByZZERltUya0G9ivyIZYi0aYSa2IVsmwy523TIpZoiJFzcLQiwpVF/35oZUMiFWtDIhxHSUVxohphPaSkJMXf0Sqt8Jy+STTwL33QfcdFP2PhWdECuK6FlWWdQqCbHkKsL6OCoKISaTLjrrF5Ws0ZI8MlljSZupcqrS8sHS+k62e1IEKsNBA2xFHnn9VP2iUF2RtkDYVppcLGWGGEEcdbpfZRQYqWIl21apwm5JiHWvrrzyStx999345Cc/SZr+fe97H7Zu3cr/rV27NucedqF8nxNiXfPxJjxMiBli/OQuCGIRoQZtyyRfjyyWSXaSqFbCNk0IMQNbYTBj+/9S+LwxIZa2rhaEmC5UXyvUyYQYI/5gYSeNEWLBd91xQSzcT9qWSZEQE/ZjA0FMlZWnGmXSev2KToiZZIg1m8E++/WvA489ZkaIpVkxywf6ePUyIdZpwYhNQw3LzzqdS4G3JMTK6vXqtbxEl+fuIqyPo9KJSPJnzkQe8T6RIK6l0WZJoosy08vSMqkaOdHWOmqa+6UT4EzEQ9EymTQAQRqpRNpWKppOJhGF3ykkIDXTy2gEyYxUF9nOShR2+
fr0KSGWKUMsS61duxYXX3wxrr/+egwODpLmqdfrqNfrOfesy+UJo0wWwTKZkCHGc5qaSCfEslgm5VEmQ0LMLEMs+NjaMimEzxuP5pg3ISZZJq0yxAShj7xtms3Chuq3R0AV9hGXofqaN32k6jVC7JprgDe8ATj3XOADH4j+LU0gE6skxNKrJMTM20oTjLJmg1Gn67QltBSUy5rJ1auEWC++zMhQupEZ+WcEyyRl9ME8rG1MkFAt0zRUnyKUkO2eBiH3toMGKPueIuYllSpDLKuopAu5t84Qy2A7NBFaKTRkKiFGyCNTZSOXofodrrvuugsbNmzAkUceyT9rNBr4xS9+gcsuuwwTExOoVFIeznuximaZFC644kEfsUzWarG/A4gr8zaWyQojwsK2q8wyaU6IkYUadrPheUC9rrZM6oQjE8EjQ6h+C4FlsgXQqMKkDDEQhb5g4cFPmRDrsmWSZ4hByrdj5cIyGa5XZstk0Qkx0wyxTZuC/2/alG6R1D2kl4RYeuVBGRRl23ZaMJKn0VHHaX0yJcRcCn6U9SuK6FlWWdTqVUGsF19mZCgducPKROTJanMU56VQXaLIo82pIo6KSOmTyQiSJAKOuq0c5buliTxZR7VU0XRZvxsqLajbr0yEVtKgAS0ikZZxBNB+KStBbHp6GjfffDMefvhhvPa1r8XIyAgef/xxzJkzB8PDw6Q2Tj31VPzxj3+MfPbXf/3XWLVqFd773vf2pxgGRCyTRSPE0kL15ZO6U0JMtkzq+i2TVOyExsQsaobY0BDgee2TClXsyTlUXybEmsJuQhbqkgixtG1TNMskE8SqTBBzF6ovkpABIRZ8ziyTqdsqqWY6ISY/2LNtK1sigeh2T6O+TGiyfi2XD1VFowxcCqA2FkbVPcdMIMQoIf5F+Y7LKotaLgWxIpzretnunqFIhJiBQGUahJ+VstIFxasIMZJw4cImRxnJkGIDFIk0wnbQUkjUkSFd0XuavqsIMYpglIkQM8kQIwqoWcVK1QigpWWSWI8++ihe/OIX47HHHsPExARe8IIXYGRkBJ/61KcwMTGBL37xi6R2RkZGcMghh0Q+mz17NhYsWBD7vK+qCIRYwo17JFQ/PFhVlkl2gMZOMuJFPOmNPCs5Q4ydSKoGlklbQoz1a2goMr3VKJOmGWIWhFhDWJwRIea1c7eM89WKapkE8iPEwr9ntkwWnRAzsUyKIphMdQFRQUz+uwmJVsTt1I1yabspGmWQF0HlShCbaYSYSwGurLI6XSUhllxFO3dnKCNCzGAkwzT7G4WyIolKopCgWRdXVJdxpldGi6aOshLnJW1PF/SewXcT+Z41+W6U3DkK3SaKlbZ2T8r2VIqHjo6bpOn6oYzvqi+++GIcffTR2Lx5M4ZCwQAAzj77bNx4441OO9eX5XnFJcTEUH2e04TkDDFZuRYPfio5FSPEQlFINXsiIRZ8bJwhFu7ffL09YjC/CQFkQoiF7WoJMWqovosMsfHx9sdFsUwKF8CK4wwxZ6H6RSfETC2ToiBmQoilCW+ymFZW71ETYnXTMplVxCoiIVYKymXN5MpDEOvmdaTMEFOWjqri01BECZlC0ogN5JwqgrhGJsQMQudd2PIoIg8pb41K05kExWuWl0ZZmdB7upETld9NRlGJLLRmFCuVAyzYkpUqYbckxGj1y1/+ErfddhsGBgYin++3335Yv359ps7cfPPNmebvifL9zgoKqkrJEIuE6rcQz/mSLJN8PWyshGHbmQixiiEhJglixhlieYfqi4SYJIiRhbpZs1Dxw5MlLDLEBLskAPjN7lomvUpwKsuTEPOlC0dJiCEqclEIMZWY9utfA4cdli6mldXbhJhLi58NIWbbDrUtqgV49Wrg298G3vpWYO7c9H6VhFhZvVi9Roj1cv5jhopRVZQcJ0vRxdTaZkJ1ie2r1sVoJMM0yyQhuJ0UhO+SpiOM+kgeNICQ70YWjBK2leq7IRF3aUJrRrsnhRakWELF6ajfs/xZv5XxWjebTTQUtMW6deswMjLipFN9XQW2TFamgu+92WpGLZNhqH6EXALiJzUbK2ECIabccV1liCVYJkWSypll0iZUX9zOvs/XCzAQ6kRCzGuZZ4jt2BFtuhsZYoIY41dSQvUNCLGkrDzKW0pSFZ0QM8kQMyXE5Af5664DTjgBuOSSkhCjVC8/VLkkmopIiFHb+vSngX/8x0AUS6qSECur16vXBLE88h974Lqos7KxIhFGBFFJZW2zpnQsLZNZbYdpAhUl88pEVIoIVJZ0m2movomolBY6r7V7yuKhC6E1o5hHIeBUGWKkwROIwm5pmSTWC1/4Qvzrv/4r/93zPOzYsQMf/vCHcfrpp7vsW39WgS2T/mc/CwBo7BqPig4phJjSMkkViuTA/oqBZZJniIXzmhJis2YBEAQSj0hS5WWZzEqIJWWIeUQraLDw4KcsiHXDMinc3HYiQ4xRiZktkyUh1v6dUcXr15eEGKXKUH1auSK2XBJi1La2b4/+TGurJMTK6sXqNXt4L7/MyFAkQiwH4cJkBEknofomlkkqQaUTuwzsnmSaLkWcSVqeyfZMy+Ei5a2pMsQIfbfOLBOF1ox2TyOaThhEjLQ8wjRJ0/VDGZ+ZP/OZz+BXv/oVDjroIOzatQuvfe1ruV3yU5/6VB597K8qAiGWJIitfzz4qDHFD1atZVJHiGXMEPOZathsAh/9KPCznyWTVH6b8ALMM8QyhepnsUy+5z3A0Ue3w+uTCDFhceR+JRFi1G0jBOoDXSLExH2z0h5lkt8IZBTEGq3oKJNlhpji9yyEmDxvSYilVx4PVUXZti4FHFeWyW4QYuIxkTaNbjqqRbOssopYJSGWXEU7d2coE0LMREjQCSWi1czaJicJJap+mVomqSKPru+kPDKCwBjJEDPIW8s68iWVgKPaS+X2rftOtB0aEVu2opmKprMV/AikY7+UcYbY3nvvjXvuuQdXXnkl/vCHP2DHjh248MILcd5550VC9suyLN8vFiEm3EBUGqHo0WqhwUQBQRCTD77YyTbNSrhhA/DqVwNvfCMwOBguVGqb/c769bvfAR/5CHDgge12ZJKK3TuYUlC6DDGdtTALISYKYt/8JvD448B++4UdTyfESP3yfWBgAJUpQ/JNbEMWxLqRIZYgiLVFLOHm10WovucBLQeCGBBsR1kgKkqZWCZFQqzRMCPEZLqsJMTSq1cJsbR9zrRcWSapIh1lOlPajLo8CiFWhO+4rLJMqgzVT64inbszli7snE9DELtMxI00a5ss4GTJXvLgoSW8qM0zKF78zESgImeIEYRBW2FGmYtFsXtmsAGaWAqpAx6Y2Fmz2jiN7ay6768kxMwFMQCoVqs4//zzXfelLCBCiPnd2ikTbtyZ6NFAE81GIApUmoiNMskO0JhynSYU/exnABtY4S1vCX4mZoiFbTFbybZtwPBw2NEEQowiZgnzxzLEqGIPRRDbsAGYN09vmWT/37Ur0i8Rz5UFMa2IKq6X50WEPuN8tSJYJoVt64X7iXhByssymTlUH2gLYkV8w2timTQl
xGQRgv29JMRo1auEmGtBzJQQSxLMTaku3XQlIVZWWfRyKfYUgRDr1XN3xtKFnbOyopA04gY1mJ5EpKUICZ7npQolroLiAZp11MQqaBrcbvv9KUdOdLQ88TPddBThM1VopYh5BqNMUmlBEglIEOmSltkPZSWIPf7447j11luxYcMGNKUT8tvf/nYnHevb8jwuKChzsjpRCTfSbUKsiQYTxCiWSRUhprox2bAh+DkxEcsQ4yeJCrNMhvMw0Whigmd+xQmx8KRAFWokQYyJROSsLc/TE0Br1gD77w+89KXBSGJiiYQYW7fxcfV6SaH6vufrlX0pGy2Sb0AlxFgbRQjVTyHEKi1hW7ggxKSbpEzr5/tqEagIZWKZNM0QS8ofazRKQoxSedhuirBtXQpi8nm3KIQYVaAqCbGyyiotk7rqYULM9sHeNGeMIrqQ8plSrGa+50fvJbNmbDkQ16xpuhSro5P1MxxFk0IUUvtuK2KZDtaQdXnUvDWK4FdaJttlLIhdccUVePOb34yBgQEsWLAg8oV6nlcKYlmraJZJJSHW4oQYJVSfH4Rplsmnnw5+Tk7GM8S4ZTIUhdiNgEhRJWaIhQd8uPjUnCy2T0vCkZHYoyOA/vCHYP3uvZdGiMmCWAIhZmoFFbfDlKllMkaIdVcQ80JBTAyZjFgmDQgxLoDKhJhryyRQzDe8nSTEdGJaSbjEq1dtN7rvPq+2XGaIUaZzaZksCbGyer16TRArCTFlUQgx16PzRUQll7Y8iqXQMqdKNbIghW4jZVnpthWIhBihLapF02gEUAJF5qItJ4QYYbub2BxT7awWx03SdP1QxoLYBz/4QXzoQx/C+973PvguTuxlRUuwTHrdungnZogFnzfRRKMRiDWiZTIS9g5FqH6alVAliLHMMHYiqQQ/eag+I6omJuKCmCNCjJ8wfJiFz6sEAgDYtKndZ12GGJUQowp9GkIsjBOji4WxDLHgZ0cFMUFsUWaINdXTplUqIebCMlkkMUIukwwx02D8LGJaWb2Xq8NK7kNWQkzXtupzCiFGDdV3Ja6VhFhZ/Vwuz3VFuN726suMjEXJEDOxAZJFF/mFvWI6F7Y8EmFEIZXEF+HSZ8p+uRR5DEZ9tLZMCs81RrZDAkWWNp3J4Am671jM9LImtgxpQSM7q0bYLQkxmI8yOTY2hnPPPbcUw/IqzxMIsS5VUoZYKIg10OKCGIUQI1smN24MfopCkUSIJVomp6ba8yQRYiZiFhAjqYzEHt0NyzPPBD9F4Y+VShAjZoil9mnffYOf++8PIJoXZk2I+ez76LJlsloLPnJAiIkXkVxGmQSKJUbIlSYoJNkeTQkxU7tlWb37UOXSMkkV11xSViUhVlZZbqvXxP9etbtnLMook85EF6K1zXTUR13fKcSPjYhF7RdFVNIJM2k0nY3okrZ+JFHJgLKiTmcrKikHICCItlnFQyotSBU+5c/6rYzX+sILL8T3vve9PPpSFhBYJsP/FtUy2UQLzfBhlZQhxtbDxDIpZYjJhFjMMgkkklRMoOEklWWovtVojKaEGFsfccQ+HSFWqdByzQBg1Srg/vuBb387Nj0nxKjbhgli4UAG3bdMBv1uAbkRYvLFM1VY1S6kQGKEXGmCQhZCLMu8ZfWu7cZ1hphYJSFWzPNMWWVRqlcEpF49d2esWIaYraXQgKyhBtMbiy62mVcUUkklwFkKVKYCnDa43UDkoRBwqaISQeRRDXhAEQ8pGVupQqujAQ8opKNTO2tpmTS3TH7yk5/ES1/6UvzsZz/DoYceilqtFvn7Zz/7WWed68sSCDG/W9fuJMskEz3kUH1ZgAoP9thBb2OZjBFiwc8YIQYkZ235UWEsVag55RTghz8ETj45Mr0o9pCFI9UNGBPEdISYSIrpMsR8n94nADjgAP7fTBlizDI5PAxs29Z1QUwkxPiFRNz0DkaZZHuvk1D9It/Qmlgm0wgxedosdFlZvUsZuMwQKwkx+jRllVXU6rUMMZfn7iK/UDMsEiFGyPQiZS+pMsQoFkYd9UTNqTKxTBLaSesXRXwiWRhFms5B/llSv6mh+pTlUcVK08EF0vqeln9G2e4mxF2qnZVAAqoIsX61TFoJYtdeey1WrlwJIPqF9quq6LQ8YZTJbu2USZbJ6dAy6Qmh+kIfxRNa0Ix0UhP3D9WFnJIh5kuCmCgcjY2FHZFIKlPL5HnnAa99Le8vP0GZiD3shkVHiIkkXL0eEGOiBZSVbJmUCDFbAcpJhlgRCDHPg+f5QEsTqu+CEAuBWqeWySLe0JqG6rMSqUbxM3HakhDLVi4pgyI9VLnMENOJskmfuyTEKAIchRDTnbNKQqysXq9eFcRKQixSNoQYacTANNGFkItFEi5aNArJxDJJCcLP1JbBtkrLWzMh84wHDchZrCQNLmAiHqbkn5l8zxSbY+rgECbrl7Jf9UMZC2Kf+cxn8N///d+44IILcuhOWVHLZJcq4UGBEWIAMM1C9VvtXiZZJiMHfVLY/OQksG1b+/9po0yyvojCkfSwyEkqyTpJEjKEEwIfddBE7NHdsIgZYqz/Q0OBIMYEPgr5FhJi5FB9qTJliDGRjtlKuymI+T58vwI0wuWrLJMmo0x66lEmKSgzuXSCabcrTVAwIcTSRplkfy8JMVrlQYgVYR80sUyuWQM88ADwwhdmaysvQixrW+Ix0YnllVVWEavXQvV79WVGxpIfvnMVXYT5SLlYBKsZ1cJoIuCQRwMkWOAoNF1ahpjRqI8UYSZF5DGysxJzsWwD+inbijwAgcV3k7bvUSyTJhlw/VzGZ+Z6vY7jjz8+j76UBUC0TBZilEmREGu0/z/VCIQb8aBPDNUXD9QkIYAF6gOBMJSUISYTYqJwJC1DHmWSnCEmNydmiJkSYjrLJNAWu0JhSSmIsf87JsTE78U4Q2xiIvg5OBh8rBLEzE8vZiUIYl7YLzFUv5LRMtloNdoXt1Z7e5WEmESImVBeOstkSYilV68+VJkIYuefD7zoRcC996r/XlRCzKVlsiTEyur1KkP1k6sI6+OoYoQYxTKZUXQBUgQqA6tZmpAgi122QfEqAY7UloOwdZO8rqyjHVIHPKBYUNPoPfmZ1VbEEvtOpe7EtlXLIxFiKXZWkxFOnbzon+FlvOYXX3wx/v3f/z2PvpQFBIIY+2+3GLGkDLFG+/9T04FwUxH6KJ9klQdqkhDA7JJAlJxilklJ1OJ2OJ0gloUQE5tjJ3O/3QZ5pEqdZRJoWw9nzQp+qiyTUpvyKJPcCmoo9Hmex9fNmBBjwp1OEMv7xMoEE0aIQbJMNhXTEor1mwlfQGiZJNwk0RdSYEJMJU488ghwww3B7y4JsTJDzKx69aGKKmIBwFNPBT83bMjWlivKyibEP2uofkmIldXr1WuWyV59mZGxYhlilrlKJlldkekoVJelHU2c15UIkko9WdB0um1lGtxOsmhSxcqMxJYLy6QJZZU2WIPJ8qgDSLgixHT7S7+UsWXyN7/5DX7+85/jJz/5CQ4++OBYqP73v/99Z53ry/J9gRDrUh+SMsSE/zNCrCJoqiLyKv4
kEWKiINZqtQmkkBDjB21IiHHLpJghJi3DOkNMKm6hMxF7dA+boiDGcs90hBgrFSEmhOrbCDS+56PZappniLHvp14P2imCZRJAE8LbHnE/tiDEZEGsbwgxlaDwmtcAv/lNYFVLetAuCbH8K48cmiLsgyaEWJpgRG2Lsn+p2pJvKqnL6yYhVh4/Zc206jVBLI+XGUU4d2csEiEmiTwuRBdKrpIJRZaUu0SivwgUmTLriSJ26cghou1QSyERaDNSlpVCrLT9nk3z1lz1nWwvNfiede2Ifae0ZZtZ1i9lLIjNnTsX55xzTh59KQuIEGJdAxeTLJPToiAWCDa+cPDIJxflAUaxTAJtoUi2TOpGmZSW0SbEwkVmJMSankCo2VomJyeB7dvbv7P1ZIQYRRCTCTFqnxRV8SqYxjSmTdeL9bMggpgnIMS5EWKEiwt9ITOMEGNUztNPxx+02e8lIZZ/9VquDiuXglhelkn2e5ogVhRCrLRMljWTq1cFMZeEWBHvHwzLiBAzCKbXCU+AoYWRIKYk3Q+a5IOZCHBJbVEscEYWRiHTy7YtihBEFStNlpc2cqKJ3dMJIWYoRKa1Q+0XyTKpmaZfylgQ++pXv5pHP8pi5QmjTHbr2p1ws11RZIiJhJis3CuV8iRySiTEgHa2lhSq3/AAtKAO1WcVI8TalkcgW4ZYZkJs8+bo7zIhRrFMRgixCt3GqSjZMpm6bQpOiLU84UbANSHm9QkhphIBkoQrU8ory7xl5UOIFWHbmlgmXRFi4vxJgrm8DFVblGnk6TpBiJWWybJmcvWa+O/SMlnk+wfDkh/ASVSQJWVFteWZ5EalWSZNLIUmgl/SMk3ELp2Yp8wQ0wkquswyg/UT+2WbFacSD63z1ghRKaoMMUpmWVaRLtJ3y++Gsg36pboGIZWVUJ4Qqt8tpTbhTbYvjDLZJsQUlklIlkkVIabLEAPa2VosQ4ydBHmovkmGWLhaGQmxhiD2kMPn5QcR0S4JtNfTxDIpEWJZBCi2HswyaZwhVgRBrFLhhBggXLiE/dUFIebUMlnkN7xpgpguVD+N8soyb1m9m0NjQoix/YIqYuWdD0YlxFxaJim0WUmIlTWTqwzVT64irI+jihFiljZAE3EjMp1ueSaUVcL3ShHzbESspH6ZBOZTtlVacLsNsUWl97ISYmmWSVcDHijtpQTR1pY8NB0tNauQ3C9FIsSOPPJI3HjjjZg3bx6OOOIIrYJ49913O+tcX5bvt0P1u3UTm3DjHiHEmskZYjIhFjnAKBliQNwyyZRuH0BDEDtMRpm0JKmYaGQk9iQ9bD7zTPR32TLZagUPelRCLEOoPhAnxMiCGGWUyS4QYoAgWIm7mAEhxrZjPFQ/HfsmV5Hf8KoEhTwIsWazvR0ajZIQo1SvPlQV1TKZFyGW1TJJEddKQqysmVy9Zpns1ZcZGSuWIWYZFG8iPAEpGVsEUYmyPHGZlOWZCE9p07mikNKC20n5bgYiHUAT6kjZWSmh+jaCkVY8FOylWW29VJqORDFmtOv2S5EEsbPOOgv1kAQ566yz+hqpy70ihFiXKjFDrH1y0RFiXBBTnUSTHsKIGWLNcHGkUH1+kQkXmZEQa3ptUc3aMikTYrJlEoiOshnpiIIQyxiqz8RBcqh+UoZYo3iCWEW8WbQgxNg+DpSEWITKSaK8Gg0zQkxnxVTNW1bv2m66EapfEmJllVXc6jVBrFdfZmQsCiFmlYulEZ7S2jIRjNIyZU0D5VP7lGKZdGWTi2SIqeAGg7ZMsqyAFFET9OWljshpk+mVhRCj2C8pdl0F6ZjZollaJmmC2Ic//GH+/4985CN59aUsIBDE2H8LliHmNVvwWoEFkYkFkQwx6S2A8m0H1TKpyxCDKSEWfNYIkSHbDLGGD3peV9J6JglijBAD0gUxmRDziX1SddOWEGP966Zlkgkmvg9PEAj4BcKSEEsM1SfclJGrCDfoSdWpDDFx+jJDjFYuH6qKRBmYZIh10jJZEmJlldWd6lVBrCTEIkUhxOR7LVuKTJlTRRC7KHY7smWSsn4Eq2BaW5TMK0reVQt66slEwKGIN5G2HC0vqYxC5yniYcq2MgnoNybECIKf7aAI/VLGZ+Zly5bhGdn2BWDLli1YtmyZk071e3FBoVs7ZtKb5WaTj+44xSgcVYZYOI/ygmNqmZQzxEwEMeENh2gtNBUymNBkZZm0IcSmpjpHiEkZYuRQfVacEAvWs0iEWCRDzEIQm2q2v4NKK34D1LOEmIll0iQHLE1MKwmx9MqDECvCPjhTCDFVW3kRYjqqtSTEyur1KkP1k6tI5+6MRaKjZNHMktiKkDUEksyEEKNaJknrR7BVJi2TQqSZkEPUtjKPnEik92yINDK9Z9mWihDTWjkJlklq3ppRNp1uXy8JMXNBbM2aNWgobtQmJiawbt06J53q9+KWyYIRYqIgNhlmiJEsk+IBlnQhJ2aINcNQfaNRJiXhyJSkEi2TmUeZlMVkFqpfq3HxL5EQC9tMyhCzEWisCTFWBckQ8wRBLI9Qfa/V3u7ckmmR2dZusABvrJPKJFTfhBBLC9UvCbH06lVCbKZkiKmmKwmxsspyX2WofnIV6dydsUzELj4NhRyi5lQRBCOduKYTecT2KcuT51EtL+0zkvhkuK0oQhY1BF71uzxf1hEyTUcAzdqWvC8kTUcRqEwEuEi/KGKX5b7eL0WyTALAj3/8Y/7/a6+9FqOjo/z3RqOBG2+8EUuXLnXbuz4tbpnsVgeSbqQbDVSaAU00rSHEYpZJFSEm39AzoWh4GNixIyaI8ZNEqBL6hAyxCLEmWgstLZNND7wN61D9JEKsWgUGBgKrqEmGmDDKpI1AY50hxooLYr1HiLF2vFZwLMrZBT1LiKVliLkixJLsk6p5e+DG30m5tN0U4SGRFVVUEqfNYnOUp+sEIUYVqMoMsbLK6j3LZK/mP2asWIYYQfixJbZUIxnqxC6TkPuk+0ESpUOg5Kif2eSf5T3qo+l3bCTmUQi4FDtr1rZsM8soWXHkbWXZd8o26JciC2Ivf/nLAQQb7w1veEPkb7VaDfvttx8+85nPOO1cv1ahQvVTLJPiAS1fRBghFjnoVULA5s3t3/fcE3jggcRQ/bZlMpy+A9bCthgHuvhkapmsVgNKbHw83TIpEmIZLZOZCbFuZoixbVupRDLElIKYASHGREHeDjwArRgh1rcZYraUlypUX8wQSxPTyuq9h0RWVFEJSM8QcylQuSTEXIbql4RYWb1evXauy4MQ64Hj2hkhJtscM1BPvmSesrUmiu1Ts82S+pQmWsltUagu021lK+CYZKRF2rIV84giDykrjpJZJm3PtL4b2RyJNJ0zcbSPLZNkQawZnniXLl2KO++8EwsXLsytU/1eXBBrduninXSz3WyiEssQawtDiYRYmmWS2SVHRwNCDIhniLGLirxtTMLnmZhlaJlk4pcTy2QaIQYYE2JZQvXdZYh11zKZJyHGRGBfuuj1LCFmkiFmkgOmIsSoofpFEG2KUC4JsSLZbopqmcyLENOtH9vvXRJiRTzPlFWWrnpVEC
sJsUiR6CHpQT4tx0k1j9w2ZTRAXZ8o5Jc4HSVbKun3xPkIWWOUbZWat5ZxFEaTjLRIWxnFvDQboElWnEkQfpa+y98fNW+N0i+tJbQkxMwzxFavXl2KYTkXu8R1LVSfkCHGAse1lsmW4oKjegjbsiX4OW9eWxRKJMTCk3yj84SYVai+fMPCrKHs7yxDrFJpr3s3CLEXv4DWRmKGWPEskxXxGdAiQ4yNpMqOQwruTK4i39C6JMSSxDNVuyUhll69Gsxc1FD9XiHEinieKassXZWh+slVpHN3xqKIXRTLJIVUUs1rSzRRyC/q8mxso0llsq10oiBFuFN9RtmeqblYJgIVhd5zIFZqp1FkiJEGBKDQgkRCjEKu6b4bnQDXL0U6M1955ZXkBteuXYtf/epX1h0qC+3g+K51IOFBIcwQA4DJlsIyiehJQXmAqciYiYng5+BgWxRiRI8Uqj/N2iRkiMUIMRcZYq4IsUWLgp9M+GKWScCcEHMhiA3PAmCRIdZNyyQTTHw/YpnkgpVjQoxtdyeWySITYiYZYqKoJf+NfcYqq92yrHxsN0XYtnlmiM00QiyPDLEinmfKKktXvUqI9dq5O2OZCgCAveiimpci8ujEDZ1QQl2e1TZIsQEmLV81L9miabndTUQ6IIUQk8Q8SrZamp016XdqW2TxULIwUrZV2jSUnDsTGrKfQ/VJa/6FL3wBBx54ID796U/jvvvui/1969at+OlPf4rXvva1OPLII/GMPJJeWUbFLnGFGGUyJUNMFJciIfZIUPBVQhETxAYG2oIYK4kQa/JQ/c4RYkwkavigj1SZliG2eHH0cxPLpCT0OQnVD4k/6wyxRrcJsbb7O2uGWEwQC/cjaogqqYpwg55UJpZJ1e9iyaKWPK1oDysJsfTqVcrAZYYYVVxzRYi5JNLEv1FFupIQK6sXq1dHmey1c3fGsrFMkkglomBEEnAsRSVV+7aWUMo0yn45EuCSprMS/FLoPSOxy5K4U7ZFWT8H4iHFfmm8PEd972fLJClD7JZbbsGPf/xj/Pu//zve9773Yfbs2Vi0aBEGBwexefNmPPnkk1i4cCEuuOAC/OlPf8IiRr+UZVU8Q6xb97CaDDEuiHFCTJMh1lIo16o3W4zyqte5wMJLyhBrsBNlo3MZYs4sk5OTwPbtwf91gliaZdKR0CfOw+yBqdumiKH6EiHmLEOM7ePh505D9WcaIZZEcskil7yddYSYrl153vKBPiiXlEGRRNk8LZMUwYga0K/qF5VIo4766MoSWhJiZc3k6jVCzOXLjB4ixFyF6puKLkZtWQpBqs9tRR7q+tkIRtR8MmuBykCsjORwWQqfZELM0b5gai/VElsU8dfz4MHj7ST1wRV92S9FDtU/88wzceaZZ2Ljxo249dZb8eijj2J8fBwLFy7EEUccgSOOOAK+ixN9WW1CrFsd0GSIsVD96fCkpc0QUx30OstkvU4gxIKPPVNCTCCpsmSIMdullWVy8+b233bfPTq9bJmkWkGXLXMTqk8lxJIsk10O1fcEOo4TXJaCWGyUyVChZiMP9V2GmCwc6AgxWVhII8TKDDGz6tWHKpc5XC4tkyUhVlZZ3aleE8TyeJnRA9dFG3HG1panmtdVLpZTCimLAEfYVjYCXNJ0rgRN/nlLP50r4k7ZluW+QP3MqbDreRw6SewDhTwk7lf9UGRBjNXChQvx8pe/PIeulMWqTYh16eKtyRCTCTFRhJG93exgjRyEqgs5E390lskYIdaMzitWQvh81gyxRlZCjNFhs2cDQ0PR6U1C9UWh78IL0dx7M3DHe5wQYjMyVL9SgVepwGsFx457y2RQfUuIiWKiLgdMnhbQjzIp54+VhFh69epDlY3Ik5WgctWWDSGWRRBzKR6WVVZRqwzVT64irI+jyovS0YouhsvLZGHMyxKaRYCzWF7SZzYW1EwClSPBz7YtyjRJn7kWdsUQ/7yz9/qh+nfNC1xMEOvaKJNJ1o5mMx6qr8gQI4Xqi+0aEGINBG1zsaMDhBgT0ESxJ1VUsyHhqKH64vb0PDQW70Hrk6LkDLHUNpIsk9PdzRCD73OLsSvLJBcJwwPSaah+Ed5YJ5X88CwLYkmiFpBOiOlC9UtCLL1c5tAU6aGKKmKJ+wQ1Q6yIhFgnLKFUi2ZZZRWxSkIsuYqwPo7KJuTeJRVkK0pYh9y7soS6tAFmIcQc5lR12s5qlUfmUDy0FRhV8+YttPZDGRNi8+bNS9xpBgcHsWLFClxwwQX467/+aycd7MfitsBuXes0lsk2IcYsk4QMsTTLpI4QCzPERNsiAHiMEKNmiM2d29kMMd3gAaqsNJNQfQjrhfb27ltCzPcBz4PfAprIgRBrRT93Eqo/kwgx6kiRgBkhJgtiJSGWXr0WNM3KJdVFJaiKToh1YtCAssoqavXauS6Plxk9cFzbWNKc5kZZClTU4HYb21qnKSvbESWpy3NqL3VoZ80rvy5pOisRK8O2cim09kMZn5k/9KEPwfd9nHHGGfjoRz+Kj370ozjjjDPg+z7e9ra34YADDsBFF12EL33pS3n0ty9qJmSITSEUBYSLO8/5CgUa5UgabHrxwZlCiHHLZCj+6AQxL0r0tNACvvxlNA86MJjXNkNszggauy2ktWFDwhEtk6wYgZcl00rOECOPnsmqmxli7IGREWLsYybWZg3Vl0aZdGqZLPIbXp0wJWeIZSHE5FD9khBLL5cPVUXaB/MUsVzaL0tCrKyyOlO9RojlMUJwDxzXrkYpdBo6byOUJAlwFhbNIlpCqZ/ZCprK6WwJMUta0JqyIsynXJ7D/cqWdKQKu/1QxoTYrbfein/6p3/CW97ylsjnl19+Oa677jpcddVVOOyww/Bv//Zv+Nu//VtnHe2n4hlizS5d7FQPCuHPGCHmt3chdkAyckkZqh8SRVwcAqKEmIqcEtpIzRATBTqREDvySDT+sDvw6H3G1kImEjWGZ6E5vAh4aiNdOFKRcEmEmKFlUibEbEL1nRFi4X7RTcsk60ObENMINZpKJsQEgRV227u9kAK/4TWxTDab0W1bEmL5Vq/m0FBFHhvLZN60mUuLpvi3ThBwZZVV1Oo1QaxX8x8zFomgshAlyKKLLSFmK7rYikpE26GN2OU0KN4hIUYSebLYS3MSY5OmsxKxMgi7JSFmVsZ31ddeey1OO+202Oennnoqrr32WgDA6aefjkceeSR77/q0ukqIJb3tlgUxGFgmxYOQhcnv2tX+zIQQY9QZo01k4UhBrGUlqcT1ItsT0yyTKmuoqWUS2S2TzjLECiCIxTLEGo4JMaRfrMlVhBv0pDIJ1ZcJMZ0gJk+rGnUyad4euPF3Ur36UJVnEL5LQkzVL2rfqcRWSYiVVVYZqq+rIqyPo3JFNDklxFwGt+dlCXVpAySKa7Yh9y7FQ1eimW1bVJrOdp9xaf21EZvLUH2Dmj9/Pq6++urY51dffTXmz58PANi5cydGRkay965Pqz3KZDcWnnCzHf5kofpNRmoRQvUjBxgjxMbH258RMsRiofotqK2FSYSY0C/rDDETQUxnmUwi4Rghl
jbKpLReLNPKJlQ/Jv6YEmLhejArbREIMZ7x1RAeFA0IMbYdZUKMmuFAqiITYmkZYkmUF5AuiMk5SlS6rAdu/J1Urz5U5Ul15R2qPxMIMfnYK6usoldJiCVXkV5mZCwr+isDsdVpIi03kYe6PIf5Z0oBBzmKPHnbWSnrZyEKJk7nSMRS9itneq8fytgy+cEPfhAXXXQRbrrpJjz72c8GANx555346U9/ii9+8YsAgOuvvx4nnXSS2572UbFLtt+Na3fSzb1EiLESRRgu1IRroAzVd0SIkQQxmRCzDEPnYeqtBr0Nm1Em2WcTE50jxFiGWMMiQ6xSaQ960E1BrFJJsExmI8QYNefzzx0KYkW4QU8qnWWy0dATYroMMfl3WUxLyx8rq3cfqmZyqL4NIZZFELNZHvu9j292y5phVYbqJ1eRXmZkrNwythwKVJmC2y0II5fLyzKSoe/53FmTNJ3LnCpX343LYHqb74Y6Xd7fM0lIJgqt/VDGgtjf/u3f4qCDDsJll12G73//+wCAlStX4pZbbsFzn/tcAMC73vUut73ss+IjKXZj4Uk32+HDaUX6M4UQixxwjBBTCWIqQkzKEGuCWSYRiEbUDDGhX8YZYuH0IiFGthYmCWIqQkzMVzMhxBQjflIrJv6YEGK1Gu9TISyT4cecTHQVqh8im04tk/1IiMm/m9gt2fT9/kDfq8HMLjPEbNrqNCGm2+asXy4JuLRlllVW0arXCLE8zt1FvH8wLMoDuJXtMEPoPEls6DDJ0+mgeNW8eY9k6Oq7oQp+eYX4J35mkyFGFHZJFk1Lwa9fylgQA4Djjz8exx9/vOu+lBVWVy2TGQgxWRDjo0yKB5zOMmlKiE1MxPtLyBDrmmUybT1FsbDTGWINiwyxgYH8BbFWC7jjDuCAA4DQks0rwTLJytYymRaqL09nVUW4QU8qkwwxE8pL/t103maTE4l9Wy4JsSJRBv1AiLmyTNoSYuXxU9ZMql4TxHr13J2xxAdwsnBRREKMasvLmeShCGdO89YIy3NJbOVqL7X8bqwJsQwWRit7aRmqry0rQazRaOCHP/wh7rvvPgDAwQcfjDPPPBOV8mbLSbFLnNeNi12KIFaR/lyptHchTnHpCDGdZVKVrSVniIWilgdERTVWBEIsS6g+OZjflBCrVKLbxiJDzEagMSbExLebnSDE7roLOO444KyzgB/+MPo3Jpj4PuB5MQG54ooQC/e9XDLEinhDa0KIpVFeuqB8G0Ks38ul7aZIlEGeIlanM8TyDtUvCbGy+qF6TRDrVbo3Y4kP4InCBYHOp4ouroLGXY4GaJWLlcUGaCmu2QqR1Hvn3ESeDPuCSwEuz9FEXYmH/RyqbyyIPfTQQzj99NOxfv16rFy5EgDwyU9+EkuWLMH//d//Yfny5c472W81kwgxlWWSCTXKDDFLQkymz/wWgO3b4/0nZIgZWyZDiqrRbJgTYuJDfVqGmLhtLAgxm1B94wyxTlsm164Nfq5fH/9bKiEmfOCAEMtllMkiiBFypWWI6SyTeRNi/V69NvIaqzwJsSyCmA0hlneofhZCrKyyZkr12rkuD0KsB45p8YE8i83RJBfLtK1Mwe0EwYEkXNjaHC2pLtXntlZVW3rPmQ3Q4bbqtFiZxe7pcvTSfijjJ7q3v/3tWL58OdauXYu7774bd999Nx577DEsXboUb3/72/PoY98VJ8S6snB9hljcMtnWVHPNEGOWyabQj23b4v3PmRAjj1Qp5oGxSltPqmXS0XqJ8xQ2Q4yJIipBKyFDjJWvo480xb7buGXSoSBWhBv0pDKxTJpSXiUhlq16lRBzmSFmQ5tR21L1q9Oh+iUhVlY/VBmqn1z9Roi5pJ5chZETRSVXlkmXoovLAQFc5lS5+m6sxVFbAs5SgHNqZ7X8nql974cyJsRuueUW3H777Zgv5PksWLAAl156aZkr5qi4oFBAQiweqq8RxMKLdeSgVFkmKRlikh3TowhiMiHWyQyxWbOCn2Nj7c+oofqGGWJZQvXleYwEsYGB/EeZNBDEXBNijJpja+EULS7CDXpSdSpUvyTEzKtXbTdFtUxSxC7XofolIVZWWaVlUldFvn8wLBIh5tAm5yzk3qFlsuO2Q5cZWzbfDXV5OX83rgZryDOTLUtWnEu6rR/K+Mxcr9exXWFV27FjBwZkMaMsq2pniHVh4aah+kJuHCeXwjXghFiaZVJnJWQZYjxUXyDEtm4NpqkKum6OhFij1aDndTHhz8Qaapkh5oIQY5Vqu5QzxMK+5C6I6bJ7kgQxS0Isbpn0Ip/L01lVkQkxnWUyKyEmt2XyHRVxW3W6ejWYuaih+i4JMVMBLgu1pvq8CN9zWWVRq9cEsV49d2cs8fkgC8ljKyTkLfLYjFjpcnlZKCSbfmXJW7P5bmypNdV01gRcnplsLoXWDOJoP5TxE91LX/pSvOlNb8Idd9yBVquFVquF22+/HW95y1tw5pln5tHHvqsiZ4jFQvVFQgxqQizVMsmEIlWovkSIRWxsTBAT58sjQ0yYntNYaW2oBLG0UH1bQqyZgRDzMxBitRrvFxfEfMeCGBNJdIRYpRJYJnPPECsJMacZYq1W9Pc0QayI26rT1auUAVVUsrFMFoUQo4Tqu+qT6vMifM9llUWtXhXEeu3cnbEihBhVVHJI6dgGjduOnOgsQyxLThVRPHQlBuVKiFkG76s+txXg5OlcZrKRM8QsqUKnL/pneBmv+b/9279h+fLlOO644zA4OIjBwUEcf/zxWLFiBT7/+c/n0ce+qzYh1oWLt2GGmCpUnwli3N4oHqi6USZ1lsnwwG63ibZlMkkQc0yIAYJIktUymSVUP09CzDRUP+yXuF808iDEbCyTzgix4PO+HWVSR3WlEWJy6SyTJSGWXr1KGeRpc8zSVqcJsbwGDdD1q6yyili9Fqrv8mVGEdbHUUUyxHIWsVTT5R6q78jC6HIkQ6oI4oo2sw7xdyTAZRGobPLWXI5qmUU8dEVD9ksZZ4jNnTsXP/rRj/Dggw/iL3/5CwDgwAMPxIoVK5x3rl+r0IRYzDJZ4/+XRSttqL7KSqgL1Q8P2kioPiPEau0+5JkhBhgIYqaEWFKG2KxZbVEt3I5Jo0zaCDSZM8SAmBg17RPbopTOMsn+lhCqXxEJsSyCmPS5PJ1VFfkNb1qGmBwOTgknV/1dFtPS5i3itup09SohlqfNMW8aqyTEyirLffVqqH6vEG+OikSI2VjNslBPFiKWU7tnBuqp0/lnrgYNUH1uLcA5tDCa5K0pIRBpmtTlUek2R/ZL6nHTD2UsiLHaf//9sf/++7vsS1lhsUtcV8BFwwwxFSHGyCVumcxCiOkyxBghVqvx6fIgxETRiAetZxHEKKNMMgFneLgtiCUQYmQbp6IyZ4iFn+UmiFEsk74PeJ4iVN9AbBHnS8gQky8mNhbV9kIK/IY3LUPMJFRfrpIQy1a9+lCVVw6XbrqSECurrOJWr1km8yDEekDkpljNbEZ9dBrQbyk2qD7PPVTf0fJUn9vSWHlST1ky0pzaLwmko8vvxiq7LYNY2Q9FEsQu
ueQScoOf/exnrTtTVlDNEA3rCiGW9DCRGKqvGWUylPYiB2pahpjJKJMiISaIM6xcZYiJ/Z9qBoJYqhhiM8qkKBay72H2bKEj6gwxl5ZJ6wwxMQoqD0LMapRJO8sk2z/Yd83aLQmx8P+6UH0TQsyULivitup0ucyhKZIoSxWxbDLEikiI5Z1rpvq8PH7KmknVa4JYr77MyFgUq5lLC6OrEHinowEaEFudpJCsaCyXYqXl90z+bmyINMJ2d5rJ5lJcy2Cf7YciCWK/+93vSI31s7LosrqaIZb0MBE+fMih+r5gmYwJYrpQfeook1KGWMQyKWaICfY9VkkkVZYMMbL4ZDrKpBiqPz7eXo/hYaEjCYSYw1B96wwxQW/qliAWC9VvFjhUv0hihFwmGWKmlJe8HBNBrIjbqtPVazYiVnlaJl3SWKp+2RBinVi/UhArayZXrwpiJSEWqQhZQyWVLIUZ5XSuqCeHYh4luN1lThWVNlNVnmKl7ffs8ruxIfMo319Sv6zzzwjCoFKMJYqH/VAkQeymm27Kux9lCdUWxLqwcEPLpEiIyRQXzxBLs0zqRpkMrZDcjsmoM5kQY2KShhBj/cqSIab7LFJZM8SY2KQSxFwSYqCdfOU+AOisZVJ145dGiE0bjGAozicJu+1Q/T4hxNIskzpCzEQQMxXTiritOl39Qoh1ImPLlf0yLwHO5frppiurrCJWGaqfXEUQ+BxVXoRYliB1m2wpp3ZPjWDEXuw7FZUIbWUReZyOnOjyu7GxzxL20UyDQ1iSh66suP0MNllniJWVX/FQ/W4s3DRU349bJmWhRkmImY4yGTtJIJohRhhlktNlphliCotlahtZRpkUt43KMukwQyxGiKW10a1QfR0hVqlAFaqfNUOM/56HZbIIN+hJlWaZLAmx7lWvhurnlcNFnS6L+GRjYUyaRrYUq6okxMrqh+o1GjYPy2QPHNMkQowgcFgLCRSiyWHWkyubnEsCjpSLlWH9XGZ6dfy7sdivqPux021luV+VofrtKgWxAhYXxJpdeAA0zBDzq5oMMVWovukok1KoPl+WTIipBDGJWOOEWIYMMd6ttDbSCDFxZEwgLoixQQLyJsRcZYh1UxBLIsTE48eCEOO/hz+pb51IVeQb2rRRJmXrl4moJVazGZ2+JMTSq1dzaFxmiHU6VL8kxMoqy331mmUyj1D9HjimKVYzV0IJdTorUinnEH95OpfB9KTlEQW4vMVKq+XlnMMlT5f3vqeazjb/jLqt+qH6Nz2toNUSLnDyA35HyjBDrCJkiLEDKWaZFA8wk1EmKxV+IxM7aEVBbGBAbZlkJBUL1bfMEKOo+LFi66kixAYGgn6Kopgcqs9EQkKGWBZBTLaPWmWIVSrdsUyyB+KkDLGWYlpCxQkxT/15vxBiOsukqagllmmofhG3VaerVx+q8swQy9t+6ZIQKzPEyiorqF4TxEpCTFk2uVgkGsYlbeaQQsok4AjTkW2AWQgxkUjLIMDZipW2Ik+uAywQ9qu86UTqdC4zy/qh+nfNC1ottC/YXdFpDTPE/EpbQOGWyfCmg0yIiUKRLIiFpSTE0iyTjKSShCPTDDHP88xPGswymRSqD8TXlW0bANi5M/hJGGUyS6i+8Xp1OkMsAyEWEW9NRpmUtmM7Q6xPCLG0DDGdZdKEEDPNHyvitup09epDFXtQZef8TlsmqSNWqtrKi0ijEnAlIVZWL1avCmK99jIjY5GEhA4HqdsIJVlysWxC7jPlVMl9T5ACSIRYjmKlrchjnSGW4buh5K3ZiKNOhc8M300/VCmIFaxEQqwrlknjUP22qBOzTLIAfPGgZBTU5GSbLmHLkAmxajywny+rhbaQlpYhJgtHFllbxsJRmmVS/AlELZNAe5vkTYhJ28LIMilkiHndFsQ8Ly7WOiPEgp9OBbEi39CmZYjlFapfEmLp1auEmLxelIwtl4JRFkIsL7qtJMTK6ucqQ/WTqwgCn6OiEGI21i+XIfCucriyLE9eZqYROR0SaTakErktS5Gn05SVPJ1Ti2YGMs/KVtnHlskyQ6xgFSHEunGtS3oASArVrxAyxFSh+kAgEInLkwkxURCLXTCEShtl0oFwJM+TSmPpLJMqQqxa5cJOZJsQMsSyhOrH1sskVF8gxDwE+2vLy8kySSDE5NN4RBBj4gvhZrQjofpFonPkSrNMloRY96pXCTGZnuiEYOSqLarw5MoyWRJiZfVDlaH6yZV2npxBZWM1s7XSUafLK4craTqXoktelkKnWVYOM7YyZZbZCJ8ZtpWrIHzldDnvx/1QJSFWsIoQYt3oQNIDQPigG7dMximuWIaYyjIJBPQUsxECgVDk+20hLI0QYzUwQCPEWh20FqoskzpCjIlh4vYBOp4hZhuqD7S/k1wIMV2YdVKovvz8RxRrEgkxGO4D2oUU4I11UqVZJvMixNLmLeK26nS5tN0UiTJwaZl0aWGcCYQYJbNM115ZZRWxes0yWRJiynJGIVkKCbaCisvl5SlQudpWTkc7pNJtliJPrgMsZNhWXc+my0DT9UOVgljBKkKIFdAyGQvVr8Ytk/Loh54sorCHnl272iIR+xvQJqfSMsTE+QwyxDoiHImWSXbTkkSIVavtGxw2HytKhphDoS+1jYQMMSBnQUxHiFUqMdum2J9YWylVEmIay2QaIZanIFbEbdXp6jUbEas8M8TyziPrNiGWdTuUVVYRq9cEsZIQU5aV1cwRyZO0TJtwd5diHiVDzKXtkNJWFotmrhZGhwJcp8VKkojlUlzLsO/1Q3V1zb/whS/gsMMOw5w5czBnzhwcd9xxuOaaa7rZpa4XE22ALn05STfbiaH6BMukfHJgFJQ4muLAQPtGQRSKpLZZebIgljLKpLhdO2ItZMJWs9leR3HwAKAtjAnr2XFCzFGGGNBFQYxKiBHFmjJDLCVDTLwJz0KIyfOWGWLplQchVoSHKpcZYi4tkyUhVlZZM7+KJIiVhFikXI1kaCskpPWJvLycqSd5Our6ZbIU2tB7FEuoSwtjzpllNtl0nRYrk9pyKST3Q3VVENt7771x6aWX4q677sJvf/tbnHLKKTjrrLPw5z//uZvd6mpFLJPduNYl3WwnZYhVa/z/7OCLWSblE5Y40qRMTQFKQUxrmSQQYhGh0UGGGNkyCbRtk0mjTAokXEwQIxBiTrPRLDPEgJwEMdmuJ1ZOlsmYSMgFMcNtpasiiRFyyW+ydZbJLBliJSFmXr0aqi/SnuLvSdPppnEZqt9LhFh5/JQ1U0rcx3uFhs3DMtkDxzSJELMQlZJGTrQS4ByJWEnT2QhULgm4TAKcRVA8VeRRinkEkcc6hyvvbUUQbV2OyGllq+xjy2RXQ/Vf9rKXRX7/+Mc/ji984Qu4/fbbcfDBB3epV92trofqG2aIVSptQYxbJsMLP/sZOwgZPbVrV5wKA5TkVCbLJFp8hEmgQxlitVrQl2YzEMRGR5MzxJIIMd+PWigTCLEso2caW0EplskKsS1KyXY9cfnsbwmh+rJ4m5UQc2qZLMINelKJN+6Nhj5UPyshRiF+5H71c/V
qqH6elsmk6eRzi65fSb+r5i0qIVYeP2XNlHItiBWBqMrDMtkDxzQpVN9hLla3hQuSyONSoMpiKbQQD7Osn40NUCeu8dieDMJnntvKpdCq7JONrbKPCbHCjDLZaDTwve99Dzt37sRxxx2nnGZiYgITQubUtm3bOtW9jlWEEOvGg0pKhliMwqnGBTFtqD4QtUyyC7uKEBMzxGJvUBCdXheq32rxnC2xnyZlZS0cGgJ27gxGmpyebt+8qDLEWImCWK0WFQo7QYiliYUEQiypbauSBRNxW3U6VN+lZbJIYoRcojjRaMSFg7wIsbR5i7itOl29GsxMtUzOZELMlSBWEmJl9VP1ivjfq3b3jGVjNcuS9URZno3VjGKlo7blVHTpNPWUYf2s7JeEfmWxl+ZJ0xXCzkoUK/uhumqZBIA//vGPGB4eRr1ex1ve8hb84Ac/wEEHHaSc9pOf/CRGR0f5vyVLlnS4t/lXhBDrSgf0GWLxUH2NIMYyxHSWSTFDjJXKMplGiKkyxAThyHmGGIUyE0eaFAcP0BFiIhEmkm9AMiGWIVQ/U4YYE8RC4TIXQUwkjmTBRLzB9LwYUcl/Z8JqSYjRSqZ1dBRXJ0P1i7itOl29GswsPyz2YoaYK8tkSYiV1evVi4RYr9rdM5bV6HyOyBqnI/g5tExmyakihc5bLK8j+W42FkZCvzINeNBhO6sVLZiFrCTuC/1QXV/zlStX4ve//z3uuOMOXHTRRXjDG96Ae++9Vznt+973PmzdupX/W7t2bYd7m3+JhFiMcOlEpWSI6UL12YHFxKdEXFW0TOaZISYSYs1shBj1YhspcaRJURBj60chxFSCWJ6EmEmGWEKoflLbVqWzNWkIMY9tJc9rb9/MhJhDQazIb3hlcUIUqmTRyjQYP8u8RdxWna5eJ8RcZIi5FMQo4pNLQszGxlkSYmX1WvWiIJaH3b0I5+6MlZvoQqGsHFr38g6Kl6fLQiG5FJVcWjRdEk0kK66F0EoR11zSgqQsPKoYm4He64fqumVyYGAAK1asAAAcddRRuPPOO/H5z38el19+eWzaer2OuigQ9GCJJJPX7MLFLsUyGQvVr7XJLp4hFgo1qaH6u3a1/29IiMVGmUzJEIsQYjYklTAPWQhhgtjYWJuE8/32eqWF6idZJiVCjK1bUTLEyG1RKo1OYsuXMsSCZTeCvzkixPpulEmKIFYSYp2tXs2hcZkh5tIySRGf8iLE2HTyd10SYmX1epWh+voqEt2bsUgCVV6iUgeEGVdB8fJ0naCQbEQXV7bDpOlsBKpM9F6O2yoLTUcixAj0V2mZbFfXCTG5ms1mJCes36oXQvVjlskkQixtlElNhlikHwMDwGGHBdMffnh7HocZYhT1P1Yqy6S4nmmh+kRCjNFvPZkhRrFMVioxQowvWxQgi2SZLDIhprNMphFipqH6JSFmVr2aQ+MyQ2wmE2Ly56pjoiTEyur1KgkxfRVhfRxVXlazTLZDC+semQpyJYJ0ONOLTFk5soQmlUsLo5XQmmVb5UTvuRQYS0KsS/W+970PL3nJS7DPPvtg+/bt+Na3voWbb74Z1157bTe71dWKhOp3pwPR31MyxCih+rGDXiTEVBliqlEm5ZOg7wMIO1OrAUccAWzcGIzmKM0jE2JZhSMyYUYV/jJmiGWxTDrJECsIIZYoiKmEHU3FqLk8LJNFeGOdVKLQCJSEWJGqxXfG7G0V6aFKFvoolsKk49mGEKO2RQnVz1vMKwmxsnq98hLEeiVUv0cJMapQ0nEhQUWIEYUZG9tapzO9SMvrgC3PVe6XbVtZsuJcbSubETkz2VmJ69cP1VVBbMOGDXj961+PJ554AqOjozjssMNw7bXX4gUveEE3u9XVihBi3bjYmWaICZZJLtQwy2RaqL5JhpisrNcGAOwKfmHCzNy5kWlUGWIePKsD3ooQEy2TtoQYYZTJLKH6xtRTkTLE2N+YZVJYfsUpIaYWdnueEGM33SaEWJkhlm/16kOVS8tkpwkx6vJsLJMUAS5r1lhZZRWtepEQ69X8x4xFGg3Q4YO9TbZUFqsZiaCyyNgqir2UFNzuMvfLxl7agaw4VzSdy+W5JCv7oboqiH3lK1/p5uILWRFCrEiWSYUg5jcRsTUmEWJayyRxlMmYGKESxKRSEWI2OVvyfM4sk2mh+gMD+RNikoiWun06nSFGHWVSJsSYG7xSMSbEYvsadzo4vHAUmRCTxYm076AkxDpXvfpQVVRBLC9CjE2Xlg+WhRCjTldWWUWrXhTE8rBM9oDI7SoXq9NWM7aMxOechHmzCFQUmi6vzDIyseXKdpjxu8krK45CY2XZVmR6z0bw0wh1aftxP1T/SoEFLZEQSx1l8rvfBc47LxBcnHUg4QEgfBAWQ/UrLWgFMT7KpC0hpskQ81QCmlQ85F/IELMVMSKWSaqoprJMqqyhulB9gwwxG7EvEyGWJoi5OLFah+p77b85yhDrm1EmTTLEms3oOcOEEGs29W2rpu/36tVQfZl8S+pTXpbJLAH9NoRYlrZKQqysXq9eFMTyCNUvwrk7Y7nKxbIhtrJSSLnZ8nKmkPLMLHNmO8xA78XaotJ0FKE1y7ayyRBzKfhl6Hs/VCmIFayMRpn89KeBb30LuPVWhx2gWyYrTShzvlJD9ZnoY0BOqS2TYSURYoKFkxNiFrZCwIFlkpFwJoSYLIiF65NnhpjTUP1OCmKepybELDLEEkP1fYeCWJFvaGVxQpchZiKAyWUayF/EbdXpyoMQK4JQIouwLm2OebdlS4hR6K+SECurH6sXR5ksQ/WVRclCsiHEKG1lCcKXP3dqy8s5p4pKbLnKLLP5brKsX6wtqnhIILby3lZkes9C8CPRZn1MiHXVMllWvIxC9ZmYxMQWF5VimRRD9f0EQoytA0cw5YOVCUVJofoqQUx+KzAgCEVplkkhQ8zaMunlYJlUZYjJofq1GnDyycC2bcD8+QDihFgWsc+YEBMfxNMyxIx7oyhdhpg2VD8HQoz4ZopURRIj5DLJEDOxSMplSpcVcVt1unr1oYoq9Lmiuly25TLkviTEyiqrDNVPqyLlP2YsV7lRZOGCIGLlKhg5suW5zPTKIlCRvhub3K8M6yd/noXec0qIEfbRPAcNyEI69kOVgljBSrRMpj6osIfRLJSGXAYZYmmWyVRCLMkyqRplUj4JWhJiLiyTxoSYbYZYrRbcyP3858G+kJAhlsUOKs+TKhiqCLFwH4gJYi6esykZYpUK5FD9LIRYbORN/qxuOCKnrorwxjqpdBliLgUx+fsoCbH0ysN2U4SHKnmfY5/pMraKSoi5HEHSJSFWhO+5rLIoVVom9VWE9XFUNtRT3lazXAUjioWRQj25JMQcikpFWD9qW1YDAmQQ82y2VSZx1MaqWlomyypKcRGphfQbWPYAmeWhNN6B6O+aDLEkQkwO1Y8d0NRQfU2GWIQQS8gQixBiGUZiBKLrQG6DEWJZRpkEgpsfP/ki7jJU32mGmIt7NYMMsfwIMTXpaLsvhY0FP4v4kCoKjYD+O8hy7jG1XxZxW3W6epUQk/c5QN
fYUgZjtOdXSI0S+PPlo8t02njvTmGmDBl7/8RWQCpc2D0rezE04QuXJypFNCCCGkifBKumC0UslkyPwwtW2OMslRJgmxctJJotRrr71EuWJHh3BqDR6c7BBzUSpFOUgmZPi2KWhWf2/EiLhD5NprReD0mDFimHoJBTHSTL7yFeCJJ4SzKe/IZL7ssIP4c7HGGsDpp4eZ37hxUZ6ajSeeEHljUhysNyedJHLL1JJOnSOPFHlc8+bZR/otlUQu24UXCpdWXrbbLj6oQR7UETcJIYSQJtI6CkGL0N9D9VVCjjAJZCyZbIMMsVYS+gjJzdlnR4HOAPCzn4mR8oYPj8qibOVRebjsMpEDtf/+te/ttx/wxS/ag/E7O6NwYL3Mk5Bm0dUF/PWvze5F89l2W3e5ZGg22CAqh7RRKom8tyQOPTQa+ZEQQgghNfBKumD0d4fYK/Mjd8MWa27hmDI9mUom28Ah1krLRUhwzjwz+n+nnYRote224eez//5mMQwQwsL3v+/Xjh6qTwghhBBCCKkLvJIuGP3dIdbTGV3MDeiwjNqUESkSZh1lsr8OVGCiVbPRCKkrpZLdpVUUhg0T2UgdHdEIg4QQQgghhJDgUBArGNIh1l/FmzN2PgMvvfcSjtnqmOBt53GIdZQ6+q3rzgQdYoS0KKUS8Oc/N7sXhBBCCCGEtDy8ki4Y1VEm+6l4M6RrCH76oZ/Wpe08GWKt5qJihhghhBBCCCGEEJKd/mlDamH6e8lkPZHrJMsok60mGtEhRgghhBBCCCGEZIeCWMHo76H69SSXQ6yFRpgEWnf0TEIIIYQQQgghpBFQECsYdIjZqTrE0oTq0yFGCCGEEEIIIYQQDQpiBYMOMTvSGcYMMWaIEUIIIYQQQggheaAgVjCkQ6y/jjJZTzKVTLaBQ6zVxD5CCCGEEEIIIaTeUHUpGNVRJlkyWUOmUP02yBBrNbGPEEIIIYQQQgipNxTECgZLJu10dXTFHn1oB4dYqy0bIYQQQgghhBBSb3glXTAYqm/n0v0uxb3/uRe7rrur92faIUOs1dxvhBBCCCGEEEJIvaEgVjDoELNz7NbH4titj031GTrECCGEEEIIIYQQosOSyYJBh1hYmCFGCCGEEEIIIYQQHQpiBUOG6nOUyTC0g0Os1cpBCSGEEEIIIYSQekPVpWCwZDIs7ZAh1mpiHyGEEEIIIYQQUm8oiBUMlkyGpR0cYq22bIQQQgghhBBCSL2hIFYw6BALS6tmiKm08rIRQgghhBBCCCH1gIJYwaBDLCwt6xBjySQhhBBCCCGEEJIZCmIFQzrEGKofhpbNEGPJJCGEEEIIIYQQkhmqLgVDjjLJkskwSOGo1coK1e2j1cQ+QgghhBBCCCGk3lAQKxgsmQyLFI5azUVFhxghhBBCCCGEEJIdCmIFg6H6Yak6xFrMRcUMMUIIIYQQQgghJDsUxAoGHWJhaQeHWKuVgxJCCCGEEEIIIfWGgljBoEMsLO2QIdZqYh8hhBBCCCGEEFJvKIgVDOkQ4yiTYWgHh1irLRshhBBCCCGEEFJvqLoUjOookyyZDEI7ZIi12rIRQgghhBBCCCH1hoJYwWDJZFjawSHWauWghBBCCCGEEEJIvaEgVjAYqh+WVs8Q6yh1UDwlhBBCCCGEEEJSQkGsYNAhFpZWd4i12nIRQgghhBBCCCGNgIJYwaBDLCwt7xBjfhghhBBCCCGEEJIaCmIFQzrEOMpkGOgQI4QQQgghhBBCiA5Vl4JRHWWSJZNBaPVRJimIEUIIIYQQQggh6aEgVjBYMhmWVhWOWrUUlBBCCCGEEEIIaQQUxAoGQ/XD0qrCUasKfYQQQgghhBBCSCOgIFYw6BALi8xiazXhiBlihBBCCCGEEEJIdiiIFQw6xMLSqqMxtupyEUIIIYQQQgghjYCCWIGQ7jCAo0yGolWdVK26XIQQQgghhBBCSCOg6lIgpDsMYMlkKKpOKmaIEUIIIYQQQgghpA8KYgVCdYixZDIMreqkatXBAgghhBBCCCGEkEZAQaxA0CEWnlbN2qJDjBBCCCGEEEIIyQ4FsQJBh1h4Wt0h1mrLRQghhBBCCCGENAIKYgVCdYgxVD8MrZ4h1mrON0IIIYQQQgghpBFQdSkQvZXe6v8smQxDqzqpWnW5CCGEEEIIIYSQRkBBrECwZDI8reqkYoYYIYQQQgghhBCSnUIIYldffTU22mgj9PT0YPfdd8fDDz/c7C41BYbqh6dVnVQcZZIQQgghhBBCCMlO0wWxG264Aeeffz4uueQSPProoxg/fjwOPvhgvPXWW83uWsOhQyw8rZ4h1mpCHyGEEEIIIYQQ0giaLoj98Ic/xBlnnIFTTz0VW2+9NX7xi19g0KBB+M1vftPsrjUcOsTC0+oOsVZbLkIIIYQQQgghpBE0VRBbsWIFZsyYgYkTJ1ZfK5fLmDhxIh544IGa6ZcvX46FCxfG/loJ1SHGUSbDMKBjAACgu7O7yT0JS6suFyGEEEIIIYQQ0giaai955513sHr1aqy99tqx19dee208++yzNdNffvnluPTSSxvVvYZTLpWx1wZ7oVKpUBALxHm7n4c1etbAQZse1OyuBOVD4z6EE7Y9AWfsdEazu0IIIYQQQgghhPQ7ShXVltRg3njjDay77rqYNm0aJkyYUH39wgsvxOTJk/HQQw/Fpl++fDmWL19efb5w4UKsv/76WLBgAYYNG9awfhNCCCGEEEIIIYSQ4rFw4UIMHz48UStqqkNsrbXWQkdHB958883Y62+++SbGjBlTM313dze6u1kiRgghhBBCCCGEEEKy09S6vK6uLuy88864++67q6/19vbi7rvvjjnGCCGEEEIIIYQQQggJRdOHqDv//PNxyimnYJdddsFuu+2GH//4x1i8eDFOPfXUZneNEEIIIYQQQgghhLQgTRfEjj/+eLz99tv4+te/jrlz52KHHXbAnXfeWRO0TwghhBBCCCGEEEJICJoaqp8X36A0QgghhBBCCCGEENL6+GpFTc0QI4QQQgghhBBCCCGk0VAQI4QQQgghhBBCCCFtBQUxQgghhBBCCCGEENJWUBAjhBBCCCGEEEIIIW0FBTFCCCGEEEIIIYQQ0lZQECOEEEIIIYQQQgghbQUFMUIIIYQQQgghhBDSVlAQI4QQQgghhBBCCCFtBQUxQgghhBBCCCGEENJWUBAjhBBCCCGEEEIIIW0FBTFCCCGEEEIIIYQQ0lZQECOEEEIIIYQQQgghbQUFMUIIIYQQQgghhBDSVlAQI4QQQgghhBBCCCFtBQUxQgghhBBCCCGEENJWUBAjhBBCCCGEEEIIIW0FBTFCCCGEEEIIIYQQ0lZQECOEEEIIIYQQQgghbQUFMUIIIYQQQgghhBDSVlAQI4QQQgghhBBCCCFtRWezO5CHSqUCAFi4cGGTe0IIIYQQQgghhBBCmo3UiKRmZKNfC2Lvv/8+AGD99ddvck8IIYQQQgghhBBCSFF4//33MXz4cOv7pUqSZFZgent78cYbb2Do0KEolUrN7k4QFi5ciPXXXx+vvfYahg0b1uzuEFI3uK2
TdoLbO2kXuK2TdoLbO2kXuK2T/kalUsH777+PsWPHoly2J4X1a4dYuVzGeuut1+xu1IVhw4bxYEPaAm7rpJ3g9k7aBW7rpJ3g9k7aBW7rpD/hcoZJGKpPCCGEEEIIIYQQQtoKCmKEEEIIIYQQQgghpK2gIFYwuru7cckll6C7u7vZXSGkrnBbJ+0Et3fSLnBbJ+0Et3fSLnBbJ61Kvw7VJ4QQQgghhBBCCCEkLXSIEUIIIYQQQgghhJC2goIYIYQQQgghhBBCCGkrKIgRQgghhBBCCCGEkLaCghghhBBCCCGEEEIIaSsoiBWIq6++GhtttBF6enqw++674+GHH252lwhJzZQpU3D44Ydj7NixKJVKuOWWW2LvVyoVfP3rX8c666yDgQMHYuLEiXjhhRdi08ybNw8nn3wyhg0bhhEjRuC0007DokWLGrgUhCRz+eWXY9ddd8XQoUMxevRoHHXUUXjuuedi0yxbtgxnnXUW1lxzTQwZMgTHHnss3nzzzdg0r776Kj784Q9j0KBBGD16NC644AKsWrWqkYtCiJOf//zn2H777TFs2DAMGzYMEyZMwB133FF9n9s5aVWuuOIKlEolnHfeedXXuL2TVuEb3/gGSqVS7G/LLbesvs9tnbQDFMQKwg033IDzzz8fl1xyCR599FGMHz8eBx98MN56661md42QVCxevBjjx4/H1VdfbXz/u9/9Ln7yk5/gF7/4BR566CEMHjwYBx98MJYtW1ad5uSTT8ZTTz2Fu+66C7fffjumTJmCM888s1GLQIgXkydPxllnnYUHH3wQd911F1auXImDDjoIixcvrk7zhS98AbfddhtuvPFGTJ48GW+88QaOOeaY6vurV6/Ghz/8YaxYsQLTpk3Dtddei0mTJuHrX/96MxaJECPrrbcerrjiCsyYMQOPPPIIPvjBD+LII4/EU089BYDbOWlNpk+fjmuuuQbbb7997HVu76SV2GabbTBnzpzq39SpU6vvcVsnbUGFFILddtutctZZZ1Wfr169ujJ27NjK5Zdf3sReEZIPAJWbb765+ry3t7cyZsyYyve+973qa/Pnz690d3dX/vSnP1UqlUrl6aefrgCoTJ8+vTrNHXfcUSmVSpXXX3+9YX0nJC1vvfVWBUBl8uTJlUpFbNsDBgyo3HjjjdVpnnnmmQqAygMPPFCpVCqV//u//6uUy+XK3Llzq9P8/Oc/rwwbNqyyfPnyxi4AISlYY401Kr/61a+4nZOW5P3336+MGzeuctddd1X23XffyrnnnlupVHhcJ63FJZdcUhk/frzxPW7rpF2gQ6wArFixAjNmzMDEiROrr5XLZUycOBEPPPBAE3tGSFhmz56NuXPnxrb14cOHY/fdd69u6w888ABGjBiBXXbZpTrNxIkTUS6X8dBDDzW8z4T4smDBAgDAyJEjAQAzZszAypUrY9v7lltuiQ022CC2vW+33XZYe+21q9McfPDBWLhwYdV9Q0iRWL16Na6//nosXrwYEyZM4HZOWpKzzjoLH/7wh2PbNcDjOmk9XnjhBYwdOxabbLIJTj75ZLz66qsAuK2T9qGz2R0gwDvvvIPVq1fHDiYAsPbaa+PZZ59tUq8ICc/cuXMBwLity/fmzp2L0aNHx97v7OzEyJEjq9MQUjR6e3tx3nnnYc8998S2224LQGzLXV1dGDFiRGxafXs37Q/yPUKKwhNPPIEJEyZg2bJlGDJkCG6++WZsvfXWmDVrFrdz0lJcf/31ePTRRzF9+vSa93hcJ63E7rvvjkmTJmGLLbbAnDlzcOmll2LvvffGk08+yW2dtA0UxAghhJCcnHXWWXjyySdj2RuEtBJbbLEFZs2ahQULFuAvf/kLTjnlFEyePLnZ3SIkKK+99hrOPfdc3HXXXejp6Wl2dwipK4ceemj1/+233x677747NtxwQ/z5z3/GwIEDm9gzQhoHSyYLwFprrYWOjo6aUTvefPNNjBkzpkm9IiQ8cnt2betjxoypGUxi1apVmDdvHvcHUkjOPvts3H777bj33nux3nrrVV8fM2YMVqxYgfnz58em17d30/4g3yOkKHR1dWGzzTbDzjvvjMsvvxzjx4/HlVdeye2ctBQzZszAW2+9hZ122gmdnZ3o7OzE5MmT8ZOf/ASdnZ1Ye+21ub2TlmXEiBHYfPPN8eKLL/LYTtoGCmIFoKurCzvvvDPuvvvu6mu9vb24++67MWHChCb2jJCwbLzxxhgzZkxsW1+4cCEeeuih6rY+YcIEzJ8/HzNmzKhOc88996C3txe77757w/tMiI1KpYKzzz4bN998M+655x5svPHGsfd33nlnDBgwILa9P/fcc3j11Vdj2/sTTzwRE4HvuusuDBs2DFtvvXVjFoSQDPT29mL58uXczklLccABB+CJJ57ArFmzqn+77LILTj755Or/3N5Jq7Jo0SK89NJLWGeddXhsJ+1Ds1P9ieD666+vdHd3VyZNmlR5+umnK2eeeWZlxIgRsVE7COkPvP/++5WZM2dWZs6cWQFQ+eEPf1iZOXNm5ZVXXqlUKpXKFVdcURkxYkTl1ltvrTz++OOVI488srLxxhtXli5dWm3jkEMOqey4446Vhx56qDJ16tTKuHHjKieeeGKzFokQI5/97Gcrw4cPr9x3332VOXPmVP+WLFlSneYzn/lMZYMNNqjcc889lUceeaQyYcKEyoQJE6rvr1q1qrLttttWDjrooMqsWbMqd955Z2XUqFGViy++uBmLRIiRiy66qDJ58uTK7NmzK48//njloosuqpRKpco///nPSqXC7Zy0Nuook5UKt3fSOnzxi1+s3HfffZXZs2dX7r///srEiRMra621VuWtt96qVCrc1kl7QEGsQFx11VWVDTbYoNLV1VXZbbfdKg8++GCzu0RIau69994KgJq/U045pVKpVCq9vb2Vr33ta5W111670t3dXTnggAMqzz33XKyNd999t3LiiSdWhgwZUhk2bFjl1FNPrbz//vtNWBpC7Ji2cwCV3/72t9Vpli5dWvnc5z5XWWONNSqDBg2qHH300ZU5c+bE2vnPf/5TOfTQQysDBw6srLXWWpUvfvGLlZUrVzZ4aQix86lPfaqy4YYbVrq6uiqjRo2qHHDAAVUxrFLhdk5aG10Q4/ZOWoXjjz++ss4661S6uroq6667buX444+vvPjii9X3ua2TdqBUqVQqzfGmEUIIIYQQQgghhBDSeJghRgghhBBCCCGEEELaCgpihBBCCCGEEEIIIaStoCBGCCGEEEIIIYQQQtoKCmKEEEIIIYQQQgghpK2gIEYIIYQQQgghhBBC2goKYoQQQgghhBBCCCGkraAgRgghhBBCCCGEEELaCgpihBBCCCGEEEIIIaStoCBGCCGEEEIIIYQQQtoKCmKEEEIIIW3GN77xDeywww7N7gYhhBBCSNOgIEYIIYQQ0iKsWLGipedHCCGEEBIKCmKEEEIIIXVg+fLlOOecczB69Gj09PRgr732wvTp0wEA9913H0qlEv7+979j++23R09PDz7wgQ/gySefjLUxdepU7L333hg4cC
DWX399nHPOOVi8eHH1/Y022gjf+ta38IlPfALDhg3DmWeeCQD48pe/jM033xyDBg3CJptsgq997WtYuXIlAGDSpEm49NJL8dhjj6FUKqFUKmHSpEkAgFdffRVHHnkkhgwZgmHDhuG4447Dm2++WZ2fdJb96le/wsYbb4yenp56rkJCCCGEkLpBQYwQQgghpA5ceOGFuOmmm3Dttdfi0UcfxWabbYaDDz4Y8+bNq05zwQUX4Ac/+AGmT5+OUaNG4fDDD68KVy+99BIOOeQQHHvssXj88cdxww03YOrUqTj77LNj8/n+97+P8ePHY+bMmfja174GABg6dCgmTZqEp59+GldeeSV++ctf4kc/+hEA4Pjjj8cXv/hFbLPNNpgzZw7mzJmD448/Hr29vTjyyCMxb948TJ48GXfddRdefvllHH/88bH5vfjii7jpppvw17/+FbNmzarjGiSEEEIIqR+lSqVSaXYnCCGEEEJaicWLF2ONNdbApEmTcNJJJwEAVq5ciY022gjnnXcedt11V+y///64/vrrq4LTvHnzsN5662HSpEk47rjjcPrpp6OjowPXXHNNtd2pU6di3333xeLFi9HT04ONNtoIO+64I26++WZnf77//e/j+uuvxyOPPAJAOL1uueWWmKB111134dBDD8Xs2bOx/vrrAwCefvppbLPNNnj44Yex66674hvf+Aa+/e1v4/XXX8eoUaNCrjJCCCGEkIZChxghhBBCSGBeeuklrFy5EnvuuWf1tQEDBmC33XbDM888U31twoQJ1f9HjhyJLbbYovr+Y489hkmTJmHIkCHVv4MPPhi9vb2YPXt29XO77LJLzfxvuOEG7LnnnhgzZgyGDBmCr371q3j11VedfX7mmWew/vrrV8UwANh6660xYsSIWJ833HBDimGEEEII6fd0NrsDhBBCCCGklkWLFuHTn/40zjnnnJr3Nthgg+r/gwcPjr33wAMP4OSTT8all16Kgw8+GMOHD8f111+PH/zgB0H6pc+PEEIIIaQ/QkGMEEIIISQwm266Kbq6unD//fdjww03BCBKJqdPn47zzjuvOt2DDz5YFbfee+89PP/889hqq60AADvttBOefvppbLbZZqnmPW3aNGy44Yb4yle+Un3tlVdeiU3T1dWF1atXx17baqut8Nprr+G1116LlUzOnz8fW2+9dao+EEIIIYQUHZZMEkIIIYQEZvDgwfjsZz+LCy64AHfeeSeefvppnHHGGViyZAlOO+206nTf/OY3cffdd+PJJ5/EJz/5Say11lo46qijAIiRIqdNm4azzz4bs2bNwgsvvIBbb721JlRfZ9y4cXj11Vdx/fXX46WXXsJPfvKTmoyxjTbaCLNnz8asWbPwzjvvYPny5Zg4cSK22247nHzyyXj00Ufx8MMP4xOf+AT23XdfY1kmIYQQQkh/hoIYIYQQQkgduOKKK3Dsscfi4x//OHbaaSe8+OKL+Mc//oE11lgjNs25556LnXfeGXPnzsVtt92Grq4uAMD222+PyZMn4/nnn8fee++NHXfcEV//+tcxduxY53yPOOIIfOELX8DZZ5+NHXbYAdOmTauOPik59thjccghh2D//ffHqFGj8Kc//QmlUgm33nor1lhjDeyzzz6YOHEiNtlkE9xwww3hVw4hhBBCSJPhKJOEEEIIIQ3mvvvuw/7774/33nsPI0aMaHZ3CCGEEELaDjrECCGEEEIIIYQQQkhbQUGMEEIIIYQQQgghhLQVLJkkhBBCCCGEEEIIIW0FHWKEEEIIIYQQQgghpK2gIEYIIYQQQgghhBBC2goKYoQQQgghhBBCCCGkraAgRgghhBBCCCGEEELaCgpihBBCCCGEEEIIIaStoCBGCCGEEEIIIYQQQtoKCmKEEEIIIYQQQgghpK2gIEYIIYQQQgghhBBC2or/D4UqQZPfzJjUAAAAAElFTkSuQmCC", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NPU Utilication: 71.33%, NPU Free Utilization: 28.67%.\n", - "Device synchronize 5 times, try to reduce synchronization statements to alleviate the bottleneck of operator delivery.\n", - "There are too many small operators, you can increase the batch size appropriately.\n" - ] - } - ], - "source": [ - "dataset = interface.get_data('timeline', 'op_schedule')\n", - "data = dataset.get(\"data\")\n", - "import math\n", - "op_dur = [math.log(i + 1) for i in data[0]]\n", - "op_free = [math.log(i + 1) for i in data[1]]\n", - "x = [i for i in range(len(op_dur))]\n", - "fig = plt.figure(figsize=(15, 8))\n", - "plt.plot(x, op_dur, c='r', ls='-', label='op duration')\n", - "plt.plot(x, op_free, c='g', ls='-', label='op wait')\n", - "\n", - "plt.xlabel('operator')\n", - "plt.ylabel('log(time + 1)')\n", - "plt.title('Op Schedule')\n", - "plt.legend(loc='upper right')\n", - "plt.show()\n", - "\n", - "print(dataset.get('bottleneck'))\n", - "print(dataset.get('advice'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.1" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/profiler/advisor/utils/__init__.py b/profiler/advisor/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/advisor/utils/log.py b/profiler/advisor/utils/log.py new file mode 100644 index 0000000000000000000000000000000000000000..b18272a82b6c5f529e5d36ceca921734eba9f592 --- /dev/null +++ b/profiler/advisor/utils/log.py @@ -0,0 +1,63 @@ +""" +log module +""" +import logging +import os + +from profiler.advisor.common import constant as const + + +def get_log_level(): + log_level = os.getenv(const.ADVISOR_LOG_LEVEL, const.DEFAULT_LOG_LEVEL).upper() + if not hasattr(logging, log_level): + raise AttributeError(f"module 'logging' has no attribute '{log_level}', " + f"supported log level: {', '.join(const.SUPPORTED_LOG_LEVEL)}") + return log_level + + +def init_logger(ctx, param, debug_mode) -> logging.Logger: + logging.logThreads = False + logging.logMultiprocessing = False + logging.logProcesses = False + + class LevelFilter(logging.Filter): + """ + level filter, filer only log with level out + """ + + # pylint:disable=too-few-public-methods + def filter(self, record): + if record.levelno == 60: + return False + return True + + console_log_level = getattr(logging, get_log_level()) + console_handle = logging.StreamHandler() + console_handle.setLevel(console_log_level) + console_handle.addFilter(LevelFilter()) + if debug_mode and not ctx.resilient_parsing: + formatter = logging.Formatter(fmt="[%(asctime)s][%(levelname)s][%(filename)s L%(lineno)s] %(message)s", + datefmt='%Y-%m-%d,%H:%M:%S') + else: + formatter = logging.Formatter(fmt="[%(asctime)s][%(levelname)s] %(message)s", + datefmt='%Y-%m-%d,%H:%M:%S') + console_handle.setFormatter(formatter) + + # add log level out + logging.addLevelName(60, 'OUT') + logger = logging.getLogger() + setattr(logger, 'out', lambda 
*args: logger.log(60, *args))
+    output_handle = logging.StreamHandler()
+    output_handle.setLevel("OUT")
+    formatter = logging.Formatter("%(message)s")
+    output_handle.setFormatter(formatter)
+
+    logger.setLevel("DEBUG")
+    # reset handlers from any previous initialization so repeated calls do not
+    # attach duplicates (the unreachable "else" branch of the original check is dropped)
+    logger.handlers = []
+    logger.addHandler(console_handle)
+    logger.addHandler(output_handle)
+    logger.debug("The logger of analysis has been initialized successfully.")
+    return logger
diff --git a/profiler/advisor/utils/tools.py b/profiler/advisor/utils/tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cbcb5e0521d4a947fb8ff6af40e98c32dedab23
--- /dev/null
+++ b/profiler/advisor/utils/tools.py
@@ -0,0 +1,76 @@
+from functools import partial
+
+import click
+
+CONTEXT_SETTINGS = dict(help_option_names=['-H', '-h', '--help'])
+
+
+class ClickAliasedGroup(click.Group):
+    """
+    Alias click command
+    """
+    FORMAT_LIMIT_LEN = 6
+
+    def __init__(self, *args, **kwargs):
+        super(ClickAliasedGroup, self).__init__(*args, **kwargs)
+        self._alias_dict = {}
+        self._commands = {}
+
+    def command(self, *args, **kwargs):
+        alias = kwargs.pop('alias', None)
+        decorator = super(ClickAliasedGroup, self).command(*args, **kwargs)
+        if not alias:
+            return decorator
+
+        return partial(self._decorator_wrapper, decorator, alias)
+
+    def group(self, *args, **kwargs):
+        alias = kwargs.pop('alias', None)
+        decorator = super(ClickAliasedGroup, self).group(*args, **kwargs)
+        if not alias:
+            return decorator
+
+        return partial(self._decorator_wrapper, decorator, alias)
+
+    def _decorator_wrapper(self, decorator, alias, func=None):
+        cmd = decorator(func)
+        self._commands[cmd.name] = alias
+        self._alias_dict[alias] = cmd.name
+        return cmd
+
+    def resolve_alias(self, cmd_name):
+        if cmd_name in self._alias_dict.keys():
+            return self._alias_dict[cmd_name]
+        return cmd_name
+
+    def get_command(self, ctx, cmd_name):
+        cmd_name = self.resolve_alias(cmd_name)
+        command = super(ClickAliasedGroup, self).get_command(ctx, cmd_name)
+        return command if command else None
+
+    def format_commands(self, ctx, formatter):
+        rows = []
+        sub_commands = self.list_commands(ctx)
+        max_len = 0
+        if len(sub_commands) > 0:
+            max_len = max(len(cmd) for cmd in sub_commands)
+
+        limit = formatter.width - self.FORMAT_LIMIT_LEN - max_len
+        for sub_command in sub_commands:
+            cmd = self.get_command(ctx, sub_command)
+            if cmd is None:
+                continue
+            if hasattr(cmd, 'hidden') and cmd.hidden:
+                continue
+            if sub_command in self._commands:
+                alias = self._commands[sub_command]
+                sub_command = f'{sub_command}, {alias}'
+            # compare the major version number instead of the first character of the version string
+            if int(click.__version__.split('.')[0]) < 7:
+                cmd_help = cmd.short_help or ''
+            else:
+                cmd_help = cmd.get_short_help_str(limit)
+            rows.append((sub_command, cmd_help))
+
+        if rows:
+            with formatter.section('Commands'):
+                formatter.write_dl(rows)
diff --git a/profiler/advisor/utils/utils.py b/profiler/advisor/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..84419b67087f8a434361f77479899d10ef91b9f5
--- /dev/null
+++ b/profiler/advisor/utils/utils.py
@@ -0,0 +1,552 @@
+import inspect
+import json
+import logging
+import multiprocessing as mp
+import os
+import queue
+import re
+import stat
+import time
+import traceback
+import types
+from functools import wraps
+from typing import Any, Set
+
+import click
+import requests
+from requests.adapters import HTTPAdapter
+from tqdm import tqdm
+
+from profiler.advisor.common import constant as const
+from profiler.advisor.common.version_control import VersionControl
+from profiler.advisor.utils.log import init_logger, get_log_level
+
+logger = logging.getLogger()
+logger.setLevel(get_log_level())
+permission_warned: Set = set()
+
+
+def ignore_warning(exception: Exception = None):
+    return exception
+
+
+class ContextObject(object):
+    def __init__(self):
+        self._debug = False
+
+    def set_debug(self, debug=False):
+        self._debug = debug
+
+    @property
+    def debug_mode(self):
+        return self._debug
+
+
+def debug_option(f):
+    return click.option('--debug', '-D',
+                        is_flag=True,
+                        expose_value=False,
+                        is_eager=True,
+                        callback=init_logger,
+                        help="Debug Mode. Shows full stack trace when error occurs.")(f)
+
+
+def get_class_absolute_path(cls):
+    module = inspect.getmodule(cls)
+    if module is not None:
+        module_path = module.__name__
+        class_name = cls.__name__
+        return f"{module_path}.{class_name}"
+    else:
+        return None
+
+
+def is_static_func(function_obj):
+    return isinstance(function_obj, staticmethod)
+
+
+def singleton(cls):
+    """
+    :param cls: any class
+    :return: singleton handle
+
+    Instances are cached per class and per 'collection_path'. Pass
+    collection_path='dataSet_path' explicitly to key the instance by that path;
+    otherwise the class's absolute import path is used as the default key.
+
+    _instance = {cls.name: {collection_path: instance}}
+    """
+    _instance = {}
+
+    def _singleton(*args: any, **kw: any) -> any:
+        collection_path = kw.get("collection_path")
+        if not collection_path:
+            collection_path = get_class_absolute_path(cls)
+        if cls in _instance and collection_path in _instance[cls]:
+            return _instance[cls].get(collection_path)
+        if cls not in _instance:
+            _instance[cls] = {collection_path: cls(*args, **kw)}
+        else:
+            _instance[cls][collection_path] = cls(*args, **kw)
+        return _instance[cls].get(collection_path)
+
+    # keep the attributes and methods of the original class
+    _singleton.__name__ = cls.__name__
+    _singleton.__module__ = cls.__module__
+    _singleton.__doc__ = cls.__doc__
+
+    # copy the class methods and static methods of the original class
+    _singleton.__dict__.update(cls.__dict__)
+    for base_class in inspect.getmro(cls)[::-1]:
+        # collect all members of the class
+        members = inspect.getmembers(base_class)
+
+        # filter out function objects
+        function_objs = [member[1] for member in members
+                         if inspect.isfunction(member[1]) or inspect.ismethod(member[1])]
+        for function_obj in function_objs:
+            if inspect.isfunction(function_obj) and not is_static_func(function_obj):
+                continue
+            setattr(_singleton, function_obj.__name__, function_obj)
+
+    return _singleton
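+
+# Usage sketch (illustrative only; the class and paths below are hypothetical):
+#
+#     @singleton
+#     class ProfilingParser:
+#         def __init__(self, collection_path=None): ...
+#
+#     a = ProfilingParser(collection_path="/data/run1")
+#     b = ProfilingParser(collection_path="/data/run1")  # same cached instance as a
+#     c = ProfilingParser(collection_path="/data/run2")  # new key, new instance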
+ """ + attr_name = "_lazy_" + func.__name__ + + @property + def _lazy_property(instance): + if not hasattr(instance, attr_name): + setattr(instance, attr_name, func(instance)) + return getattr(instance, attr_name) + + return _lazy_property + + +class CheckPathAccess: + """ + check path access permissions + """ + + # pylint: disable=no-member + def __init__(self, func): + wraps(func)(self) + self.warned = permission_warned + + def __call__(self, *args, **kwargs): + path = args[0] + if not os.access(path, os.R_OK) and path not in self.warned: + logger.warning("%s can not read, check the permissions", path) + self.warned.add(path) + return self.__wrapped__(*args, **kwargs) + + def __get__(self, instance, cls): + if instance is None: + return self + return types.MethodType(self, instance) + + +def walk_error_handler(error): + """ + handle dir walk error + """ + if error.filename not in permission_warned: + logger.warning(error) + permission_warned.add(error.filename) + + +@CheckPathAccess +def get_file_path_from_directory(path: str, check_func: Any) -> list: + """ + get file from directory + """ + file_list = [] + for root, _, files in os.walk(path, onerror=walk_error_handler): + for filename in files: + filepath = os.path.join(root, filename) + if check_func(filename): + file_list.append(filepath) + return file_list + + +@singleton +class Timer: + def __init__(self): + self.strftime = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time())) + + +def get_analyze_processes(): + # n_processes not exposed to user through att-advisor command arguments now + return min(int(os.getenv(const.MA_ADVISOR_ANALYZE_PROCESSES, 1)), const.MA_ADVISOR_MAX_PROCESSES) + + +def format_timeline_result(result: dict, dump_html=False): + """ + :Param result: json for api name and stack + :Return: json after format + """ + format_result = {} + if dump_html: + result = json.loads(json.dumps(result).replace("\\r\\n", "
").replace("", "<module>")) + + for key, stacks in result.items(): + api_name = key.split(":")[0] + format_result[api_name] = sorted(list(stacks.items()), key=lambda stack: stack[1], reverse=True) + return format_result + + +class ParallelJob: + + def __init__(self, src_func, ops_api_list, job_name=None): + if not callable(src_func): + raise TypeError(f"src_func should be callable") + + if not isinstance(ops_api_list, (list, tuple)): + raise TypeError(f"ops_api_list should be list or tuple") + + self.src_func = src_func + self.ops_api_list = ops_api_list + self.job_name = job_name + + def start(self, n_proccesses): + + job_queue = mp.Queue(len(self.ops_api_list)) + completed_queue = mp.Queue() + for i in range(len(self.ops_api_list)): + job_queue.put(i) + + processes = [] + listen = mp.Process(target=self.listener, args=(completed_queue, len(self.ops_api_list),)) + listen.start() + + for i in range(n_proccesses): + p = mp.Process(target=self.parallel_queue, args=(job_queue, completed_queue,)) + processes.append(p) + p.start() + + for p in processes: + p.join() + + completed_queue.put(None) + listen.join() + + def listener(self, completed_queue, num): + pbar = tqdm(total=num, position=0, leave=False, ncols=100, desc=self.job_name) + for _ in iter(completed_queue.get, None): + pbar.update() + pbar.refresh() + pbar.n = num + + def parallel_queue(self, job_queue, completed_queue): + while True: + try: + if job_queue.empty(): + break + token = job_queue.get(timeout=1) + except queue.Empty: + continue + self.src_func(*self.ops_api_list[token]) + completed_queue.put(token) + + +def mp_queue_to_list(job_queue): + queue_list = [] + while True: + try: + if job_queue.empty(): + break + token = job_queue.get(timeout=1) + queue_list.append(token) + except queue.Empty: + continue + return queue_list + + +def load_parameter(parameter, default): + if not os.environ.get(parameter, None): + return default + else: + return os.environ.get(parameter) + + +def get_supported_subclass(clazz: VersionControl.__class__, cann_version: str): + """ + Returns a list of subclasses that support the specified version, because of the __subclasses__(), + you need to import the all subclass first + :param clazz: Class name which is extends to VersionControl.__class__ + :param cann_version: The CANN software version + :return: The list of subclasses that support the specified CANN version + """ + # 获取所有支持这个cann版本的子类 + dataset_classes = clazz.__subclasses__() + sub_class_list = [cls for cls in dataset_classes if cls.is_supported(cann_version)] + logger.debug("The support subclass list is %s, cann version is %s", str(sub_class_list), cann_version) + return sub_class_list + + +def to_percent(num: float) -> str: + """ + change float to percent format + """ + num = num * 100 + return f"{num:.2f}%" + + +def safe_division(numerator, denominator): + """Return 0 if denominator is 0.""" + return denominator and numerator / denominator + + +def safe_write(content, save_path): + if os.path.dirname(save_path) != "": + os.makedirs(os.path.dirname(save_path), exist_ok=True) + + with os.fdopen(os.open(save_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, + stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP), "w") as f: + f.write(content) + + +def create_directory_for_file(file: str) -> None: + """ + create directory for file + """ + dirname = os.path.dirname(file) + if not os.path.exists(dirname): + os.makedirs(dirname) + + +class CheckPathAccess: + """ + check path access permissions + """ + + # pylint: disable=no-member + def __init__(self, func): + 
+
+def mp_queue_to_list(job_queue):
+    queue_list = []
+    while True:
+        try:
+            if job_queue.empty():
+                break
+            token = job_queue.get(timeout=1)
+            queue_list.append(token)
+        except queue.Empty:
+            continue
+    return queue_list
+
+
+def load_parameter(parameter, default):
+    if not os.environ.get(parameter, None):
+        return default
+    else:
+        return os.environ.get(parameter)
+
+
+def get_supported_subclass(clazz: VersionControl.__class__, cann_version: str):
+    """
+    Return the subclasses of clazz that support the given CANN version. Because
+    discovery relies on __subclasses__(), all subclasses must be imported first.
+    :param clazz: a class that extends VersionControl
+    :param cann_version: the CANN software version
+    :return: the list of subclasses that support the specified CANN version
+    """
+    # collect all subclasses that support this CANN version
+    dataset_classes = clazz.__subclasses__()
+    sub_class_list = [cls for cls in dataset_classes if cls.is_supported(cann_version)]
+    logger.debug("The support subclass list is %s, cann version is %s", str(sub_class_list), cann_version)
+    return sub_class_list
+
+
+def to_percent(num: float) -> str:
+    """
+    change float to percent format
+    """
+    num = num * 100
+    return f"{num:.2f}%"
+
+
+def safe_division(numerator, denominator):
+    """Return 0 if denominator is 0."""
+    return denominator and numerator / denominator
+
+
+def safe_write(content, save_path):
+    if os.path.dirname(save_path) != "":
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+
+    with os.fdopen(os.open(save_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
+                           stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP), "w") as f:
+        f.write(content)
+
+
+def create_directory_for_file(file: str) -> None:
+    """
+    create directory for file
+    """
+    dirname = os.path.dirname(file)
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+
+
+@CheckPathAccess
+def get_dir_path_from_directory(path: str, check_func: Any) -> list:
+    """
+    get file from directory
+    """
+    # NOTE: despite the name, this walks file names, mirroring get_file_path_from_directory
+    file_list = []
+    for root, _, files in os.walk(path, onerror=walk_error_handler):
+        for filename in files:
+            filepath = os.path.join(root, filename)
+            if check_func(filename):
+                file_list.append(filepath)
+    return file_list
+
+
+def is_regex_pattern(string: str):
+    """
+    Check if str is a regular expression.
+    """
+    escaped_string = re.escape(string)
+    return not (escaped_string == string)
+
+
+def join_prof_path(root_dir: str, sub_dir: str) -> str:
+    """
+    regular expression matching method for path concatenation
+    """
+    if is_regex_pattern(sub_dir):
+        for root, _, _ in os.walk(root_dir, onerror=walk_error_handler):
+            if re.match(sub_dir, os.path.basename(root)):
+                return root
+        logger.debug("Fail to get profiling path %s from local path %s by regular expression matching",
+                     sub_dir, root_dir)
+    else:
+        sub_dir = os.path.join(root_dir, sub_dir)
+        if os.path.exists(sub_dir):
+            return sub_dir
+        logger.debug("Fail to get profiling path %s from local path %s", sub_dir, root_dir)
+    return ""
+
+
+def format_excel_title(title: str) -> str:
+    """
+    format excel title
+    """
+    title = title.lower()
+    title = title.replace("(us)", '')
+    title = title.replace("(ns)", '')
+    title = title.replace("(%)", '')
+    title = title.replace(" ", "_")
+    return title
+
+
+def format_float(num: float) -> float:
+    """
+    format float num, round to 2 decimal places
+    """
+    return round(num, 2)
+
+
+class SafeOpen:
+    """
+    safe open to check file
+    """
+
+    # pylint: disable=consider-using-with
+    def __init__(self, name, mode='r', encoding=None):
+        self.file = None
+        if not os.path.exists(name):
+            logger.warning("%s not exist, please check", name)
+            return
+
+        if os.access(name, os.R_OK):
+            self.file = open(name, mode, encoding=encoding, errors="ignore")
+        else:
+            logger.warning("%s can not be read, check the permissions", name)
+
+    def __enter__(self):
+        return self.file
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.file:
+            self.file.close()
+        # returning True suppresses any exception raised inside the "with" block
+        return True
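+
+# Usage sketch (illustrative): SafeOpen yields None when the file is missing or
+# unreadable, so callers must check the handle before using it.
+#
+#     with SafeOpen("op_summary.csv", encoding="utf-8") as f:
+#         rows = f.readlines() if f else []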
+
+def save_downloaded_file(response, url_path, file_save_path):
+    """Save the file carried in the response body.
+
+    Args:
+        response: response object returned by the request
+        url_path: URL of the remote file
+        file_save_path: directory to save the file into
+    """
+    # take the file name from the URL path and append it to the save directory
+    file_save_path = os.path.normpath(file_save_path)
+    file_name = os.path.basename(url_path)
+    final_file_path = os.path.join(file_save_path, file_name)
+    # create the target directory automatically if it does not exist
+    if not os.path.exists(file_save_path):
+        os.makedirs(file_save_path)
+    if response.status_code <= 300:
+        logger.debug("Response status code is %s", response.status_code)
+        flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL
+        modes = stat.S_IWUSR | stat.S_IRUSR
+        # if the file already exists, remove it and save the latest one
+        if os.path.exists(final_file_path):
+            os.remove(final_file_path)
+        # save the file
+        with os.fdopen(os.open(final_file_path, flags, modes), mode="wb") as f:
+            f.write(response.content)
+        logger.info("Succeeded in saving content to: %s", os.path.abspath(final_file_path))
+    else:
+        # warn when the response code is not an expected value
+        logger.warning("Failed to save the response body. The response status code is %s. "
+                       "Please check the network or try another region", response.status_code)
+
+
+def request_with_retry(url_path, region_name=None):
+    """Fetch a file with requests, retrying on failure: at most max_retries + 1 requests in total.
+
+    Args:
+        url_path: URL of the remote file
+        region_name: region the file is fetched from (used for logging)
+    """
+    logger.debug("Requesting or retrying to get file from region: %s", region_name)
+
+    # a save path set through the environment variable takes precedence; when it is
+    # empty, fall back to the default cloud rule path constant.CLOUD_RULE_PATH
+    file_save_path = os.path.join(os.path.expanduser("~"), const.CLOUD_RULE_PATH)
+    if os.getenv(const.ADVISOR_RULE_PATH):
+        file_save_path = os.getenv(const.ADVISOR_RULE_PATH)
+
+    session = requests.Session()
+    # every request made through this session is retried up to max_retries times,
+    # so in the worst case max_retries + 1 requests are sent, counting the first one
+    adapter = HTTPAdapter(max_retries=const.MAX_RETRIES)
+    session.mount(const.HTTP_PREFIXES, adapter)
+    session.mount(const.HTTPS_PREFIXES, adapter)
+
+    logger.debug('Session try to get response')
+    response = None
+    try:
+        response = session.get(url_path, timeout=const.TIMEOUT)
+    except Exception as e:
+        logger.debug("Error: %s: %s", e, traceback.format_exc())
+
+    if response is None:
+        logger.warning("Fail to download file from region: %s, response is None, "
+                       "please use the environment variable %s for more detailed information",
+                       region_name, const.ADVISOR_LOG_LEVEL)
+    else:
+        try:
+            # raise_for_status raises HTTPError for status codes in the 400-600 range,
+            # which skips the call to save_downloaded_file below
+            response.raise_for_status()
+            save_downloaded_file(response, url_path=url_path, file_save_path=file_save_path)
+        except Exception as e:
+            logger.warning("Error: %s: %s", e, traceback.format_exc())
+    # close the session and clear all adapters
+    session.close()
+
+
+def read_csv(file):
+    import csv
+
+    raw_data = []
+    logger.debug("Parse file %s", file)
+    with SafeOpen(file, encoding="utf-8") as csv_file:
+        try:
+            csv_content = csv.reader(csv_file)
+            for row in csv_content:
+                raw_data.append(row)
+        except OSError as error:
+            logger.error("Read csv file failed : %s", error)
+            return []
+
+    return raw_data
+
+
+def get_file_path_by_walk(root, filename):
+    file_path = ""
+    for root, _, files in os.walk(root, topdown=True):
+        for name in files:
+            if name == filename:
+                file_path = os.path.join(root, name)
+                return file_path
+    return file_path
diff --git a/profiler/advisor/version.py b/profiler/advisor/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a95cc3c0f93f49a2aaacf483770462d09961ff9
--- /dev/null
+++ b/profiler/advisor/version.py
@@ -0,0 +1,38 @@
+import sys
+
+
+def get_package_version(package_name) -> str:
+    """
+    Get package version info by importlib
+    Args:
+        package_name: package name
+
+    Returns:
+        version: version info string
+    """
+    if sys.version_info >= (3, 8):
+        # importlib_metadata was moved into the standard library as importlib.metadata in Python 3.8
+        from importlib import metadata
+        from importlib.metadata import PackageNotFoundError
+    else:
+        import importlib_metadata as metadata
+        from importlib_metadata import PackageNotFoundError
+
+    try:
+        version = metadata.version(package_name)
+    except PackageNotFoundError:
+        version = "UNKNOWN"
+    return version
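+
+# Usage sketch (illustrative; the version string depends on the environment):
+#
+#     get_package_version("click")          # e.g. "8.1.7" when click is installed
+#     get_package_version("not-a-package")  # "UNKNOWN"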
+
+
+def print_version_callback(ctx, param, value): # NOQA
+    import click
+
+    if not value or ctx.resilient_parsing:
+        return
+    click.echo('Version {}'.format(get_package_version("att_advisor")))
+    ctx.exit()
+
+
+def cli_version():
+    return get_package_version("att_advisor")
diff --git a/profiler/affinity_cpu_bind/README.md b/profiler/affinity_cpu_bind/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8c3b47ed5183fd2dbade8fc316e0319b8feea880
--- /dev/null
+++ b/profiler/affinity_cpu_bind/README.md
@@ -0,0 +1,40 @@
+# Ascend affinity CPU core-binding tool
+
+The Ascend affinity CPU core-binding tool binds task processes to cores according to the CPU affinity policy without any code changes: users simply run the tool, which can improve inference or training performance.
+
+The tool is intended for Arm server environments, for training or inference tasks that become host_bound because of CPU resource scheduling and similar issues; it can relieve that problem, but brings no noticeable gain in scenarios that are not host_bound.
+
+## Prerequisites
+
+Before using the tool, run npu-smi info -t topo manually. If output similar to the following appears, the environment supports core binding; otherwise upgrade the HDK package to Ascend HDK 23.0.RC2 or later.
+
+           NPU0    NPU1    NPU2    NPU3    NPU4    NPU5    NPU6    NPU7    NPUx    CPU Affinity
+    NPU0   X       HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    ...     xx-xx
+    NPU1   HCCS    X       HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    ...     xx-xx
+    NPU2   HCCS    HCCS    X       HCCS    HCCS    HCCS    HCCS    HCCS    ...     xx-xx
+    NPU3   HCCS    HCCS    HCCS    X       HCCS    HCCS    HCCS    HCCS    ...     xx-xx
+    NPU4   HCCS    HCCS    HCCS    HCCS    X       HCCS    HCCS    HCCS    ...     xx-xx
+    NPU5   HCCS    HCCS    HCCS    HCCS    HCCS    X       HCCS    HCCS    ...     xx-xx
+    NPU6   HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    X       HCCS    ...     xx-xx
+    NPU7   HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    X       ...     xx-xx
+    NPUx   ...     ...     ...     ...     ...     ...     ...     ...     ...     ...
+
+## Usage
+
+1. Run one of the following commands to bind cores:
+
+   - Launch and bind in one step
+```bash
+python3 bind_core.py -app/--application="inference/train cmd"
+```
+   This mode starts the training or inference task itself, detects the task processes, and binds them to cores.
+
+   - Bind after launching the training or inference task manually
+```bash
+python3 bind_core.py
+```
+   This mode polls for task processes that use NPUs (5 attempts, 10 s each; it exits if no process is found) and then binds them to cores.
+
+2. Logs written while binding are saved to bind_core_<timestamp>.log in the current directory.
+
+3. If the inference or training process needs some preprocessing time after launch before the real task starts, set the -t/--time option (in seconds) on the binding command; the tool then waits for the configured delay before binding. For example, with python3 bind_core.py -app="cmd" -t=10 the tool binds cores 10 seconds later.
diff --git a/profiler/affinity_cpu_bind/bind_core.py b/profiler/affinity_cpu_bind/bind_core.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f27e924238861be8f9041f8e9c93850d5834059
--- /dev/null
+++ b/profiler/affinity_cpu_bind/bind_core.py
@@ -0,0 +1,213 @@
+import subprocess
+import argparse
+import os
+import time
+import logging
+from datetime import datetime
+from datetime import timezone
+
+
+class PathManager:
+    DATA_FILE_AUTHORITY = 0o640
+
+    @classmethod
+    def create_file_safety(cls, path: str):
+        base_name = os.path.basename(path)
+        msg = f"Failed to create file: {base_name}"
+        if os.path.islink(path):
+            raise RuntimeError(msg)
+        if os.path.exists(path):
+            return
+        try:
+            os.close(os.open(path, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY))
+        except Exception as err:
+            raise RuntimeError(msg) from err
+
+
+class BindCoreManager():
+    DEFAULT_FIND_RUNNING_PID_TIMES = 5
+
+    def __init__(self):
+        self.npu_id_list = []
+        self.running_pid_on_npu = {}
+        self.find_running_pid_times = self.DEFAULT_FIND_RUNNING_PID_TIMES
+        self.npu_affinity_cpu_dict = {}
+        self.log_file = ''
+        self._init_log_file()
+
+    def _init_log_file(self):
+        now_time = datetime.now(tz=timezone.utc)
+        time_stamp = str(now_time.year) + '_' + \
+                     str(now_time.month) + '_' + \
+                     str(now_time.day) + '_' + \
+                     str(now_time.hour) + '_' + \
+                     str(now_time.minute) + '_' + \
+                     str(now_time.second)
+        log_file_name = 'bind_core_' + time_stamp + '.log'
+        msg = f"Failed to create file: {log_file_name}"
+        try:
+            PathManager.create_file_safety(os.path.join(os.getcwd(), log_file_name))
+        except RuntimeError as err:
+            raise RuntimeError(msg) from err
+        self.log_file = log_file_name
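+        # route this run's records to the timestamped log file created above
+        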
logging.basicConfig(filename=self.log_file, + level=logging.INFO, + filemode='w', + format='%(asctime)s-%(name)s-%(levelname)s-%(message)s') + + def _get_all_npu_id(self) -> None: + get_npu_info_cmd = 'npu-smi info -l' + get_npu_info_process = subprocess.run(get_npu_info_cmd.split(), shell=False, capture_output=True) + get_npu_id_cmd = 'grep ID' + get_npu_id_process = subprocess.run(get_npu_id_cmd.split(), shell=False, input=get_npu_info_process.stdout, capture_output=True) + res = get_npu_id_process.stdout.decode('utf-8').split() + for i in res: + if i.isdigit(): + self.npu_id_list.append(int(i)) + logging.info(f'NPU total id list: {self.npu_id_list}') + + def _get_npu_affinity(self) -> bool: + cpu_num = os.cpu_count() + cpu_num_for_each_npu = cpu_num // len(self.npu_id_list) + get_npu_topo_cmd = 'npu-smi info -t topo' + p = subprocess.run(get_npu_topo_cmd.split(), shell=False, capture_output=True) + res = p.stdout.decode('utf-8').split() + if not res: + print('[ERROR] Failed to run get npu affinity info, please check if driver version support cmd npu-smi info -t topo') + return False + + index = 0 + for v in res: + if '-' in v: + affinity_cpus = [] + cpu_lists = v.split(',') + for cpu_list in cpu_lists: + cpus = cpu_list.split('-') + if len(cpus) != 2: + continue + if int(cpus[1]) - int(cpus[0]) == cpu_num_for_each_npu - 1: + cpus[1] = str(int(cpus[1]) + cpu_num_for_each_npu) + affinity_cpus.append(cpus[0] + '-' + cpus[1]) + if index < len(self.npu_id_list): + self.npu_affinity_cpu_dict[self.npu_id_list[index]] = ','.join(affinity_cpu for affinity_cpu in affinity_cpus) + index += 1 + else: + print('[ERROR] Get affinity_cpu_list for {} npus, more than real npu num: {}'.format(index + 1, len(self.npu_id_list))) + return False + + for k in self.npu_affinity_cpu_dict.keys(): + logging.info(f'Affinity CPU list {self.npu_affinity_cpu_dict[k]} for NPU {k}') + return True + + def get_running_pid_on_npu(self) -> bool: + no_running_pids_on_npu_msg = '[INFO] Now there is no running process on all NPUs, stop bind cores' + logging.info('Begin to find running process on all NPUs') + # get running process on NPUs + for times in range(self.find_running_pid_times): + running_pid_on_npu = {} + for npu_id in self.npu_id_list: + get_npu_pids_cmd = 'npu-smi info -t proc-mem -i {} -c 0'.format(npu_id) + get_npu_pids_process = subprocess.run(get_npu_pids_cmd.split(), shell=False, capture_output=True) + res = get_npu_pids_process.stdout.decode('utf-8').split() + pid_list = [] + for value in res: + if value.startswith('id:'): + pid = value.split(':')[1] + pid_list.append(pid) + if pid_list: + running_pid_on_npu[npu_id] = list(set(pid_list)) + + if len(self.running_pid_on_npu.keys()) == len(running_pid_on_npu.keys()) and running_pid_on_npu: + self.running_pid_on_npu = running_pid_on_npu + break + + self.running_pid_on_npu = running_pid_on_npu + time.sleep(5) + + # delete repeat pid + for npu_id in self.npu_id_list: + if npu_id not in self.running_pid_on_npu: + continue + pids_on_npu = self.running_pid_on_npu[npu_id] + for npu_id_with_pids, pids in self.running_pid_on_npu.items(): + if npu_id == npu_id_with_pids: + continue + pids_on_npu = list(set(pids_on_npu) - set(pids)) + self.running_pid_on_npu[npu_id] = pids_on_npu + + if_running_process = False + for npu_id, pids in self.running_pid_on_npu.items(): + if not pids: + logging.info(f'There is no running process on NPU {npu_id}') + else: + logging.info(f'Succeed to find running process {pids} on NPU {npu_id}') + if_running_process = True + if not 
if_running_process: + print(no_running_pids_on_npu_msg) + return if_running_process + + def get_npu_info(self) -> bool: + try: + self._get_all_npu_id() + if not self._get_npu_affinity(): + return False + except subprocess.CalledProcessError: + return False + return True + + def run_bind_core(self): + if not self.running_pid_on_npu: + return + for npu, pid_list in self.running_pid_on_npu.items(): + if npu not in self.npu_affinity_cpu_dict.keys(): + logging.warning(f'Cannot find affinity cpu for npu: {npu}') + continue + affinity_cpu = self.npu_affinity_cpu_dict.get(npu) + for pid in pid_list: + try: + logging.info(f'Begin to bind cores for process {pid} on NPU {npu}') + set_affinity_cpu_cmd = 'taskset -pc {} {}'.format(affinity_cpu, pid) + p = subprocess.run(set_affinity_cpu_cmd.split(), shell=False, capture_output=True) + logging.info(p.stdout.decode('utf-8')) + except subprocess.CalledProcessError: + print('[ERROR] Failed to bind process {} on NPU {} with cpu cores list {}'.format(pid, npu, affinity_cpu)) + + logging.info(f'Succeed to bind process {pid} on NPU {npu} with cpu cores list {affinity_cpu}') + + def args_parse(self): + parser = argparse.ArgumentParser(description='This is a affinity cpu core bind script.') + parser.add_argument('-t', '--time', type=int, metavar='', help='Wait time before bind cores that you want to set. The unit is \'s\'.') + parser.add_argument('-app', '--application', metavar='', nargs='+', help='Training or inference command that you want to run.') + args = parser.parse_args() + if args.application: + application_cmd = ' '.join(args.application) + self.launch_process(application_cmd) + time.sleep(2) + # if time is set, wait for setting time before bind cores + if args.time: + time.sleep(args.time) + + def launch_process(self, cmd: list): + logging.info(f'Start to execute cmd: {cmd}') + try: + subprocess.Popen(cmd.split(), shell=False) + except subprocess.CalledProcessError as e: + raise RuntimeError(f'Failed to run cmd: {cmd}') from e + + +if __name__ == '__main__': + print('[INFO] Begin to run bind-cores script...') + bind_core_manager = BindCoreManager() + bind_core_manager.args_parse() + + if not bind_core_manager.get_npu_info(): + print('[ERROR] Failed to get current npus info') + exit() + + if not bind_core_manager.get_running_pid_on_npu(): + exit() + bind_core_manager.run_bind_core() + print('[INFO] End to run bind-cores script, the log is saved in {}'.format(bind_core_manager.log_file)) + + diff --git a/profiler/cli/__init__.py b/profiler/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eab13571c58756cc978ebc59479c86c0d1e85529 --- /dev/null +++ b/profiler/cli/__init__.py @@ -0,0 +1,4 @@ +from profiler.advisor.config.config import Config +from profiler.advisor.utils.utils import Timer + +Config().set_log_path(f"att_advisor_{Timer().strftime}.xlsx") diff --git a/profiler/cli/analyze_cli.py b/profiler/cli/analyze_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..7bd7f1722517edc2e8177d3b88af06a6217cf5f2 --- /dev/null +++ b/profiler/cli/analyze_cli.py @@ -0,0 +1,136 @@ +import click +import sys +import os +import logging +from pathlib import Path + +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)), "compare_tools")) +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)), "cluster_analyse")) + +from profiler.advisor.utils.tools import CONTEXT_SETTINGS, ClickAliasedGroup +from profiler.advisor.common import constant +from profiler.advisor.utils.utils import 
debug_option
+from profiler.advisor.interface.interface import Interface
+from profiler.cluster_analyse.cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor
+
+logger = logging.getLogger()
+
+
+def _analyze(dimensions, **kwargs):
+    result_list = []
+    job_list = []
+    if not Path(kwargs.get("profiling_path")).exists():
+        print(f"[ERROR] Profiling path does not exist. Invalid profiling path: {kwargs.get('profiling_path')}")
+        return
+
+    def is_cluster():
+        profiling_path = kwargs.get("profiling_path")
+        path_list = [os.path.join(profiling_path, dir_name) for dir_name in os.listdir(profiling_path)]
+        ascend_pt_dirs = [path for path in path_list if os.path.isdir(path) and path.endswith("ascend_pt")]
+        data_processor = PytorchDataPreprocessor(ascend_pt_dirs)
+        data_map = data_processor.get_data_map()
+        return len(data_map) > 1
+
+    # avoid rebinding the is_cluster function name with its boolean result
+    cluster_scene = is_cluster()
+
+    for dimension in dimensions:
+        if not cluster_scene and dimension == "cluster":
+            continue
+        for scope in Interface.get_scope(dimension):
+            interface = Interface(**kwargs)
+            job_list.append((dimension, scope, interface))
+
+    for i, (dimension, scope, interface) in enumerate(job_list[::-1]):
+        result_list.append(
+            interface.get_result(dimension, scope, render_html=i == len(job_list) - 1, output_dict=False, **kwargs))
+
+    for result in result_list[::-1]:
+        if result and hasattr(result, "show"):
+            result.show()
+            break
+
+
+@click.group(name="analyze", cls=ClickAliasedGroup)
+def analyze_cli(**kwargs):
+    """Analyze profiling datasets and give performance optimization suggestion."""
+    pass
+
+
+@analyze_cli.command(context_settings=CONTEXT_SETTINGS,
+                     name="all",
+                     short_help='Analyze timeline, operators and graph.')
+@click.option('--profiling_path', '-d', 'profiling_path', type=click.Path(), required=True,
+              help='Directory of profiling data')
+@click.option('--benchmark_profiling_path', '-bp', 'benchmark_profiling_path', type=click.Path(),
+              help='Directory of benchmark profiling data, used for compare performance')
+@click.option('--cann_version', '-cv', 'cann_version',
+              type=click.Choice(constant.SUPPORTED_CANN_VERSION, case_sensitive=False),
+              default=constant.DEFAULT_CANN_VERSION,
+              help='The CANN software version, which can be viewed by executing the following command: '
+                   '"cat /usr/local/Ascend/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info"')
+@click.option('--torch_version', '-tv', 'torch_version',
+              type=click.Choice(constant.SUPPORTED_TORCH_VERSION, case_sensitive=False),
+              default=constant.DEFAULT_TORCH_VERSION,
+              help='The runtime torch version, which can be detected by exec command "pip show torch"')
+# @click.option('--is_inference', is_flag=True, help="Enable performance analysis of inference task")
+@click.option("-pt",
+              "--profiling_type",
+              metavar="",
+              default=constant.ASCEND_PYTORCH_PROFILER,
+              required=False,
+              type=click.Choice(constant.SUPPORTED_PROFILING_TYPE),
+              help="Profiling type; one of ascend_pytorch_profiler, mslite, msprof")
+@debug_option
+def analyze_all(**kwargs) -> None:
+    # compare_tools currently requires two profiling paths as input; att-advisor has an
+    # equivalent capability that supports a single profiling path, and this fallback will
+    # be replaced by that implementation later
+    if not kwargs.get("benchmark_profiling_path"):
+        kwargs["benchmark_profiling_path"] = kwargs.get("profiling_path")
+    try:
+        _analyze(Interface.all_dimension, **kwargs)
+    except RuntimeError as e:
+        print(f"[ERROR] {e}")
+
+
+@analyze_cli.command(context_settings=CONTEXT_SETTINGS,
+                     name="schedule",
+                     short_help='Analyze timeline, operators and graph.')
+@click.option('--profiling_path',
'-d', 'profiling_path', type=click.Path(), required=True, + help='Directory of profiling data') +@click.option('--cann_version', '-cv', 'cann_version', + type=click.Choice(constant.SUPPORTED_CANN_VERSION, case_sensitive=False), + default=constant.DEFAULT_CANN_VERSION, + help='The CANN software version, which can be viewed by executing the following command: ' + '"cat /usr/local/Ascend/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info"') +@click.option('--torch_version', '-tv', 'torch_version', + type=click.Choice(constant.SUPPORTED_TORCH_VERSION, case_sensitive=False), + default=constant.DEFAULT_TORCH_VERSION, + help='The runtime torch version, which can be detected by exec command "pip show torch"') +@debug_option +def analyze_schedule(**kwargs) -> None: + _analyze(["schedule"], **kwargs) + + +@analyze_cli.command(context_settings=CONTEXT_SETTINGS, + name="computation", + short_help='Analyze timeline, operators and graph.') +@click.option('--profiling_path', '-d', 'profiling_path', type=click.Path(), required=True, + help='Directory of profiling data') +@click.option('--cann_version', '-cv', 'cann_version', + type=click.Choice(constant.SUPPORTED_CANN_VERSION, case_sensitive=False), + default=constant.DEFAULT_CANN_VERSION, + help='The CANN software version, which can be viewed by executing the following command: ' + '"cat /usr/local/Ascend/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info"') +@click.option('--torch_version', '-tv', 'torch_version', + type=click.Choice(constant.SUPPORTED_TORCH_VERSION, case_sensitive=False), + default=constant.DEFAULT_TORCH_VERSION, + help='The runtime torch version, which can be detected by exec command "pip show torch"') +@click.option("-pt", + "--profiling_type", + metavar="", + default=constant.ASCEND_PYTORCH_PROFILER, + required=False, + type=click.Choice(constant.SUPPORTED_PROFILING_TYPE), + help="enter the profiling type, selectable range ascend_pytorch_profiler, mslite ,msprof") +@debug_option +def analyze_computation(**kwargs) -> None: + _analyze(["computation"], **kwargs) \ No newline at end of file diff --git a/profiler/cli/cluster_cli.py b/profiler/cli/cluster_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..93a4a638f270a7aeac853943895f707c7a0c0f28 --- /dev/null +++ b/profiler/cli/cluster_cli.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
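+# cluster_cli registers the "cluster" sub-command and forwards its options, plus any
+# extra positional arguments, to cluster_analysis_main in profiler/cluster_analyse.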
+import ast +import click +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(__file__))) +from profiler.advisor.utils.tools import CONTEXT_SETTINGS, ClickAliasedGroup +from profiler.advisor.utils.utils import debug_option +from profiler.prof_common.constant import Constant +from profiler.cluster_analyse.cluster_analysis import ALL_FEATURE_LIST +from profiler.cluster_analyse.cluster_analysis import cluster_analysis_main + + +context_settings = dict(Constant.CONTEXT_SETTINGS) +context_settings['ignore_unknown_options'] = True + + +@click.command(context_settings=context_settings, name="cluster", + short_help='Analyze cluster data to locate slow nodes and slow links.') +@click.option('--profiling_path', '-d', type=click.Path(), required=True, + help='path of the profiling data') +@click.option('--mode', '-m', type=click.Choice(ALL_FEATURE_LIST), default='all') +@click.argument('args', nargs=-1) +def cluster_cli(profiling_path, mode, args) -> None: + required_args = ('-d', profiling_path, '-m', mode) + cluster_analysis_main(required_args + args) diff --git a/profiler/cli/compare_cli.py b/profiler/cli/compare_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..e794578da8c37db4f825532a20c802f162bcb066 --- /dev/null +++ b/profiler/cli/compare_cli.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
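+# compare_cli registers the "compare" sub-command; its options are packed into an
+# AnalyzeDict and consumed by ComparisonGenerator in profiler/compare_tools.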
+import ast +import click +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(__file__))) + +from profiler.prof_common.constant import Constant +from profiler.prof_common.analyze_dict import AnalyzeDict +from profiler.compare_tools.compare_backend.comparison_generator import ComparisonGenerator + +@click.command(context_settings=Constant.CONTEXT_SETTINGS, name="compare", + short_help='Compare the performance differences between GPUs and NPUs.') +@click.option('--profiling_path', '-d', 'comparison_profiling_path', type=click.Path(), required=True, + help='path of the profiling data') +@click.option('--benchmark_profiling_path', '-bp', 'base_profiling_path', type=click.Path(), required=True) +@click.option('--enable_profiling_compare', is_flag=True) +@click.option('--enable_operator_compare', is_flag=True) +@click.option('--enable_memory_compare', is_flag=True) +@click.option('--enable_communication_compare', is_flag=True) +@click.option('--disable_details', is_flag=True) +@click.option('--output_path', '-o', 'output_path', type=click.Path()) +@click.option('--max_kernel_num', 'max_kernel_num', type=int, help="The number of kernels per torch op is limited.") +@click.option('--op_name_map', type=ast.literal_eval, default='{}', + help="The mapping of operator names equivalent to GPUs and NPUs in the form of dictionaries.", + required=False) +@click.option('--use_input_shape', is_flag=True) +@click.option('--gpu_flow_cat', type=str, default='', help="Identifier of the GPU connection.") +def compare_cli(**kwargs) -> None: + args = AnalyzeDict(kwargs) + try: + ComparisonGenerator(args).run() + except RuntimeError as e: + print(f"[ERROR] {e}") diff --git a/profiler/cli/complete_cli.py b/profiler/cli/complete_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4efd0af90daa84b7ae5c3a0b2462dc52873da5 --- /dev/null +++ b/profiler/cli/complete_cli.py @@ -0,0 +1,29 @@ +import click + +from profiler.advisor.utils.tools import CONTEXT_SETTINGS + + +@click.command(context_settings=CONTEXT_SETTINGS, + short_help='Auto complete ma-advisor command in terminal, support "bash(default)/zsh/fish".') +@click.argument('shell_type', nargs=1, default="Bash", type=click.Choice(["Bash", "Zsh", "Fish"], case_sensitive=False)) +def auto_complete_cli(shell_type): + """ + Auto complete ma-advisor command in terminal. + + Example: + + \b + # print bash auto complete command to terminal + msprof-analyze auto-completion Bash + """ + click.echo("Tips: please paste following shell command to your terminal to activate auto completion.\n") + if shell_type.lower() == "bash": + bash_str = 'eval "$(_MSPROF_ANALYZE_COMPLETE=bash_source msprof-analyze)"' + elif shell_type.lower() == "zsh": + bash_str = 'eval "$(_MSPROF_ANALYZE_COMPLETE=zsh_source msprof-analyze)"' + elif shell_type.lower() == "fish": + bash_str = 'eval (env _MSPROF_ANALYZE_COMPLETE=fish_source msprof-analyze)' + else: + click.echo(f'Unsupported shell type {shell_type}.') + return + click.echo(f'{bash_str}\n') diff --git a/profiler/cli/entrance.py b/profiler/cli/entrance.py new file mode 100644 index 0000000000000000000000000000000000000000..a260553031ecfc904ae8411d944037bdb2f101ab --- /dev/null +++ b/profiler/cli/entrance.py @@ -0,0 +1,67 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import click
+
+from profiler.cli.analyze_cli import analyze_cli
+from profiler.cli.complete_cli import auto_complete_cli
+from profiler.cli.compare_cli import compare_cli
+from profiler.cli.cluster_cli import cluster_cli
+from profiler.advisor.version import print_version_callback, cli_version
+
+logger = logging.getLogger()
+CONTEXT_SETTINGS = dict(help_option_names=['-H', '-h', '--help'],
+                        max_content_width=160)
+
+COMMAND_PRIORITY = {
+    "advisor": 1,
+    "compare": 2,
+    "cluster": 3,
+    "auto-completion": 4
+}
+
+
+class SpecialHelpOrder(click.Group):
+
+    def __init__(self, *args, **kwargs):
+        super(SpecialHelpOrder, self).__init__(*args, **kwargs)
+
+    def list_commands_for_help(self, ctx):
+        """
+        Reorder the list of commands when listing the help.
+        """
+        commands = super(SpecialHelpOrder, self).list_commands(ctx)
+        return [item[1] for item in sorted((COMMAND_PRIORITY.get(command, float('INF')),
+                                            command) for command in commands)]
+
+    def get_help(self, ctx):
+        self.list_commands = self.list_commands_for_help
+        return super(SpecialHelpOrder, self).get_help(ctx)
+
+
+@click.group(context_settings=CONTEXT_SETTINGS, cls=SpecialHelpOrder)
+@click.option('--version', '-V', '-v', is_flag=True,
+              callback=print_version_callback, expose_value=False,
+              is_eager=True, help=cli_version())
+def msprof_analyze_cli(**kwargs):
+    pass
+
+
+msprof_analyze_cli.add_command(analyze_cli, name="advisor")
+msprof_analyze_cli.add_command(compare_cli, name="compare")
+msprof_analyze_cli.add_command(cluster_cli, name="cluster")
+msprof_analyze_cli.add_command(auto_complete_cli, name="auto-completion")
diff --git a/profiler/cluster_analyse/README.md b/profiler/cluster_analyse/README.md
index 7cdb2d2c1e68da2cbbc00629ddf06b2ae48a28c2..989122375db65d32047555d29dd25ed9289764ea 100644
--- a/profiler/cluster_analyse/README.md
+++ b/profiler/cluster_analyse/README.md
@@ -21,27 +21,74 @@ experimental_config = torch_npu.profiler._ExperimentalConfig(
 - ./ASCEND_PROFILER_OUTPUT/communication.json,
 - ./ASCEND_PROFILER_OUTPUT/communication_matrix.json

-After confirming that these files are generated, continue with the cluster analysis below.
-
-## Data Aggregation and Cluster Parsing
-
-Copy and gather the data of all cards into one directory, then run the following command in that directory to generate the cluster_analysis_output folder.
-
-```shell
-python3 cluster_analysis.py -d {cluster profiling data path} -m {mode}
-```
-### Parameter Description
+Or, alternatively, the following files are present:

-| Parameter | Description | Required |
-| ---------------------- | --------------------------------------- | --------------------------------------- |
-| --collection_path or -d | Directory where the profiling data is gathered. After the analysis script runs, a cluster_analysis_output folder is created under this directory to store the analysis results. | Yes |
-| --mode or -m | Data parsing mode. Valid values: communication_matrix (parse communication matrix data), communication_time (parse communication duration data), all (parse both). Default: all. | No |
+- analysis.db
+- ascend_pytorch_profiler_{rank_id}.db

-## Deliverables
+Only one category of the above files (csv/json or db) may exist at a time; otherwise the cluster analysis tool fails to parse them.

-The deliverables of the cluster analysis tool are displayed in Ascend Insight. For details, see the "MindStudio Visual Tuning Tool Guide (Ascend Insight)".
+After confirming that these files are generated, continue with the cluster analysis below.

-### cluster_step_trace_time.csv
+## Data Aggregation and Parsing
+
+### Procedure
+
+1. Install the tool by referring to [Performance Tools](../README.md). The latest version is recommended.
+
+2. Copy and gather the data of all cards into one directory, then run either of the following commands in that directory to generate the cluster_analysis_output folder.
+
+   ```bash
+   msprof-analyze cluster -d {cluster profiling data path} -m {mode}
+   ```
+
+   or
+
+   ```bash
+   python3 cluster_analysis.py -d {cluster profiling data path} -m {mode}
+   ```
+
+   Parameter description:
+
+   | Parameter | Description | Required |
+   | --------------------- | ------------------------------------------------------------ | -------- |
+   | --collection_path or -d | Directory where the profiling data is gathered. After the analysis script runs, a cluster_analysis_output folder is created under this directory to store the analysis results. | Yes |
+   | --mode or -m | Data parsing mode; see the "**--mode parameter description**" table for valid values. | No |
+   | --parallel_mode | Concurrency mode used when collecting multi-card, multi-node db data. Valid value: concurrent (concurrency implemented with a concurrent.futures process pool).<br>**Configurable only when -m is set to cann_api_sum, compute_op_sum, hccl_sum, or mstx_sum.** | No |
+   | --export_type | Output format. Valid values: db (.db file) and notebook (Jupyter Notebook file). Default: db.<br>**Configurable only when -m is set to cann_api_sum, compute_op_sum, hccl_sum, or mstx_sum.** | No |
+   | --rank_list | Restricts the statistics to specific ranks. Default: all (statistics over all ranks). Must be configured with the actual Rank IDs of the cards, as integers greater than or equal to 0. If a configured value exceeds the Rank IDs actually used in training, only the data of valid Rank IDs is parsed. For example, if the environment has Rank IDs 0 to 7 but training runs only on cards 0 to 3, configuring Rank IDs 0, 3, 4 or a nonexistent value such as 10 parses only ranks 0 and 3. Example: --rank_list 0, 1, 2.<br>**Configurable only when -m is set to cann_api_sum, compute_op_sum, hccl_sum, or mstx_sum.** | No |
+   | --top_num | Number of top communication operators by duration. Default: 15. Example: --top_num 20.<br>**Configurable only when -m is set to hccl_sum.** | No |
+
+   --mode parameter description:
+
+   | Parameter | Description | Required |
+   | -------------------- | ------------------------------------------------------------ | -------- |
+   | communication_matrix | Parses communication matrix data. | No |
+   | communication_time | Parses communication duration data. | No |
+   | all | Parses both communication_matrix and communication_time data; this is the default value of --mode. | No |
+   | cann_api_sum | Cluster-wide API performance summary. The input performance data must be based on ascend_pytorch_profiler_{rank_id}.db files. With --export_type db, the deliverable is cluster_analysis.db; with --export_type notebook, the deliverable stats.ipynb is written to cluster_analysis_output/CannApiSum. | No |
+   | compute_op_sum | Cluster-wide summary of device compute operator information. The input performance data must be based on ascend_pytorch_profiler_{rank_id}.db files. With --export_type db, the deliverable is cluster_analysis.db; with --export_type notebook, the deliverable stats.ipynb is written to cluster_analysis_output/ComputeOpSum. | No |
+   | hccl_sum | Collective communication operator duration analysis. The input performance data must be based on ascend_pytorch_profiler_{rank_id}.db files. With --export_type db, the deliverable is cluster_analysis.db; with --export_type notebook, the deliverable stats.ipynb is written to cluster_analysis_output/HcclSum. | No |
+   | mstx_sum | Cluster-wide summary of mstx marker information. The input performance data must be based on ascend_pytorch_profiler_{rank_id}.db files. With --export_type db, the deliverable is cluster_analysis.db; with --export_type notebook, the deliverable stats.ipynb is written to cluster_analysis_output/MstxSum. | No |
+
+   Example with --parallel_mode:
+
+   ```bash
+   msprof-analyze cluster -d {cluster profiling data path} -m cann_api_sum --parallel_mode concurrent
+   ```
+
+   or
+
+   ```bash
+   python3 cluster_analysis.py -d {cluster profiling data path} -m cann_api_sum --parallel_mode concurrent
+   ```
+
+
+### Deliverables
+
+The deliverables of the cluster analysis tool are displayed in Ascend Insight. For details, see the [MindStudio Ascend Insight User Guide](https://www.hiascend.com/document/detail/zh/mindstudio/70RC1/GUI-baseddevelopmenttool/msascendinsightug/AscendInsight_0002.html).
+
+#### cluster_step_trace_time.csv

 Generated when the data parsing mode is communication_matrix, communication_time, or all.

@@ -67,6 +114,8 @@ Column J: Bubble time, the sum of receive durations.

 Column K: Communication (Not Overlapped and Exclude Receive), the communication time that is not overlapped, excluding receive operators.

+Column L: Preparing, the time from the start of an iteration until the first compute or communication operator runs.
+
 **Tips**: First filter column B for type stage to check for problems between stages, then filter column B for type rank to check for problems between ranks, and troubleshoot using the following points.

 * Use differences in Computing time to judge whether there is a slow card or a load imbalance.
@@ -79,7 +128,7 @@ Column K: Communication (Not Overlapped and Exclude Receive), the communication

 In theory, all of the above times should be roughly level, i.e. the maximum exceeds the minimum by less than 5%; otherwise there may be a slow card.

-### cluster_communication_matrix.json
+#### cluster_communication_matrix.json

 Generated when the data parsing mode is communication_matrix or all.

@@ -99,8 +148,33 @@ Column K: Communication (Not Overlapped and Exclude Receive), the communication
 - "HCCS" or "PCIE" is an intra-node, chip-to-chip copy; a rate of about 18 GB/s or higher is normal.
 - "RDMA" is an inter-node copy; on 910A a rate of about 12 GB/s or higher is expected.

-### cluster_communication.json
+#### cluster_communication.json

 Generated when the data parsing mode is communication_time or all.

 Mainly contains communication duration data.
+
+#### cluster_analysis.db
+
+Deliverable generated by parsing analysis.db or ascend_pytorch_profiler_{rank_id}.db. The parsed data depends on the data parsing mode, and the result can be displayed with Ascend Insight.
+
+#### stats.ipynb
+
+- Generated when the data parsing mode is cann_api_sum; saved in the cluster_analysis_output/CannApiSum directory.
+
+  Can be opened with Jupyter Notebook or Ascend Insight; mainly presents cluster-wide API duration information.
+
+- Generated when the data parsing mode is compute_op_sum; saved in the cluster_analysis_output/ComputeOpSum directory.
+
+  Can be opened with Jupyter Notebook or Ascend Insight; mainly presents the cluster-wide compute operator duration analysis (all compute operators of the cluster aggregated and charted) and the per-rank compute operator duration analysis (compute operators aggregated per rank).
+
+- Generated when the data parsing mode is hccl_sum; saved in the cluster_analysis_output/HcclSum directory.
+
+  Can be opened with Jupyter Notebook or Ascend Insight; mainly presents the cluster-wide communication operator duration analysis (all communication operators aggregated and charted), the per-rank communication operator duration analysis (communication operators aggregated per rank), and the top communication operators.
+
+- Generated when the data parsing mode is mstx_sum; saved in the cluster_analysis_output/MstxSum directory.
+
+  Can be opened with Jupyter Notebook or Ascend Insight; mainly presents the cluster-wide mstx marker information, split into framework-side, CANN-side and device-side markers.
+
diff --git a/profiler/cluster_analyse/analysis/analysis_facade.py b/profiler/cluster_analyse/analysis/analysis_facade.py
index b383a704df27d18e0191b2b251efd9de61beee55..435d77b21bff423b207bf050ea660a1738f0fe5f 100644
--- a/profiler/cluster_analyse/analysis/analysis_facade.py
+++ b/profiler/cluster_analyse/analysis/analysis_facade.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,14 +14,16 @@
 # limitations under the License.
 from multiprocessing import Process
-from common_func.constant import Constant
+
 from analysis.communication_analysis import CommunicationAnalysis
+from analysis.comm_matrix_analysis import CommMatrixAnalysis
 from analysis.step_trace_time_analysis import StepTraceTimeAnalysis
-from analysis.communication_analysis import CommMatrixAnalysis
-
+from analysis.host_info_analysis import HostInfoAnalysis
+from common_func.context import Context
+from common_func.constant import Constant

 class AnalysisFacade:
-    analysis_module = {CommunicationAnalysis, StepTraceTimeAnalysis, CommMatrixAnalysis}
+    default_module = {CommunicationAnalysis, StepTraceTimeAnalysis, CommMatrixAnalysis, HostInfoAnalysis}

     def __init__(self, params: dict):
         self.params = params
@@ -29,10 +31,20 @@ class AnalysisFacade:
     def cluster_analyze(self):
         # Handle multiple profiler outputs with multiple processes
         process_list = []
-        for analysis in self.analysis_module:
+        for analysis in self.default_module:
             process = Process(target=analysis(self.params).run)
             process.start()
             process_list.append(process)

         for process in process_list:
             process.join()
+
+    def recipe_analyze(self):
+        HostInfoAnalysis(self.params).run()
+        print("[INFO] Recipe analysis launched.")
+        try:
+            with Context.create_context(self.params.get(Constant.PARALLEL_MODE)) as context:
+                with self.params.get(Constant.RECIPE_CLASS)(self.params) as recipe:
+                    recipe.run(context)
+        except Exception as e:
+            print("[ERROR] Recipe analysis failed to launch, %s." % str(e))
diff --git a/profiler/cluster_analyse/analysis/base_analysis.py b/profiler/cluster_analyse/analysis/base_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..7209e9b56f04cc6e97e4331db2ca48ba18a67ed6
--- /dev/null
+++ b/profiler/cluster_analyse/analysis/base_analysis.py
@@ -0,0 +1,255 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
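The new base_analysis.py below adds BaseRecipeAnalysis, which the facade's recipe_analyze drives as a context manager together with a parallel context. A minimal sketch of that contract, with toy stand-ins (ToyContext and ToyRecipe are illustrative only, not the real Context API or recipe classes):

```python
# Both the parallel context and the recipe are context managers; the recipe's
# run() receives the context and uses it to schedule its mapper tasks.
class ToyContext:
    def __enter__(self):
        return self
    def __exit__(self, *exc):
        return False
    def map(self, func, iterable):
        return [func(item) for item in iterable]  # serial stand-in for a process pool
    def wait(self, futures):
        return futures                            # results already materialized here


class ToyRecipe:
    def __enter__(self):
        return self
    def __exit__(self, *exc):
        return False                              # let exceptions surface to the facade
    def run(self, context):
        results = context.wait(context.map(lambda rank: rank * 2, [0, 1, 2]))
        print(results)


with ToyContext() as context:
    with ToyRecipe() as recipe:
        recipe.run(context)                       # prints [0, 2, 4]
```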
+
+import os
+import sys
+import traceback
+import shutil
+import pandas as pd
+from abc import abstractmethod
+
+from common_func.constant import Constant
+from common_func.file_manager import FileManager
+from common_func.db_manager import DBManager
+from common_func.utils import convert_unit
+from cluster_utils.data_transfer_adapter import DataTransferAdapter
+
+
+class BaseAnalysis:
+    MAX_RANKS = 1000
+    def __init__(self, param: dict):
+        self.collection_path = param.get(Constant.COLLECTION_PATH)
+        self.data_map = param.get(Constant.DATA_MAP)
+        self.data_type = param.get(Constant.DATA_TYPE)
+        self.communication_ops = []
+        self.collective_group_dict = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP)
+        self.comm_ops_struct = {}
+        self.adapter = DataTransferAdapter()
+
+    @staticmethod
+    def compute_ratio(dividend: float, divisor: float):
+        if abs(divisor) < Constant.EPS:
+            return 0
+        else:
+            return round(dividend / divisor, 4)
+
+    @staticmethod
+    def check_add_op(op_name: str):
+        """
+        Compatible with both data versions: decide whether this operator's info should be accumulated.
+        """
+        stat_list = ["middle", "top", "bottom", "total"]
+        total = "total"
+        for stat_name in stat_list:
+            if stat_name in op_name:
+                if stat_name != total:
+                    return False
+        return True
+
+    @abstractmethod
+    def run(self):
+        pass
+
+    def dump_data(self):
+        if not self.comm_ops_struct:
+            print("[WARNING] There is no final comm ops data generated.")
+            return
+        if self.data_type == Constant.TEXT:
+            self.dump_json()
+        else:
+            if len(self.data_map) >= self.MAX_RANKS:
+                print("[WARNING] The number of ranks is too large to dump to db; it will be dumped to a json file.")
+                self.dump_json()
+            else:
+                self.dump_db()
+
+    @abstractmethod
+    def dump_db(self):
+        pass
+
+    def dump_json(self):
+        output_comm_data = {}
+        for key in self.comm_ops_struct:
+            output_comm_data[str(key)] = self.comm_ops_struct.get(key)
+        FileManager.create_json_file(self.collection_path, output_comm_data, self.SAVED_JSON)
+
+    def split_op_by_group(self):
+        for single_op in self.communication_ops:
+            if single_op.get(Constant.COMM_OP_TYPE) == Constant.P2P:
+                rank_tup = Constant.P2P
+            else:
+                rank_tup = tuple(self.collective_group_dict.get(single_op.get(Constant.GROUP_NAME), []))
+            rank_id = single_op.get(Constant.RANK_ID, 'N/A')
+            step_id = single_op.get(Constant.STEP_ID, 'N/A')
+            op_name = single_op.get(Constant.COMM_OP_NAME, 'N/A')
+            op_info = single_op.get(Constant.COMM_OP_INFO)
+            self.comm_ops_struct.setdefault(rank_tup, {}).setdefault(step_id, {}).\
+                setdefault(op_name, {}).setdefault(rank_id, op_info)
+
+    def combine_ops_total_info(self):
+        for rank_tup, group_dict in self.comm_ops_struct.items():
+            for step_id, communication_ops in group_dict.items():
+                self.compute_total_info(communication_ops)
+
+
+class BaseRecipeAnalysis:
+
+    UNIT = "Us"
+    DB_UNIT = "Ns"
+
+    RANK_LIST = "rank_list"
+
+    def __init__(self, params):
+        self._params = params
+        self._collection_dir = params.get(Constant.COLLECTION_PATH, "")
+        self._data_map = params.get(Constant.DATA_MAP, {})
+        self._recipe_name = params.get(Constant.RECIPE_NAME, "")
+        self._mode = params.get(Constant.PARALLEL_MODE, "")
+        self._export_type = params.get(Constant.EXPORT_TYPE, "")
+        self._output_dir = None
+        self._rank_list = params.get(self.RANK_LIST, 'all')
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self._params is not None and exc_type is not None:
+            print(f"[ERROR] Failed to exit analysis: {exc_val}")
+            traceback.print_exc(file=sys.stdout)
+
+    def run(self, context):
+        pass
+
+    @property
+    def base_dir(self):
+        return os.path.basename(os.path.dirname(__file__))
+
+    def _get_rank_db(self):
+        invalid_rank_id = []
+        if self._rank_list == 'all':
+            rank_ids = list(self._data_map.keys())
+        else:
+            rank_ids = []
+            for rank_id in self._rank_list:
+                if rank_id in self._data_map.keys():
+                    rank_ids.append(rank_id)
+                else:
+                    invalid_rank_id.append(str(rank_id))
+        db_paths = []
+        for rank_id in rank_ids:
+            rank_path = self._data_map[rank_id]
+            db_path = os.path.join(rank_path, Constant.SINGLE_OUTPUT, f"ascend_pytorch_profiler_{rank_id}.db")
+            if os.path.exists(db_path):
+                db_paths.append((rank_id, db_path))
+            else:
+                print(f"[WARNING] DB file not found, rank id: {rank_id}, db path: {db_path}.")
+        if invalid_rank_id:
+            print(f"[WARNING] Invalid rank id: [{','.join(invalid_rank_id)}].")
+        return db_paths
+
+    def get_mode(self):
+        return self._mode
+
+    def get_recipe_name(self):
+        return self._recipe_name
+
+    def dump_data(self, data, file_name, table_name=None, index=True):
+        output_path = os.path.join(self._collection_dir, Constant.CLUSTER_ANALYSIS_OUTPUT)
+        if table_name:
+            result_db = os.path.join(output_path, file_name)
+            conn, cursor = DBManager.create_connect_db(result_db)
+            if isinstance(data, pd.DataFrame):
+                data.to_sql(table_name, conn, if_exists='replace', index=index)
+            else:
+                print(f"[ERROR] Unknown dump data type: {type(data)}")
+            DBManager.destroy_db_connect(conn, cursor)
+        else:
+            result_csv = os.path.join(output_path, file_name)
+            if isinstance(data, pd.DataFrame):
+                data = convert_unit(data, self.DB_UNIT, self.UNIT)
+                data.to_csv(result_csv, index=index)
+            else:
+                print(f"[ERROR] Unknown dump data type: {type(data)}")
+
+    def _create_output_dir_name(self, name):
+        i = 1
+        while os.path.exists(f"{name}-{i}"):
+            i += 1
+        return f"{name}-{i}"
+
+    def _create_unique_output_dir(self):
+        output_dir = os.path.join(self._collection_dir, Constant.CLUSTER_ANALYSIS_OUTPUT, self._recipe_name)
+
+        if os.path.exists(output_dir):
+            return self._create_output_dir_name(output_dir)
+        return output_dir
+
+    def _get_output_dir(self):
+        if self._output_dir is None:
+            self._output_dir = self._create_unique_output_dir()
+            os.makedirs(self._output_dir)
+        return self._output_dir
+
+    def create_notebook(self, filename, notebook_template_dir=None, replace_dict=None):
+        if notebook_template_dir is None:
+            template_path = os.path.dirname(__file__)
+        else:
+            template_path = notebook_template_dir
+        output_path = os.path.join(self._get_output_dir(), filename)
+        template_file = os.path.join(template_path, self.base_dir, filename)
+        if replace_dict is None:
+            shutil.copy(template_file, output_path)
+        else:
+            with open(template_file, 'r') as f:
+                template_content = f.read()
+            for key, value in replace_dict.items():
+                template_content = template_content.replace(str(key), str(value))
+            with open(output_path, 'w') as f:
+                f.write(template_content)
+        print(f"[INFO] Notebook export path is: {self._get_output_dir()}")
+
+    def add_helper_file(self, helper_file):
+        helper_output_path = os.path.join(self._get_output_dir(), helper_file)
+        helper_file_path = os.path.join(os.path.dirname(__file__), helper_file)
+
+        if os.path.exists(helper_file_path):
+            shutil.copy(helper_file_path, helper_output_path)
+
+    @staticmethod
+    def _filter_data(mapper_data):
+        return [(rank, data) for rank, data in mapper_data if data is not None and len(data) != 0]
+
+    @classmethod
+    def add_parser_argument(cls, parser):
+        parser.add_argument("--rank_list", type=str, help="Rank id list", default='all')
+
+    @classmethod
+    def parse_argument(cls, args_parsed) -> dict:
+        if args_parsed.rank_list == 'all':
+            return {
+                cls.RANK_LIST: 'all'
+            }
+        else:
+            rank_str_list = args_parsed.rank_list.split(",")
+            rank_list = [int(rank) for rank in rank_str_list if rank.strip().isdigit()]
+            return {
+                cls.RANK_LIST: rank_list
+            }
+
+    @classmethod
+    def get_extra_argument(cls, params) -> dict:
+        return {
+            cls.RANK_LIST: params.get(cls.RANK_LIST, "all")
+        }
diff --git a/profiler/cluster_analyse/analysis/cann_api_sum/__init__.py b/profiler/cluster_analyse/analysis/cann_api_sum/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7101187a2c2619f3b1c20dded14b433950b4c662
--- /dev/null
+++ b/profiler/cluster_analyse/analysis/cann_api_sum/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/profiler/cluster_analyse/analysis/cann_api_sum/cann_api_sum.py b/profiler/cluster_analyse/analysis/cann_api_sum/cann_api_sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..db37b004b150eaa65b9c9cd4e12f1f5bdc0836e9
--- /dev/null
+++ b/profiler/cluster_analyse/analysis/cann_api_sum/cann_api_sum.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
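The cann_api_sum recipe that follows reduces per-rank statistics into cluster-wide ones with pandas groupby, taking the minimum of per-rank minima and the maximum of per-rank maxima and recording the rank that holds each extreme. A hedged sketch of that aggregation idea (column names and values here are illustrative, not the exact schema):

```python
import pandas as pd

# Per-rank stats for one API across two ranks (values are made up).
df = pd.DataFrame({
    "name": ["aclnnAdd", "aclnnAdd"],
    "rank": [0, 1],
    "totalTimeNs": [1200, 1800],
    "totalCount": [10, 12],
    "minNs": [80, 95],
    "maxNs": [210, 260],
})

grouped = df.groupby("name")
agg = pd.DataFrame({
    "totalTimeNs": grouped["totalTimeNs"].sum(),
    "totalCount": grouped["totalCount"].sum(),
    "minNs": grouped["minNs"].min(),   # cluster-wide minimum
    "maxNs": grouped["maxNs"].max(),   # cluster-wide maximum
})
agg["averageNs"] = agg["totalTimeNs"] / agg["totalCount"]
# Record which rank holds the extreme value alongside the value itself.
agg["minRank"] = grouped.apply(lambda g: g.loc[g["minNs"].idxmin(), "rank"])
agg["maxRank"] = grouped.apply(lambda g: g.loc[g["maxNs"].idxmax(), "rank"])
print(agg)
```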
+ +import os +import pandas as pd + +from analysis.base_analysis import BaseRecipeAnalysis +from common_func.constant import Constant +from common_func.utils import stdev +from cluster_statistics_export.cann_api_sum_export import CannApiSumExport + + +class CannApiSum(BaseRecipeAnalysis): + + def __init__(self, params): + super().__init__(params) + print("[INFO] CannApiSum init.") + + @property + def base_dir(self): + return os.path.basename(os.path.dirname(__file__)) + + @staticmethod + def _mapper_func(data_map, analysis_class): + df = CannApiSumExport(data_map[1], analysis_class).read_export_db() + + if df is None or df.empty: + print(f"[WARNING] There is no stats data in {data_map[1]}.") + return None + return data_map[0], df + + def mapper_func(self, context): + return context.wait( + context.map( + self._mapper_func, + self._get_rank_db(), + analysis_class=self._recipe_name + ) + ) + + def reducer_func(self, mapper_res): + stats_rank_data = self._filter_data(mapper_res) + if not stats_rank_data: + print("[ERROR] Mapper data is None.") + return + stats_rank_data = [df.assign(rank=rank) for rank, df in stats_rank_data] + stats_rank_data = pd.concat(stats_rank_data) + stats_data = self._aggregate_stats(stats_rank_data) + if self._export_type == "db": + self.dump_data(stats_rank_data, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, "CannApiSumRank") + self.dump_data(stats_data, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, "CannApiSum") + elif self._export_type == "notebook": + self.dump_data(stats_rank_data, os.path.join(self._get_output_dir(), "rank_stats.csv"), index=False) + self.dump_data(stats_data, os.path.join(self._get_output_dir(), "all_stats.csv")) + self.save_notebook() + else: + print("[ERROR] Unknown export type.") + + def run(self, context): + mapper_res = self.mapper_func(context) + self.reducer_func(mapper_res) + + @staticmethod + def _aggregate_stats(stats_res): + grouped = stats_res.groupby("name") + res = {} + total_time = grouped["totalTimeNs"].sum() + res["timeRatio"] = total_time / total_time.sum() * 100.0 + res["totalTimeNs"] = total_time + res["totalCount"] = grouped["totalCount"].sum() + res["averageNs"] = res["totalTimeNs"] / res["totalCount"] + res["Q1Ns"] = grouped["Q1Ns"].min() + res["medNs"] = grouped["medNs"].median() + res["Q3Ns"] = grouped["Q3Ns"].max() + res["minNs"] = grouped["minNs"].min() + res["maxNs"] = grouped["maxNs"].max() + res["stdev"] = grouped.apply(lambda x: stdev(x, res)) + min_value = grouped["minNs"].min() + res["minRank"] = grouped.apply( + lambda x: ", ".join( + x.loc[x["minNs"] == min_value.loc[x.name], "rank"].astype(str) + ) + ) + max_value = grouped["maxNs"].max() + res["maxRank"] = grouped.apply( + lambda x: ", ".join( + x.loc[x["maxNs"] == max_value.loc[x.name], "rank"].astype(str) + ) + ) + res = pd.concat(res.values(), axis=1, keys=res.keys()).round(1) + res.sort_values(by="totalTimeNs", ascending=False, inplace=True) + return res + + def save_notebook(self): + self.create_notebook("stats.ipynb") + self.add_helper_file("cluster_display.py") diff --git a/profiler/cluster_analyse/analysis/cann_api_sum/stats.ipynb b/profiler/cluster_analyse/analysis/cann_api_sum/stats.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c97f039c5a01a6e7cce2968d569d79e137e76f8c --- /dev/null +++ b/profiler/cluster_analyse/analysis/cann_api_sum/stats.ipynb @@ -0,0 +1,86 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CANN_API_SUM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import plotly.offline as pyo\n", + "\n", + "from IPython.display import display, HTML\n", + "\n", + "import cluster_display\n", + "\n", + "display(HTML(\"\"))\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_rows', None)\n", + "pyo.init_notebook_mode()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## 集群场景CANN层API统计分析\n", + "该分析脚本展示了集群场景的统计数据分析结果。需要注意以下几点:\n", + "1. 所有的时间信息单位是微秒(us);\n", + "2. Q1表示单个API耗时的25%分位数,最终结果取自所有卡的Q1值中最小值;\n", + "3. Q3表示单个API耗时的75%分位数,最终结果取自所有卡的Q3值中最大值;\n", + "4. 'minRank'展示了API最小耗时所在卡;\n", + "5. 'maxRank'展示了API最大耗时所在卡。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"all_stats.csv\")\n", + "display(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cluster_display.display_box(df, xaxis_title=\"name\", yaxis_title=\"duration (ns)\")\n", + "cluster_display.display_stats_scatter(df, xaxis_title=\"name\", yaxis_title=\"duration (ns)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "per_rank_df = pd.read_csv(\"rank_stats.csv\")\n", + "cluster_display.display_stats_per_operation(per_rank_df, xaxis_title='rank', yaxis_title='duration (ns)')" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/cluster_analyse/analysis/cluster_display.py b/profiler/cluster_analyse/analysis/cluster_display.py new file mode 100644 index 0000000000000000000000000000000000000000..8fc6040ccafae2d069e2e6e394941c7aff83a452 --- /dev/null +++ b/profiler/cluster_analyse/analysis/cluster_display.py @@ -0,0 +1,239 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd +import plotly.graph_objects as go +from IPython.display import display, HTML +from ipywidgets import Dropdown, fixed, interact + + +def get_stats_cols(df): + cols = df.columns.tolist() + q1 = "Q1(Us)" if "Q1(Us)" in cols else "Q1~" + q3 = "Q3(Us)" if "Q3(Us)" in cols else "Q3~" + med = "med(Us)" if "med(Us)" in cols else "med~" + std = "stdev" if "stdev" in cols else "stdev~" + return q1, q3, med, std + + +def display_box(df, x=None, **layout_args): + if x is None: + x = df.columns[0] + q1, q3, med, std = get_stats_cols(df) + fig = go.Figure() + fig.add_trace( + go.Box( + x=df[x], + q1=df[q1], + median=df[med], + q3=df[q3], + sd=df[std], + lowerfence=df["minRank"], + upperfence=df["maxRank"] + ) + ) + fig.update_layout(**layout_args) + fig.show() + + +def display_stats_scatter(df, x=None, **layout_args): + if x is None: + x = df.columns[0] + q1, q3, med, _ = get_stats_cols(df) + fig = go.Figure() + col_names = [q1, med, q3, "minRank", "maxRank"] + for name in col_names: + fig.add_trace( + go.Scatter( + x=df[x], + y=df[name], + name=name + ) + ) + fig.update_layout(**layout_args) + fig.show() + + +def display_table_per_rank(df): + if df.empty: + display(df) + return + + rank_groups = df.groupby("rank") + def display_table(name): + rank_df = rank_groups.get_group(name) + rank_df = rank_df.drop(columns=["rank"]) + display(rank_df) + + dropdown = Dropdown( + options=rank_groups.groups.keys(), + description="rank:", + disabled=False, + ) + interact( + display_table, + name=dropdown + ) + + +def display_stats_per_operation(df, x=None, box=True, scatter=True, table=True, **layout_args): + if df.empty: + display(df) + return + + if x is None: + x = df.columns[0] + + op_groups = df.groupby(x) + + def display_graphs(name): + op_df = op_groups.get_group(name) + if table: + display(op_df.reset_index(drop=True).set_index("rank")) + if box: + display_box(op_df, x=op_df["rank"], **layout_args) + if scatter: + display_stats_scatter(op_df, x=op_df["rank"], **layout_args) + + operations = list(op_groups.groups.keys()) + + if len(operations) > 1: + dropdown = Dropdown( + options=operations, + description="Operation:", + disabled=False, + value=operations[1] + ) + interact( + display_graphs, + name=dropdown + ) + dropdown.value = operations[0] + else: + display_graphs(operations[0]) + + +def display_duration_boxplots(figs, stats_df: pd.DataFrame, orientation="v", title=None, + x_title="Names", y_title="Time", legend_title="Legend"): + mean_ds = stats_df.get("Mean(Us)", None) + min_ds = stats_df.get("Min(Us)", None) + max_ds = stats_df.get("Max(Us)", None) + q1_ds = stats_df.get("Q1(Us)", None) + median_ds = stats_df.get('Median(Us)', None) + q3_ds = stats_df.get('Q3(Us)', None) + return display_boxplot(figs, stats_df.index, min_ds, q1_ds, median_ds, q3_ds, max_ds, mean_ds, + orientation=orientation, title=title, x_title=x_title, y_title=y_title, + legend_title=legend_title) + + +def display_boxplot(figs, x_axis, min_ds, q1_ds, median_ds, q3_ds, max_ds, mean_ds, orientation="v", + title=None, x_title=None, y_title="Time", legend_title="Legend"): + fig = go.Figure() + fig.add_trace( + go.Box( + x=x_axis, + lowerfence=min_ds, + q1=q1_ds, + median=median_ds, + q3=q3_ds, + upperfence=max_ds, + mean=mean_ds + ) + ) + fig.update_traces(orientation=orientation) + fig.update_layout( + xaxis_title=x_title, yaxis_title=y_title, legend_title=legend_title, + title=title, height=1024 + ) + fig.show() + if isinstance(figs, list): + figs.append(fig) + return fig + + +def 
display_graph(figs, x_axis, y_axes, title=None,
+                  x_title=None, y_title=None, legend_title="Legend"):
+    data = None
+    if isinstance(y_axes, pd.DataFrame):
+        data = y_axes.set_index(x_axis)
+    elif isinstance(y_axes, dict):
+        data = pd.DataFrame(y_axes, index=x_axis)
+    elif isinstance(y_axes, pd.Series):
+        data = pd.DataFrame({"": y_axes}, index=x_axis)
+    elif isinstance(y_axes, np.ndarray):
+        data = pd.DataFrame({"": pd.Series(y_axes)}, index=x_axis)
+    else:
+        return
+
+    fig = data.plot.line()
+    fig.update_layout(
+        title=title, xaxis_title=x_title, yaxis_title=y_title, legend_title=legend_title
+    )
+    fig.show()
+    if isinstance(figs, list):
+        figs.append(fig)
+    return fig
+
+
+def display_stats_per_rank_groups_combobox(rank_stats_gdf):
+    names = list(rank_stats_gdf.groups.keys())
+    if len(names) > 1:
+        dropdown = Dropdown(
+            options=names, layout={"width": "max-content"}, value=names[1]
+        )
+        interact(
+            __display_stats_per_rank_group,
+            selected=dropdown,
+            rank_stats_gdf=fixed(rank_stats_gdf)
+        )
+        dropdown.value = names[0]
+    elif len(names) == 1:
+        __display_stats_per_rank_group(names[0], rank_stats_gdf)
+    else:
+        print("cluster_display: rank_stats_gdf has no groups, nothing to display")
+
+
+def __display_stats_per_rank_group(selected, rank_stats_gdf):
+    df = rank_stats_gdf.get_group(selected)
+    df = df.reset_index(drop=True)
+    df = df.set_index(df["Rank"])
+    display(df)
+
+    figs = []
+    display_duration_boxplots(figs, df, x_title="Ranks")
+    display_graph(
+        figs,
+        df.index,
+        df[["Q1(Us)", "Median(Us)", "Q3(Us)"]],
+        title="50% of Distribution",
+        x_title="Ranks"
+    )
+
+
+def display_stats_optional_combobox(options, display_func, args, description="Option:"):
+    if len(options) > 1:
+        dropdown = Dropdown(
+            options=options, layout={"width": "max-content"}, value=options[1],
+            description=description
+        )
+        interact(
+            display_func,
+            selected=dropdown,
+            args=fixed(args)
+        )
+        dropdown.value = options[0]
+    elif len(options) == 1:
+        display_func(options[0], args)
diff --git a/profiler/cluster_analyse/analysis/comm_matrix_analysis.py b/profiler/cluster_analyse/analysis/comm_matrix_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dc04471fe0a164fc859e51597d41028523f7a32
--- /dev/null
+++ b/profiler/cluster_analyse/analysis/comm_matrix_analysis.py
@@ -0,0 +1,106 @@
+import os
+from collections import defaultdict
+
+from analysis.base_analysis import BaseAnalysis
+from common_func.constant import Constant
+from common_func.db_manager import DBManager
+
+
+class CommMatrixAnalysis(BaseAnalysis):
+    SAVED_JSON = "cluster_communication_matrix.json"
+    COMMUNICATION_MATRIX_TABLE = "ClusterCommAnalyzerMatrix"
+
+    def __init__(self, param: dict):
+        super().__init__(param)
+        self.communication_ops = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.MATRIX_OPS)
+
+    @staticmethod
+    def combine_link(link_info_dict: dict, single_link_dict: dict):
+        link_info_dict[Constant.TRANSPORT_TYPE] = single_link_dict.get(Constant.TRANSPORT_TYPE)
+        link_info_dict[Constant.OP_NAME] = single_link_dict.get(Constant.OP_NAME, '')
+        link_info_dict[Constant.TRANSIT_TIME_MS] += single_link_dict.get(Constant.TRANSIT_TIME_MS, 0)
+        link_info_dict[Constant.TRANSIT_SIZE_MB] += single_link_dict.get(Constant.TRANSIT_SIZE_MB, 0)
+
+    def run(self):
+        if not self.communication_ops:
+            return
+        self.split_op_by_group()
+        self.combine_ops_total_info()
+        self.dump_data()
+
+    def dump_db(self):
+        res_comm_matrix = self.adapter.transfer_matrix_from_json_to_db(self.comm_ops_struct)
+        output_path
= os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
+        result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER)
+        DBManager.create_tables(result_db, self.COMMUNICATION_MATRIX_TABLE)
+        conn, cursor = DBManager.create_connect_db(result_db)
+        if res_comm_matrix:
+            res_matrix_value = [list(data.values()) for data in res_comm_matrix]
+            sql = "insert into {} values ({value})".format(self.COMMUNICATION_MATRIX_TABLE,
+                                                           value="?," * (len(res_matrix_value[0]) - 1) + "?")
+            DBManager.executemany_sql(conn, sql, res_matrix_value)
+        DBManager.destroy_db_connect(conn, cursor)
+
+    def compute_total_info(self, step_dict: dict):
+        self.merge_same_links(step_dict)
+        self.combine_link_info(step_dict)
+
+    def merge_same_links(self, step_dict: dict):
+        def process_link_key():
+            for link_key in rank_dict:
+                if '-' not in link_key:
+                    print(f"[WARNING] {op_name} has an invalid link key {link_key}!")
+                    break
+                src_rank = link_key.split('-')[0]
+                dst_rank = link_key.split('-')[1]
+                if src_rank == dst_rank:
+                    if src_rank not in project_local_global_rank_map:
+                        project_local_global_rank_map[src_rank] = rank_id
+                    elif project_local_global_rank_map.get(src_rank) != rank_id:
+                        print(f"[WARNING] In the same communication group, the same local rank is "
+                              f"projected to multiple global ranks!")
+                self.combine_link(link_info[link_key], rank_dict[link_key])
+
+        def convert_local_to_global_rank():
+            tmp_link = {}
+            for link_key, link_dict in link_info.items():
+                src_rank = link_key.split('-')[0]
+                dst_rank = link_key.split('-')[1]
+                src_rank = project_local_global_rank_map[src_rank] \
+                    if src_rank in project_local_global_rank_map else src_rank
+                dst_rank = project_local_global_rank_map[dst_rank] \
+                    if dst_rank in project_local_global_rank_map else dst_rank
+                link_dict[Constant.BANDWIDTH_GB_S] = \
+                    self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0),
+                                       link_dict.get(Constant.TRANSIT_TIME_MS, 0))
+                tmp_link[f"{src_rank}-{dst_rank}"] = link_dict
+            return tmp_link
+
+        project_local_global_rank_map = dict()
+        for op_name, op_dict in step_dict.items():
+            link_info = defaultdict(lambda: {
+                Constant.TRANSPORT_TYPE: '',
+                Constant.TRANSIT_TIME_MS: 0,
+                Constant.TRANSIT_SIZE_MB: 0,
+                Constant.OP_NAME: ''
+            })
+            for rank_id, rank_dict in op_dict.items():
+                process_link_key()
+            step_dict[op_name] = convert_local_to_global_rank()
+
+    def combine_link_info(self, step_dict: dict):
+        total_op_info = defaultdict(lambda: {
+            Constant.TRANSPORT_TYPE: '',
+            Constant.TRANSIT_TIME_MS: 0,
+            Constant.TRANSIT_SIZE_MB: 0,
+            Constant.OP_NAME: ''
+        })
+        for op_name, op_dict in step_dict.items():
+            if self.check_add_op(op_name):
+                for link_key, link_dict in op_dict.items():
+                    self.combine_link(total_op_info[link_key], link_dict)
+        for link_key, link_dict in total_op_info.items():
+            link_dict[Constant.BANDWIDTH_GB_S] = \
+                self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0),
+                                   link_dict.get(Constant.TRANSIT_TIME_MS, 0))
+        step_dict[Constant.TOTAL_OP_INFO] = total_op_info
diff --git a/profiler/cluster_analyse/analysis/communication_analysis.py b/profiler/cluster_analyse/analysis/communication_analysis.py
index 88ac073a9cc899ecfb32378a8aca662de2bfe879..3f0a9b417e211b124b052cb5c5534f2fdbe5302e 100644
--- a/profiler/cluster_analyse/analysis/communication_analysis.py
+++ b/profiler/cluster_analyse/analysis/communication_analysis.py
@@ -1,75 +1,15 @@
-# Copyright (c) 2023, Huawei Technologies Co., Ltd.
-# All rights reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +import os from collections import defaultdict -from abc import abstractmethod +from analysis.base_analysis import BaseAnalysis from common_func.constant import Constant -from common_func.file_manager import FileManager - - -class BaseCommAnalysis: - - def __init__(self, param: dict): - self.collection_path = param.get(Constant.COLLECTION_PATH) - self.data_map = param.get(Constant.DATA_MAP) - self.communication_ops = [] - self.collective_group_dict = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) - self.comm_ops_struct = {} +from common_func.db_manager import DBManager - @staticmethod - def compute_ratio(dividend: float, divisor: float): - if abs(divisor) < Constant.EPS: - return 0 - else: - return round(dividend / divisor, 4) - - @abstractmethod - def run(self): - pass - - def dump_data(self): - if not self.comm_ops_struct: - print("[WARNING] There is no final comm ops data generated") - return - output_comm_data = {} - for key in self.comm_ops_struct: - output_comm_data[str(key)] = self.comm_ops_struct.get(key) - FileManager.create_json_file(self.collection_path, output_comm_data, self.SAVED_JSON) - def split_op_by_group(self): - for single_op in self.communication_ops: - if single_op.get(Constant.COMM_OP_TYPE) == Constant.P2P: - rank_tup = Constant.P2P - else: - rank_tup = tuple(self.collective_group_dict.get(single_op.get(Constant.GROUP_NAME), [])) - rank_id = single_op.get(Constant.RANK_ID, 'N/A') - step_id = single_op.get(Constant.STEP_ID, 'N/A') - op_name = single_op.get(Constant.COMM_OP_NAME, 'N/A') - op_info = single_op.get(Constant.COMM_OP_INFO) - self.comm_ops_struct.setdefault(rank_tup, {}).setdefault(step_id, {}).\ - setdefault(op_name, {}).setdefault(rank_id, op_info) - - def combine_ops_total_info(self): - for rank_tup, group_dict in self.comm_ops_struct.items(): - for step_id, communication_ops in group_dict.items(): - self.compute_total_info(communication_ops) - - -class CommunicationAnalysis(BaseCommAnalysis): +class CommunicationAnalysis(BaseAnalysis): SAVED_JSON = "cluster_communication.json" + COMMUNICATION_BANDWIDTH_TABLE = "ClusterCommAnalyzerBandwidth" + COMMUNICATION_TIME_TABLE = "ClusterCommAnalyzerTime" def __init__(self, param: dict): super().__init__(param) @@ -88,6 +28,23 @@ class CommunicationAnalysis(BaseCommAnalysis): self.combine_ops_total_info() self.dump_data() + def dump_db(self): + res_comm_time, res_comm_bandwidth = self.adapter.transfer_comm_from_json_to_db(self.comm_ops_struct) + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.COMMUNICATION_TIME_TABLE, self.COMMUNICATION_BANDWIDTH_TABLE) + conn, cursor = DBManager.create_connect_db(result_db) + self.execute(conn, res_comm_time, self.COMMUNICATION_TIME_TABLE) + self.execute(conn, res_comm_bandwidth, self.COMMUNICATION_BANDWIDTH_TABLE) + DBManager.destroy_db_connect(conn, 
cursor) + + @staticmethod + def execute(conn, res_data, table_name): + if res_data: + res_value = [list(data.values()) for data in res_data] + sql = "insert into {} values ({value})".format(table_name, value="?," * (len(res_value[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res_value) + def compute_total_info(self, comm_ops: dict): if not comm_ops: return @@ -144,100 +101,3 @@ class CommunicationAnalysis(BaseCommAnalysis): bandwidth_dict[Constant.BANDWIDTH_GB_S] = \ self.compute_ratio(bandwidth_dict.get(Constant.TRANSIT_SIZE_MB, 0), bandwidth_dict.get(Constant.TRANSIT_TIME_MS, 0)) - - -class CommMatrixAnalysis(BaseCommAnalysis): - SAVED_JSON = "cluster_communication_matrix.json" - STAT_LIST = ['middle', 'top', 'bottom', 'total'] - TOTAL = 'total' - - def __init__(self, param: dict): - super().__init__(param) - self.communication_ops = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.MATRIX_OPS) - - @staticmethod - def combine_link(link_info_dict: dict, single_link_dict: dict): - link_info_dict[Constant.TRANSPORT_TYPE] = single_link_dict.get(Constant.TRANSPORT_TYPE) - link_info_dict[Constant.OP_NAME] = single_link_dict.get(Constant.OP_NAME, '') - link_info_dict[Constant.TRANSIT_TIME_MS] += single_link_dict.get(Constant.TRANSIT_TIME_MS, 0) - link_info_dict[Constant.TRANSIT_SIZE_MB] += single_link_dict.get(Constant.TRANSIT_SIZE_MB, 0) - - def run(self): - if not self.communication_ops: - return - self.split_op_by_group() - self.combine_ops_total_info() - self.dump_data() - - def compute_total_info(self, step_dict: dict): - self.merge_same_links(step_dict) - self.combine_link_info(step_dict) - - def merge_same_links(self, step_dict: dict): - def process_link_key(): - for link_key in rank_dict: - if '-' not in link_key: - print(f"[WARNING] {op_name} has an invalid link key {link_key}!") - break - src_rank = link_key.split('-')[0] - dst_rank = link_key.split('-')[1] - if src_rank == dst_rank: - if src_rank not in project_local_global_rank_map: - project_local_global_rank_map[src_rank] = rank_id - elif project_local_global_rank_map.get(src_rank) != rank_id: - print(f"[WARNING] In the same communication group, local ranks projecting to global ranks repeat!") - self.combine_link(link_info[link_key], rank_dict[link_key]) - - def convert_local_to_global_rank(): - tmp_link = {} - for link_key, link_dict in link_info.items(): - src_rank = link_key.split('-')[0] - dst_rank = link_key.split('-')[1] - src_rank = project_local_global_rank_map[src_rank] \ - if src_rank in project_local_global_rank_map else src_rank - dst_rank = project_local_global_rank_map[dst_rank] \ - if dst_rank in project_local_global_rank_map else dst_rank - link_dict[Constant.BANDWIDTH_GB_S] = \ - self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), - link_dict.get(Constant.TRANSIT_TIME_MS, 0)) - tmp_link[f"{src_rank}-{dst_rank}"] = link_dict - return tmp_link - - project_local_global_rank_map = dict() - for op_name, op_dict in step_dict.items(): - link_info = defaultdict(lambda: { - Constant.TRANSPORT_TYPE: '', - Constant.TRANSIT_TIME_MS: 0, - Constant.TRANSIT_SIZE_MB: 0, - Constant.OP_NAME: '' - }) - for rank_id, rank_dict in op_dict.items(): - process_link_key() - step_dict[op_name] = convert_local_to_global_rank() - - def combine_link_info(self, step_dict: dict): - total_op_info = defaultdict(lambda: { - Constant.TRANSPORT_TYPE: '', - Constant.TRANSIT_TIME_MS: 0, - Constant.TRANSIT_SIZE_MB: 0, - Constant.OP_NAME: '' - }) - for op_name, op_dict in step_dict.items(): - if self.check_add_op(op_name): - for 
link_key, link_dict in op_dict.items(): - self.combine_link(total_op_info[link_key], link_dict) - for link_key, link_dict in total_op_info.items(): - link_dict[Constant.BANDWIDTH_GB_S] = \ - self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), - link_dict.get(Constant.TRANSIT_TIME_MS, 0)) - step_dict[Constant.TOTAL_OP_INFO] = total_op_info - - def check_add_op(self: any, op_name: str): - """ - 兼容2个版本,判断是否需要将此算子信息相加 - """ - for stat_name in self.STAT_LIST: - if stat_name in op_name: - if stat_name != self.TOTAL: - return False - return True diff --git a/profiler/cluster_analyse/analysis/compute_op_sum/__init__.py b/profiler/cluster_analyse/analysis/compute_op_sum/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7101187a2c2619f3b1c20dded14b433950b4c662 --- /dev/null +++ b/profiler/cluster_analyse/analysis/compute_op_sum/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse/analysis/compute_op_sum/compute_op_sum.py b/profiler/cluster_analyse/analysis/compute_op_sum/compute_op_sum.py new file mode 100644 index 0000000000000000000000000000000000000000..e71cf868ac9e06785d030a702bf9c8182ae4e948 --- /dev/null +++ b/profiler/cluster_analyse/analysis/compute_op_sum/compute_op_sum.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
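The compute_op_sum recipe below relies on a describe_duration helper to produce the Count/Mean/Std/Min/Q1/Median/Q3/Max/Sum columns listed in its notebook. A rough equivalent with plain pandas, assuming a Duration column in nanoseconds (the helper's real signature and column names may differ):

```python
import pandas as pd

df = pd.DataFrame({
    "OpType": ["MatMul", "MatMul", "Add", "Add"],
    "TaskType": ["AI_CORE"] * 4,
    "Duration": [120.0, 140.0, 30.0, 34.0],   # durations in ns (illustrative)
})

g = df.groupby(["OpType", "TaskType"])["Duration"]
stats = pd.DataFrame({
    "Count": g.count(),
    "MeanNs": g.mean(),
    "StdNs": g.std(),
    "MinNs": g.min(),
    "Q1Ns": g.quantile(0.25),
    "MedianNs": g.median(),
    "Q3Ns": g.quantile(0.75),
    "MaxNs": g.max(),
    "SumNs": g.sum(),
}).sort_values(by="SumNs", ascending=False)   # mirrors the recipe's sort order
print(stats)
```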
+ +import os +import pandas as pd +from analysis.base_analysis import BaseRecipeAnalysis +from common_func.constant import Constant +from common_func.utils import describe_duration +from cluster_statistics_export.compute_op_sum_export import ComputeOpSumExport + + +class ComputeOpSum(BaseRecipeAnalysis): + + TABLE_ALL_RANK_STATS = "ComputeOpAllRankStats" + TABLE_PER_RANK_STATS_BY_OPTYPE = "ComputeOpPerRankStatsByOpType" + TABLE_PER_RANK_STATS_BY_OPNAME = "ComputeOpPerRankStatsByOpName" + + def __init__(self, params): + super().__init__(params) + print("[INFO] ComputeOpSum init.") + self.all_rank_stats = None + self.per_rank_stats_by_optype = None + self.per_rank_stats_by_opname = None + + @property + def base_dir(self): + return os.path.basename(os.path.dirname(__file__)) + + @staticmethod + def _mapper_func(data_map, analysis_class): + df = ComputeOpSumExport(data_map[1], analysis_class).read_export_db() + + if df is None or df.empty: + print(f"[WARNING] There is no stats data in {data_map[1]}.") + return None + + df["Rank"] = data_map[0] + return df + + def mapper_func(self, context): + return context.wait( + context.map( + self._mapper_func, + self._get_rank_db(), + analysis_class=self._recipe_name + ) + ) + + def reducer_func(self, mapper_res): + mapper_res = list(filter(lambda df: df is not None, mapper_res)) + if not mapper_res: + print("[ERROR] Mapper data is None.") + return + # get per rank stats by optype + self.per_rank_stats_by_optype = pd.concat( + describe_duration(df.groupby(["OpType", "TaskType"])["Duration"]).assign(Rank=df["Rank"][0]) for df in mapper_res) + self.per_rank_stats_by_optype.sort_values(by=["SumNs"], inplace=True, ascending=False) + + # get all rank stats by optype + all_op_data = pd.concat(mapper_res) + self.all_rank_stats = describe_duration(all_op_data.groupby(["OpType", "TaskType"])["Duration"]) + self.all_rank_stats.sort_values(by=["SumNs"], inplace=True, ascending=False) + + # get per rank stats by opname + self.per_rank_stats_by_opname = pd.concat( + describe_duration(df.groupby(["OpName", "OpType", "TaskType", "InputShapes"])["Duration"]).assign(Rank=df["Rank"][0]) for df in mapper_res) + self.per_rank_stats_by_opname.sort_values(by=["SumNs"], inplace=True, ascending=False) + + def run(self, context): + super().run(context) + mapper_res = self.mapper_func(context) + self.reducer_func(mapper_res) + + if self._export_type == "db": + self.save_db() + elif self._export_type == "notebook": + self.save_notebook() + else: + print("[ERROR] Unknown export type.") + + def save_notebook(self): + self.dump_data(self.all_rank_stats, os.path.join(self._get_output_dir(), "all_stats.csv")) + self.dump_data(self.per_rank_stats_by_optype, os.path.join(self._get_output_dir(), "rank_stats_by_optype.csv")) + self.dump_data(self.per_rank_stats_by_opname, os.path.join(self._get_output_dir(), "rank_stats_by_opname.csv")) + self.create_notebook("stats.ipynb") + self.add_helper_file("cluster_display.py") + + def save_db(self): + self.dump_data(self.all_rank_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_ALL_RANK_STATS) + self.dump_data(self.per_rank_stats_by_optype, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_PER_RANK_STATS_BY_OPTYPE) + self.dump_data(self.per_rank_stats_by_opname, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_PER_RANK_STATS_BY_OPNAME) diff --git a/profiler/cluster_analyse/analysis/compute_op_sum/stats.ipynb b/profiler/cluster_analyse/analysis/compute_op_sum/stats.ipynb new file mode 100644 index 
0000000000000000000000000000000000000000..c88d2684c1f8822818f62005355c444332aaa915
--- /dev/null
+++ b/profiler/cluster_analyse/analysis/compute_op_sum/stats.ipynb
@@ -0,0 +1,164 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Compute Op Summary\n",
+    "\n",
+    "Cluster-wide analysis of compute operator data\n",
+    "\n",
+    "Covers the following three statistics:\n",
+    "1. Duration statistics of compute operators across the whole cluster, grouped by operator type and task type\n",
+    "2. Duration statistics of compute operators on each rank, grouped by operator type and task type\n",
+    "3. Duration statistics of compute operators on each rank, grouped by operator name, task type and input shapes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Data preparation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import display, HTML\n",
+    "display(HTML(\"\"))\n",
+    "\n",
+    "import plotly.offline as pyo\n",
+    "\n",
+    "def is_lab_notebook():\n",
+    "    import re\n",
+    "    import psutil\n",
+    "    return any(re.search('jupyter--lab-script', x) for x in psutil.Process().parent().cmdline())\n",
+    "\n",
+    "if is_lab_notebook():\n",
+    "    pyo.init_notebook_mode()\n",
+    "\n",
+    "import pandas as pd\n",
+    "pd.options.plotting.backend = \"plotly\"\n",
+    "pd.set_option(\"display.max_rows\", 100)\n",
+    "pd.set_option(\"display.width\", 1000)\n",
+    "\n",
+    "import cluster_display\n",
+    "\n",
+    "all_stats_df = pd.read_csv(\"all_stats.csv\", index_col=\"OpType\")\n",
+    "rank_stats_by_optype_df = pd.read_csv(\"rank_stats_by_optype.csv\", index_col=\"OpType\")\n",
+    "rank_stats_by_opname_df = pd.read_csv(\"rank_stats_by_opname.csv\", index_col=\"OpName\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compute operator duration analysis\n",
+    "\n",
+    "Aggregates the compute operators of all ranks in the cluster by operator type and task type and analyzes their durations; times are in microseconds (us)\n",
+    "\n",
+    "Includes the following statistics:\n",
+    "- Count: number of operators\n",
+    "- Mean: average duration\n",
+    "- Std: standard deviation\n",
+    "- Min: minimum\n",
+    "- Q1: first quartile\n",
+    "- Median: median\n",
+    "- Q3: third quartile\n",
+    "- Max: maximum\n",
+    "- Sum: total duration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "display(all_stats_df)\n",
+    "fig_all_rank = cluster_display.display_duration_boxplots(None, all_stats_df, x_title=\"OpType\")\n",
+    "fig_per_rank = cluster_display.display_graph(None, all_stats_df.index, all_stats_df[[\"Q1(Us)\", \"Median(Us)\", \"Q3(Us)\"]], title=\"50% of Distribution\", x_title=\"OpType\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Per-rank compute operator duration analysis by operator type\n",
+    "Aggregates the compute operators of each rank in the cluster by operator type and task type and analyzes their durations; times are in microseconds (us)\n",
+    "\n",
+    "Includes the following statistics:\n",
+    "- Count: number of operators\n",
+    "- Mean: average duration\n",
+    "- Std: standard deviation\n",
+    "- Min: minimum\n",
+    "- Q1: first quartile\n",
+    "- Median: median\n",
+    "- Q3: third quartile\n",
+    "- Max: maximum\n",
+    "- Sum: total duration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rank_stats_gdf = rank_stats_by_optype_df.groupby(rank_stats_by_optype_df.index)\n",
+    "cluster_display.display_stats_per_rank_groups_combobox(rank_stats_gdf)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Per-rank compute operator duration analysis by operator name\n",
+    "\n",
+    "Aggregates the compute operators of each rank in the cluster by operator name, task type and input shapes and analyzes their durations; times are in microseconds (us)\n",
+    "\n",
+    "Includes the following statistics:\n",
+    "- Count: number of operators\n",
+    "- Mean: average duration\n",
+    "- Std: standard deviation\n",
+    "- Min: minimum\n",
+    "- Q1: first quartile\n",
+    "- Median: median\n",
+    "- Q3: third quartile\n",
+    "- Max: maximum\n",
+    "- Sum: total duration"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rank_stats_gdf =
rank_stats_by_opname_df.groupby(rank_stats_by_opname_df.index)\n", + "cluster_display.display_stats_per_rank_groups_combobox(rank_stats_gdf)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/cluster_analyse/analysis/hccl_sum/__init__.py b/profiler/cluster_analyse/analysis/hccl_sum/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7101187a2c2619f3b1c20dded14b433950b4c662 --- /dev/null +++ b/profiler/cluster_analyse/analysis/hccl_sum/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse/analysis/hccl_sum/hccl_sum.py b/profiler/cluster_analyse/analysis/hccl_sum/hccl_sum.py new file mode 100644 index 0000000000000000000000000000000000000000..da0c575e4683f1c51c4cf38e89b9c096c484777e --- /dev/null +++ b/profiler/cluster_analyse/analysis/hccl_sum/hccl_sum.py @@ -0,0 +1,133 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
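The hccl_sum recipe below picks the top-N operators by mean duration and attributes each operator's fastest and slowest occurrence to a rank. A small pandas sketch of the same idea (data and column names are illustrative, not the recipe's exact schema):

```python
import pandas as pd

ops = pd.DataFrame({
    "OpName": ["allreduce_1", "allreduce_1", "broadcast_2", "broadcast_2"],
    "Rank": [0, 1, 0, 1],
    "Duration": [500.0, 800.0, 120.0, 90.0],
})

grouped = ops.groupby("OpName")["Duration"]
top = grouped.mean().nlargest(1).to_frame("MeanNs")   # top-1 here; the recipe defaults to 15

# Attribute the fastest and slowest occurrence of each top op to a rank.
for op_name in top.index:
    g = ops[ops["OpName"] == op_name]
    top.loc[op_name, "MinRank"] = g.loc[g["Duration"].idxmin(), "Rank"]
    top.loc[op_name, "MaxRank"] = g.loc[g["Duration"].idxmax(), "Rank"]
print(top)
```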
+ +import os +import pandas as pd +from analysis.base_analysis import BaseRecipeAnalysis +from common_func.constant import Constant +from common_func.utils import describe_duration +from cluster_statistics_export.hccl_sum_export import HcclSumExport + + +class HcclSum(BaseRecipeAnalysis): + + TABLE_ALL_RANK_STATS = "HcclAllRankStats" + TABLE_PER_RANK_STATS = "HcclPerRankStats" + TABLE_TOP_OP_STATS = "HcclTopOpStats" + + TOP_NUM = "top_num" + DEFAULT_TOP_NUM = 15 + + def __init__(self, params): + super().__init__(params) + print("[INFO] HcclSum init.") + self.per_rank_stats = None + self.all_rank_stats = None + self.top_op_stats = None + self.top_num = params.get(self.TOP_NUM, self.DEFAULT_TOP_NUM) + + @property + def base_dir(self): + return os.path.basename(os.path.dirname(__file__)) + + @staticmethod + def _mapper_func(data_map, analysis_class): + df = HcclSumExport(data_map[1], analysis_class).read_export_db() + + if df is None or df.empty: + print(f"[WARNING] There is no stats data in {data_map[1]}.") + return None + + df["Rank"] = data_map[0] + return df + + def mapper_func(self, context): + return context.wait( + context.map( + self._mapper_func, + self._get_rank_db(), + analysis_class=self._recipe_name + ) + ) + + def reducer_func(self, mapper_res): + mapper_res = list(filter(lambda df: df is not None, mapper_res)) + if not mapper_res: + print("[ERROR] Mapper data is None.") + return + self.per_rank_stats = pd.concat( + describe_duration(df.groupby("OpType")["Duration"]).assign(Rank=df["Rank"][0]) for df in mapper_res) + self.per_rank_stats.sort_values(by=["Rank"], inplace=True) + all_op_data = pd.concat(mapper_res) + self.all_rank_stats = describe_duration(all_op_data.groupby("OpType")["Duration"]) + grouped_op_stats = all_op_data.groupby("OpName") + self.top_op_stats = describe_duration(grouped_op_stats["Duration"]).nlargest(self.top_num, "MeanNs") + min_rank = [] + max_rank = [] + for op_name in self.top_op_stats.index: + df = grouped_op_stats.get_group(op_name) + min_rank.append(df[df["Duration"] == df["Duration"].min()]["Rank"].values[0]) + max_rank.append(df[df["Duration"] == df["Duration"].max()]["Rank"].values[0]) + self.top_op_stats["MinRank"] = min_rank + self.top_op_stats["MaxRank"] = max_rank + + def run(self, context): + super().run(context) + if self.top_num <= 0: + print(f"[WARNING] HcclSum: top_num is set to a invalid value, " + f"it will be reset to default value({self.DEFAULT_TOP_NUM}).") + self.top_num = self.DEFAULT_TOP_NUM + mapper_res = self.mapper_func(context) + self.reducer_func(mapper_res) + + if self._export_type == "db": + self.save_db() + elif self._export_type == "notebook": + self.save_notebook() + else: + print("[ERROR] Unknown export type.") + + def save_notebook(self): + self.dump_data(self.all_rank_stats, os.path.join(self._get_output_dir(), "all_stats.csv")) + self.dump_data(self.per_rank_stats, os.path.join(self._get_output_dir(), "rank_stats.csv")) + self.dump_data(self.top_op_stats, os.path.join(self._get_output_dir(), "top_op_stats.csv")) + self.create_notebook("stats.ipynb") + self.add_helper_file("cluster_display.py") + + def save_db(self): + self.dump_data(self.all_rank_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_ALL_RANK_STATS) + self.dump_data(self.per_rank_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_PER_RANK_STATS) + self.dump_data(self.top_op_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_TOP_OP_STATS) + + @classmethod + def add_parser_argument(cls, parser): + 
BaseRecipeAnalysis.add_parser_argument(parser) + parser.add_argument("--top_num", type=int, help="Number of top operators, ranked by average duration", default=cls.DEFAULT_TOP_NUM) + + @classmethod + def parse_argument(cls, args_parsed) -> dict: + argument_dict = BaseRecipeAnalysis.parse_argument(args_parsed) + argument_dict.update({ + cls.TOP_NUM: args_parsed.top_num + }) + return argument_dict + + @classmethod + def get_extra_argument(cls, params) -> dict: + argument_dict = BaseRecipeAnalysis.get_extra_argument(params) + argument_dict.update({ + cls.TOP_NUM: params.get(cls.TOP_NUM, cls.DEFAULT_TOP_NUM) + }) + return argument_dict diff --git a/profiler/cluster_analyse/analysis/hccl_sum/stats.ipynb b/profiler/cluster_analyse/analysis/hccl_sum/stats.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..87f8c6d736240531e2c28c0cf33df087ecfe38e8 --- /dev/null +++ b/profiler/cluster_analyse/analysis/hccl_sum/stats.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HCCL Summary\n", + "\n", + "Cluster-level analysis of HCCL communication operators\n", + "\n", + "It covers the following three statistics:\n", + "1. Cluster-wide communication operator durations, grouped by operator type\n", + "2. Per-rank communication operator durations, grouped by operator type\n", + "3. The TOP communication operators with the longest average duration across the cluster" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display, HTML\n", + "display(HTML(\"\"))\n", + "\n", + "import plotly.offline as pyo\n", + "\n", + "def is_lab_notebook():\n", + " import re\n", + " import psutil\n", + " return any(re.search('jupyter--lab-script', x) for x in psutil.Process().parent().cmdline())\n", + "\n", + "if is_lab_notebook():\n", + " pyo.init_notebook_mode()\n", + "\n", + "import pandas as pd\n", + "pd.options.plotting.backend = \"plotly\"\n", + "pd.set_option(\"display.max_rows\", 100)\n", + "pd.set_option(\"display.width\", 1000)\n", + "\n", + "import cluster_display\n", + "\n", + "all_stats_df = pd.read_csv(\"all_stats.csv\", index_col=\"OpType\")\n", + "rank_stats_df = pd.read_csv(\"rank_stats.csv\", index_col=\"OpType\")\n", + "top_op_stats_df = pd.read_csv(\"top_op_stats.csv\", index_col=\"OpName\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cluster Communication Operator Duration Analysis\n", + "\n", + "Aggregates the communication operators of every rank in the cluster, grouped by operator type; durations are in microseconds (us)\n", + "\n", + "Statistics:\n", + "- Count: number of operators\n", + "- Mean: average duration\n", + "- Std: standard deviation\n", + "- Min: minimum\n", + "- Q1: first quartile\n", + "- Median: median\n", + "- Q3: third quartile\n", + "- Max: maximum\n", + "- Sum: total duration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(all_stats_df)\n", + "fig_all_rank = cluster_display.display_duration_boxplots(None, all_stats_df, x_title=\"Hccl OpType\")\n", + "fig_per_rank = cluster_display.display_graph(None, all_stats_df.index, all_stats_df[[\"Q1(Us)\", \"Median(Us)\", \"Q3(Us)\"]], title=\"50% of Distribution\", x_title=\"Hccl OpType\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Per-Rank Communication Operator Duration Analysis\n", + "\n", + "Aggregates the communication operators of each rank separately, grouped by operator type; durations are in microseconds (us)\n", + "\n", + "Statistics:\n", + "- Count: number of operators\n", + "- Mean: average duration\n", + "- Std: standard deviation\n", + "- Min: minimum\n", + "- Q1: first quartile\n", + "- Median: median\n", + "- Q3: third quartile\n", + "- Max: maximum\n", + "- Sum: total duration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rank_stats_gdf = rank_stats_df.groupby(rank_stats_df.index)\n", + 
"cluster_display.display_stats_per_rank_groups_combobox(rank_stats_gdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 集群TOP-N通信算子耗时分析\n", + "\n", + "统计集群内耗时最多的TOP-N通信算子,时间单位为微秒(us)\n", + "\n", + "包含以下统计项:\n", + "- Count:算子数量\n", + "- Mean:平均耗时\n", + "- Std:标准差\n", + "- Min:最小值\n", + "- Q1:四分之一分位数\n", + "- Median:中位数\n", + "- Q3:四分之三分位数\n", + "- Max:最大值\n", + "- Sum:总耗时\n", + "- MinRank:耗时最少算子所在的Rank\n", + "- MaxRank:耗时最长算子所在的Rank" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(top_op_stats_df)\n", + "fig_top_op = cluster_display.display_duration_boxplots(None, top_op_stats_df, x_title=\"Hccl OpName\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/cluster_analyse/analysis/host_info_analysis.py b/profiler/cluster_analyse/analysis/host_info_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..563711080ed3a20923ce73ec595b84892492e9f6 --- /dev/null +++ b/profiler/cluster_analyse/analysis/host_info_analysis.py @@ -0,0 +1,96 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from analysis.base_analysis import BaseAnalysis +from common_func.constant import Constant +from common_func.db_manager import DBManager + + +class HostInfoAnalysis(BaseAnalysis): + + TABLE_HOST_INFO = "HOST_INFO" + TABLE_RANK_DEVICE_MAP = "RANK_DEVICE_MAP" + + def __init__(self, param: dict): + super().__init__(param) + self.all_rank_host_info = {} + self.all_rank_device_info = [] + + def run(self): + if self.data_type != Constant.DB: + return + self.analyze_host_info() + self.dump_db() + + def dump_db(self): + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + conn, curs = DBManager.create_connect_db(result_db) + if not (conn and curs): + print(f"[ERROR] Failed to create db {Constant.DB_CLUSTER_COMMUNICATION_ANALYZER}") + return + self.dump_host_info(result_db, conn) + self.dump_rank_device_map(result_db, conn) + DBManager.destroy_db_connect(conn, curs) + + def dump_host_info(self, result_db, db_conn): + if not self.all_rank_host_info: + print("[WARNING] No host info data to analyze.") + return + DBManager.create_tables(result_db, Constant.TABLE_HOST_INFO) + save_host_info = list(self.all_rank_host_info.items()) + sql = "insert into {} values ({value})".format(Constant.TABLE_HOST_INFO, + value="?," * (len(save_host_info[0]) - 1) + "?") + DBManager.executemany_sql(db_conn, sql, save_host_info) + + def dump_rank_device_map(self, result_db, db_conn): + if not self.all_rank_device_info: + print("[WARNING] No rank device map data to analyze.") + return + self.all_rank_device_info.sort() + DBManager.create_tables(result_db, Constant.TABLE_RANK_DEVICE_MAP) + sql = "insert into {} values ({value})".format(Constant.TABLE_RANK_DEVICE_MAP, + value="?," * (len(self.all_rank_device_info[0]) - 1) + "?") + DBManager.executemany_sql(db_conn, sql, self.all_rank_device_info) + + def analyze_host_info(self): + print_empty_host_info = "" + for rank_id, profiling_dir in self.data_map.items(): + host_info = [] + rank_device_info = [] + db_path = os.path.join(profiling_dir, Constant.SINGLE_OUTPUT, f"ascend_pytorch_profiler_{rank_id}.db") + if (os.path.exists(db_path) and DBManager.check_tables_in_db(db_path, self.TABLE_HOST_INFO)): + conn, curs = DBManager.create_connect_db(db_path) + sql = "select * from {0}".format(self.TABLE_HOST_INFO) + host_info = DBManager.fetch_all_data(curs, sql, is_dict=False) + DBManager.destroy_db_connect(conn, curs) + if not (host_info and host_info[0]): + if not print_empty_host_info: + print_empty_host_info = f"[WARNING] No {self.TABLE_HOST_INFO} data in {self.data_type} file."
+ continue + if (os.path.exists(db_path) and DBManager.check_tables_in_db(db_path, self.TABLE_RANK_DEVICE_MAP)): + conn, curs = DBManager.create_connect_db(db_path) + sql = "select * from {0}".format(self.TABLE_RANK_DEVICE_MAP) + rank_device_info = DBManager.fetch_all_data(curs, sql, is_dict=False) + DBManager.destroy_db_connect(conn, curs) + host_uid, host_name = host_info[0][0], host_info[0][1] + for idx, data in enumerate(rank_device_info): + rank_device_info[idx] = list(data) + [host_uid, ] + self.all_rank_host_info[host_uid] = host_name + self.all_rank_device_info.extend(rank_device_info) + if print_empty_host_info: + print(print_empty_host_info) diff --git a/profiler/cluster_analyse/analysis/mstx_sum/__init__.py b/profiler/cluster_analyse/analysis/mstx_sum/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7101187a2c2619f3b1c20dded14b433950b4c662 --- /dev/null +++ b/profiler/cluster_analyse/analysis/mstx_sum/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse/analysis/mstx_sum/mstx_sum.py b/profiler/cluster_analyse/analysis/mstx_sum/mstx_sum.py new file mode 100644 index 0000000000000000000000000000000000000000..46a0e18abeee5cdd6b058d71e3a1bd2b97e7c29d --- /dev/null +++ b/profiler/cluster_analyse/analysis/mstx_sum/mstx_sum.py @@ -0,0 +1,204 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
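+ +# Matching model: mark messages pair up per thread id as a stack -- a +# "<msg>_start" row pushes its index, the next "<msg>_stop" with the same tid +# and msg pops it, and leftovers are reported as mismatches. For example +# (hypothetical messages), "fwd_start"/"fwd_stop" form one MarkInfo, while a +# lone "bwd_start" only produces a warning.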
+ +import os +import pandas as pd +from collections import namedtuple +from analysis.base_analysis import BaseRecipeAnalysis +from common_func.constant import Constant +from common_func.utils import describe_duration +from cluster_statistics_export.mstx_mark_export import MstxMarkExport +from cluster_statistics_export.mstx_step_export import MstxStepExport + + +MarkInfo = namedtuple("MarkInfo", ["name", "framework_duration", "cann_duration", "device_duration", + "tid", "start_ns"]) + + +def format_mark_info(df: pd.DataFrame, start_idx, stop_idx, name) -> MarkInfo: + start_series = df.iloc[start_idx] + stop_series = df.iloc[stop_idx] + return MarkInfo( + name=name, + framework_duration=float(stop_series["framework_ts"]-start_series["framework_ts"]), + cann_duration=float(stop_series["cann_ts"]-start_series["cann_ts"]), + device_duration=float(stop_series["device_ts"]-start_series["device_ts"]), + tid=start_series["tid"], + start_ns=start_series["cann_ts"] + ) + + +def rename_mark_msg_name(mark_stats_df: pd.DataFrame): + msg_idx_counter = {} + for idx, mark_info in enumerate(mark_stats_df.itertuples(index=False)): + msg_idx_counter.setdefault(mark_info.step_id, {}).setdefault(mark_info.name, []).append(idx) + for msg_dict in msg_idx_counter.values(): + for msg, idx_list in msg_dict.items(): + if len(idx_list) <= 1: + continue + for i, idx in enumerate(idx_list): + mark_stats_df.loc[idx, 'name'] = f"{msg}_{i}" + + +def compute_step_id(mark_stat, step_stats_df: pd.DataFrame): + for step_info in step_stats_df.itertuples(index=False): + if step_info.start_ns <= mark_stat.start_ns <= step_info.end_ns: + return step_info.step_id + print(f"[WARNING] {mark_stat.name} is not in any step.") + return 0 + + +def format_columns(df: pd.DataFrame): + formatted_df = df.rename( + { + "framework_duration": "FrameworkDurationNs", + "cann_duration": "CannDurationNs", + "device_duration": "DeviceDurationNs", + "duration": "DurationNs", + "step_id": "StepId", + "tid": "Tid", + "name": "Name" + }, + axis="columns" + ) + cols = [col for col in formatted_df.columns if not col.endswith("_ns") and col not in {"Tid"}] + return formatted_df[cols] + + +class MstxSum(BaseRecipeAnalysis): + + TABLE_FRAMEWORK_STATS = "MSTXAllFrameworkStats" + TABLE_CANN_STATS = "MSTXAllCannStats" + TABLE_DEVICE_STATS = "MSTXAllDeviceStats" + TABLE_MARK_STATS = "MSTXMarkStats" + + START_SUFFIX = "_start" + STOP_SUFFIX = "_stop" + + def __init__(self, params): + super().__init__(params) + print("[INFO] MstxSum init.") + self.mark_stats = None + self.all_fwk_stats = None + self.all_cann_stats = None + self.all_device_stats = None + + @property + def base_dir(self): + return os.path.basename(os.path.dirname(__file__)) + + @staticmethod + def _mapper_func(data_map, analysis_class): + step_df = MstxStepExport(data_map[1], analysis_class).read_export_db() + if step_df is None or step_df.empty: + step_df = pd.DataFrame({"start_ns": [0], "end_ns": [float("inf")], "step_id": [0]}) + mark_df = MstxMarkExport(data_map[1], analysis_class).read_export_db() + if mark_df is None or mark_df.empty: + print(f"[WARNING] There is no mark data in {data_map[1]}.") + return None + mark_df["framework_ts"] = mark_df["framework_ts"].astype("int64") + + mark_info = {} + mark_res = [] + mismatch_msg = [] + for idx, row in enumerate(mark_df.itertuples(index=False)): + if row.msg.endswith(MstxSum.START_SUFFIX): + msg = row.msg[:-len(MstxSum.START_SUFFIX)] + mark_info.setdefault(row.tid, {}).setdefault(msg, []).append(idx) + elif row.msg.endswith(MstxSum.STOP_SUFFIX): + 
msg = row.msg[:-len(MstxSum.STOP_SUFFIX)] + idx_list = mark_info.get(row.tid, {}).get(msg, []) + if not idx_list: + mismatch_msg.append((row.msg, idx)) + continue + start_idx = idx_list.pop() + mark_res.append(format_mark_info(mark_df, start_idx, idx, msg)) + + # Collect mark messages that were never matched + for msg_info in mark_info.values(): + for msg, idx_list in msg_info.items(): + if not idx_list: + continue + mismatch_msg.extend((msg + MstxSum.START_SUFFIX, idx) for idx in idx_list) + if mismatch_msg: + mismatch_msg.sort(key=lambda msg: msg[1]) + print(f"[WARNING] The following mark messages have no matching start/stop in " + f"rank {data_map[0]}: {','.join(msg[0] for msg in mismatch_msg)}.") + + mark_stats_df = pd.DataFrame(mark_res).assign(Rank=data_map[0]) + mark_stats_df["step_id"] = mark_stats_df.apply(compute_step_id, axis=1, step_stats_df=step_df) + rename_mark_msg_name(mark_stats_df) + mark_stats_df = format_columns(mark_stats_df).set_index("Name", drop=True) + return mark_stats_df + + def mapper_func(self, context): + return context.wait( + context.map( + self._mapper_func, + self._get_rank_db(), + analysis_class=self._recipe_name + ) + ) + + def reducer_func(self, mapper_res): + mapper_res = list(filter(lambda df: df is not None, mapper_res)) + if not mapper_res: + print("[ERROR] Mapper data is None.") + return + self.mark_stats = pd.concat(mapper_res) + all_fwk_stats = [] + all_cann_stats = [] + all_device_stats = [] + mark_step_df = self.mark_stats.groupby("StepId") + for step_id, df in mark_step_df: + name_gdf = df.groupby("Name") + fwk_stats = describe_duration(name_gdf["FrameworkDurationNs"]).assign(StepId=step_id) + fwk_stats.sort_values(by=["SumNs"], inplace=True, ascending=False) + all_fwk_stats.append(fwk_stats) + cann_stats = describe_duration(name_gdf["CannDurationNs"]).assign(StepId=step_id) + cann_stats.sort_values(by=["SumNs"], inplace=True, ascending=False) + all_cann_stats.append(cann_stats) + device_stats = describe_duration(name_gdf["DeviceDurationNs"]).assign(StepId=step_id) + device_stats.sort_values(by=["SumNs"], inplace=True, ascending=False) + all_device_stats.append(device_stats) + self.all_fwk_stats = pd.concat(all_fwk_stats) + self.all_cann_stats = pd.concat(all_cann_stats) + self.all_device_stats = pd.concat(all_device_stats) + + def run(self, context): + super().run(context) + mapper_res = self.mapper_func(context) + self.reducer_func(mapper_res) + + if self._export_type == "db": + self.save_db() + elif self._export_type == "notebook": + self.save_notebook() + else: + print("[ERROR] Unknown export type.") + + def save_notebook(self): + self.dump_data(self.mark_stats, os.path.join(self._get_output_dir(), "mark_stats.csv")) + self.dump_data(self.all_fwk_stats, os.path.join(self._get_output_dir(), "all_fwk_stats.csv")) + self.dump_data(self.all_cann_stats, os.path.join(self._get_output_dir(), "all_cann_stats.csv")) + self.dump_data(self.all_device_stats, os.path.join(self._get_output_dir(), "all_device_stats.csv")) + self.create_notebook("stats.ipynb") + self.add_helper_file("cluster_display.py") + + def save_db(self): + self.dump_data(self.mark_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_MARK_STATS) + self.dump_data(self.all_fwk_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_FRAMEWORK_STATS) + self.dump_data(self.all_cann_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_CANN_STATS) + self.dump_data(self.all_device_stats, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, self.TABLE_DEVICE_STATS) diff --git 
a/profiler/cluster_analyse/analysis/mstx_sum/stats.ipynb b/profiler/cluster_analyse/analysis/mstx_sum/stats.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..84672bc72b97b02717c3a4110ab1b4dd827adafd --- /dev/null +++ b/profiler/cluster_analyse/analysis/mstx_sum/stats.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MSTX Summary\n", + "\n", + "Cluster-level analysis of MSTX marker data\n", + "\n", + "It covers the following two statistics:\n", + "1. Cluster-wide MSTX marker statistics, grouped by Step\n", + "2. Per-rank MSTX marker statistics, grouped by Name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display, HTML\n", + "display(HTML(\"\"))\n", + "\n", + "import plotly.offline as pyo\n", + "\n", + "def is_lab_notebook():\n", + " import re\n", + " import psutil\n", + " return any(re.search('jupyter--lab-script', x) for x in psutil.Process().parent().cmdline())\n", + "\n", + "if is_lab_notebook():\n", + " pyo.init_notebook_mode()\n", + "\n", + "import pandas as pd\n", + "pd.options.plotting.backend = \"plotly\"\n", + "pd.set_option(\"display.max_rows\", 100)\n", + "pd.set_option(\"display.width\", 1000)\n", + "\n", + "import cluster_display\n", + "\n", + "all_fwk_stats_gdf = pd.read_csv(\"all_fwk_stats.csv\", index_col=\"Name\").groupby(\"StepId\")\n", + "all_cann_stats_gdf = pd.read_csv(\"all_cann_stats.csv\", index_col=\"Name\").groupby(\"StepId\")\n", + "all_device_stats_gdf = pd.read_csv(\"all_device_stats.csv\", index_col=\"Name\").groupby(\"StepId\")\n", + "mark_stats_df = pd.read_csv(\"mark_stats.csv\", index_col=\"Name\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cluster MSTX Data Analysis\n", + "\n", + "Aggregates the MSTX marker data of every rank in the cluster, split by Step; durations are in microseconds (us)\n", + "Marker data comes in three kinds:\n", + "1. Framework-side duration: Framework Time\n", + "2. CANN-side duration: Cann Time\n", + "3. Device-side duration: Device Time\n", + "\n", + "All three include the following statistics:\n", + "- Count: number of markers\n", + "- Mean: average duration\n", + "- Std: standard deviation\n", + "- Min: minimum\n", + "- Q1: first quartile\n", + "- Median: median\n", + "- Q3: third quartile\n", + "- Max: maximum\n", + "- Sum: total duration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def display_stats_mstx_step_combobox(selected, args):\n", + " step = selected\n", + " fwk_stats_gdf, cann_stats_gdf, device_stats_gdf = args\n", + " fwk_df = fwk_stats_gdf.get_group(step)\n", + " cann_df = cann_stats_gdf.get_group(step)\n", + " device_df = device_stats_gdf.get_group(step)\n", + " figs = []\n", + " display(HTML(\"
Framework Time Stats\"))\n", + " display(fwk_df)\n", + " cluster_display.display_duration_boxplots(figs, fwk_df, title=\"Framework Time\", x_title=\"Name\", y_title=\"Time\")\n", + " display(HTML(\"Cann Time Stats\"))\n", + " display(cann_df)\n", + " cluster_display.display_duration_boxplots(figs, cann_df, title=\"Cann Time\", x_title=\"Name\", y_title=\"Time\")\n", + " display(HTML(\"Device Time Stats\"))\n", + " display(device_df)\n", + " cluster_display.display_duration_boxplots(figs, device_df, title=\"Device Time\", x_title=\"Name\", y_title=\"Time\")\n", + "\n", + "steps = list(all_fwk_stats_gdf.groups.keys())\n", + "if steps:\n", + " cluster_display.display_stats_optional_combobox(steps, display_stats_mstx_step_combobox, \n", + " [all_fwk_stats_gdf, all_cann_stats_gdf, all_device_stats_gdf], \"Step:\")\n", + "else:\n", + " print(\"There is no step in stats, so no need to display\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Per-Rank MSTX Data Analysis\n", + "\n", + "Aggregates the MSTX data of each rank, grouped by marker Name; durations are in microseconds (us)\n", + "\n", + "Columns:\n", + "- Name: marker name\n", + "- FrameworkDuration(Us): framework-side duration\n", + "- CannDuration(Us): CANN-side duration\n", + "- DeviceDuration(Us): device-side duration\n", + "- Rank: rank id\n", + "- StepId: step id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def display_mstx_duration_by_rank(selected, args):\n", + " mark_stats_gdf = args\n", + " df = mark_stats_gdf.get_group(selected).sort_values(\"Rank\")\n", + " display(df)\n", + " fwk_duration = []\n", + " cann_duration = []\n", + " device_duration = []\n", + " step_ids = []\n", + " for step_id, step_df in df.groupby(\"StepId\"):\n", + " fwk_duration.append((step_id, step_df[\"FrameworkDuration(Us)\"].values))\n", + " cann_duration.append((step_id, step_df[\"CannDuration(Us)\"].values))\n", + " device_duration.append((step_id, step_df[\"DeviceDuration(Us)\"].values))\n", + " step_ids.append(step_id)\n", + " fwk_df = pd.concat([pd.Series(dur, name=step_id) for step_id, dur in fwk_duration], axis=1)\n", + " cann_df = pd.concat([pd.Series(dur, name=step_id) for step_id, dur in cann_duration], axis=1)\n", + " device_df = pd.concat([pd.Series(dur, name=step_id) for step_id, dur in device_duration], axis=1)\n", + " figs = []\n", + " ranks = df[\"Rank\"].drop_duplicates()\n", + " cluster_display.display_graph(figs, ranks, fwk_df[step_ids],\n", + " title=\"Framework Time\", x_title=\"Rank\", y_title=\"Time\", legend_title=\"Step\")\n", + " cluster_display.display_graph(figs, ranks, cann_df[step_ids],\n", + " title=\"Cann Time\", x_title=\"Rank\", y_title=\"Time\", legend_title=\"Step\")\n", + " cluster_display.display_graph(figs, ranks, device_df[step_ids],\n", + " title=\"Device Time\", x_title=\"Rank\", y_title=\"Time\", legend_title=\"Step\")\n", + "\n", + "mark_stats_gdf = mark_stats_df.groupby(mark_stats_df.index)\n", + "names = list(mark_stats_gdf.groups.keys())\n", + "if names:\n", + " cluster_display.display_stats_optional_combobox(names, display_mstx_duration_by_rank, mark_stats_gdf, \"Name:\")\n", + "else:\n", + " print(\"There is no mark name in stats, so no need to display\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py index d24a7f1fe635e62c0857e276578463539a61ee76..6a886fffa97b142e8267066117f561154d85b162 100644 --- a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py +++ b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py @@ -14,8 +14,8 @@ # limitations under the License. 
import os -from collections import defaultdict +from common_func.db_manager import DBManager from common_func.constant import Constant from common_func.file_manager import FileManager from prof_bean.step_trace_time_bean import StepTraceTimeBean @@ -23,6 +23,7 @@ from prof_bean.step_trace_time_bean import StepTraceTimeBean class StepTraceTimeAnalysis: CLUSTER_TRACE_TIME_CSV = "cluster_step_trace_time.csv" + CLUSTER_TRACE_TIME_TABLE = "ClusterStepTraceTime" def __init__(self, param: dict): self.collection_path = param.get(Constant.COLLECTION_PATH) @@ -30,6 +31,7 @@ class StepTraceTimeAnalysis: self.communication_group = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COMMUNICATION_GROUP) self.step_time_dict = {} self.step_data_list = [] + self.data_type = param.get(Constant.DATA_TYPE) @staticmethod def get_max_data_row(data_group_list: list): @@ -51,21 +53,51 @@ def dump_data(self): if not self.step_data_list: print("[WARNING] Can't get step time info!") - headers = self.get_headers() - FileManager.create_csv_file(self.collection_path, self.step_data_list, self.CLUSTER_TRACE_TIME_CSV, headers) + return + if self.data_type == Constant.TEXT: + headers = self.get_headers() + FileManager.create_csv_file(self.collection_path, self.step_data_list, self.CLUSTER_TRACE_TIME_CSV, headers) + else: + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.CLUSTER_TRACE_TIME_TABLE) + column_len = DBManager.get_table_column_count(result_db, self.CLUSTER_TRACE_TIME_TABLE) + data_len = len(self.step_data_list[0]) + if data_len < column_len: + for data in self.step_data_list: + data.extend([0] * (column_len - data_len)) + conn, cursor = DBManager.create_connect_db(result_db) + sql = "insert into {} values ({value})".format(self.CLUSTER_TRACE_TIME_TABLE, + value="?," * (len(self.step_data_list[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, self.step_data_list) + DBManager.destroy_db_connect(conn, cursor) def load_step_trace_time_data(self): for rank_id, profiling_dir_path in self.data_map.items(): - step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV) - if step_time_file: - self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean) + if self.data_type == Constant.TEXT: + step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV) + if os.path.exists(step_time_file): + self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean) + else: + step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, + Constant.DB_COMMUNICATION_ANALYZER) + if (os.path.exists(step_time_file) and + DBManager.check_tables_in_db(step_time_file, Constant.TABLE_STEP_TRACE)): + conn, cursor = DBManager.create_connect_db(step_time_file) + sql = "select * from {0}".format(Constant.TABLE_STEP_TRACE) + data = DBManager.fetch_all_data(cursor, sql, is_dict=False) + self.step_time_dict[rank_id] = data + DBManager.destroy_db_connect(conn, cursor) if not self.step_time_dict.get(rank_id): - print(f"[WARNING] Rank {rank_id} does not have a valid step_trace_time.json.") + print(f"[WARNING] Rank {rank_id} does not have valid step_trace_time data in the {self.data_type} file.") def analyze_step_time(self): for rank_id, data_bean_list in self.step_time_dict.items(): for data_bean in data_bean_list: -
self.step_data_list.append([data_bean.step, Constant.RANK, rank_id] + data_bean.row) + if self.data_type == Constant.TEXT: + self.step_data_list.append([data_bean.step, Constant.RANK, rank_id] + data_bean.row) + else: + self.step_data_list.append([data_bean[0], Constant.RANK, rank_id] + list(data_bean[1:])) stage_list = self.communication_group.get(Constant.P2P) if not stage_list: return @@ -80,7 +112,11 @@ class StepTraceTimeAnalysis: step_group_dict.setdefault(key, []).append(data_list[3:]) for key, data_group_list in step_group_dict.items(): - self.step_data_list.append([key[0], Constant.STAGE, key[1]] + self.get_max_data_row(data_group_list)) + if self.data_type == Constant.TEXT: + self.step_data_list.append([key[0], Constant.STAGE, key[1]] + self.get_max_data_row(data_group_list)) + else: + index = "(" + ",".join(str(i) for i in key[1]) + ")" + self.step_data_list.append([key[0], Constant.STAGE, index] + self.get_max_data_row(data_group_list)) def get_headers(self): if self.step_time_dict: diff --git a/profiler/cluster_analyse/cluster_analysis.py b/profiler/cluster_analyse/cluster_analysis.py index e07cac170300650bbf735f7e302b33377dd30a5e..a8d01dcfe348be6b47c0a71099cedab64b6b3e06 100644 --- a/profiler/cluster_analyse/cluster_analysis.py +++ b/profiler/cluster_analyse/cluster_analysis.py @@ -22,13 +22,42 @@ from communication_group.communication_group_generator import CommunicationGroup from common_func.constant import Constant from common_func.file_manager import FileManager from common_func.path_manager import PathManager +from common_func import analysis_loader from analysis.analysis_facade import AnalysisFacade +COMM_FEATURE_LIST = ['all', 'communication_time', 'communication_matrix'] +ALL_FEATURE_LIST = ['all', 'communication_time', 'communication_matrix', 'cann_api_sum', 'hccl_sum', 'compute_op_sum', + 'mstx_sum'] + + +def get_analysis_args(analysis_class, analysis_args): + parser = argparse.ArgumentParser(description="custom analysis args") + parser.add_argument("--parallel_mode", type=str, help="context mode", default="concurrent") + parser.add_argument("--export_type", type=str, help="export type", default="db") + analysis_class[1].add_parser_argument(parser) + return parser.parse_args(analysis_args) + +def parse_specific_params(analysis_name, analysis_args): + analysis_class = analysis_loader.get_class_from_name(analysis_name) + if not analysis_class: + print("[ERROR] undefined analysis.") + return None + + args_parsed = get_analysis_args(analysis_class, analysis_args) + specific_params = { + Constant.RECIPE_NAME: analysis_class[0], + Constant.RECIPE_CLASS: analysis_class[1], + Constant.PARALLEL_MODE: args_parsed.parallel_mode, + Constant.EXPORT_TYPE: args_parsed.export_type + } + specific_params.update(analysis_class[1].parse_argument(args_parsed)) + return specific_params class Interface: ASCEND_PT = "ascend_pt" ASCEND_MS = "ascend_ms" + def __init__(self, params: dict): self.collection_path = PathManager.get_realpath(params.get(Constant.COLLECTION_PATH)) self.analysis_mode = params.get(Constant.ANALYSIS_MODE) @@ -37,6 +66,7 @@ class Interface: self.collective_group_dict = {} self.communication_ops = [] self.matrix_ops = [] + self.origin_params = params def allocate_prof_data(self): ascend_pt_dirs = [] @@ -47,39 +77,72 @@ class Interface: ascend_pt_dirs.append(os.path.join(root, dir_name)) if dir_name.endswith(self.ASCEND_MS): ascend_ms_dirs.append(os.path.join(root, dir_name)) - pt_data_map = PytorchDataPreprocessor(ascend_pt_dirs).get_data_map() + pytorch_processor = 
PytorchDataPreprocessor(ascend_pt_dirs) + pt_data_map = pytorch_processor.get_data_map() + data_type = pytorch_processor.get_data_type() ms_data_map = MindsporeDataPreprocessor(ascend_ms_dirs).get_data_map() if pt_data_map and ms_data_map: print("[ERROR] Can not analyze pytorch and mindspore meantime.") - return[] + return [] + return (pt_data_map, data_type) if pt_data_map else (ms_data_map, Constant.TEXT) def run(self): PathManager.check_input_directory_path(self.collection_path) PathManager.check_path_owner_consistent(self.collection_path) - FileManager.create_output_dir(self.collection_path) - data_map = self.allocate_prof_data() + data_map, data_type = self.allocate_prof_data() if not data_map: print("[WARNING] Can not get rank info or profiling data.") return - params = { - Constant.COLLECTION_PATH: self.collection_path, - Constant.DATA_MAP: data_map, - Constant.ANALYSIS_MODE: self.analysis_mode - } - comm_data_dict = CommunicationGroupGenerator(params).generate() - params[Constant.COMM_DATA_DICT] = comm_data_dict - AnalysisFacade(params).cluster_analyze() + if data_type == Constant.INVALID: + print("[ERROR] The current folder contains both DB and other files. Please check.") + return + if self.analysis_mode not in COMM_FEATURE_LIST: + if data_type != Constant.DB: + print("[ERROR] The current analysis mode only supports DB as input data. Please check.") + return + FileManager.create_output_dir(self.collection_path, is_overwrite=True) + params = { + Constant.COLLECTION_PATH: self.collection_path, + Constant.DATA_MAP: data_map, + Constant.DATA_TYPE: data_type, + Constant.RECIPE_NAME: self.origin_params.get(Constant.RECIPE_NAME, ""), + Constant.RECIPE_CLASS: self.origin_params.get(Constant.RECIPE_CLASS), + Constant.PARALLEL_MODE: self.origin_params.get(Constant.PARALLEL_MODE, ""), + Constant.EXPORT_TYPE: self.origin_params.get(Constant.EXPORT_TYPE, "") + } + params.update(params[Constant.RECIPE_CLASS].get_extra_argument(self.origin_params)) + AnalysisFacade(params).recipe_analyze() + else: + FileManager.create_output_dir(self.collection_path) + params = { + Constant.COLLECTION_PATH: self.collection_path, + Constant.DATA_MAP: data_map, + Constant.ANALYSIS_MODE: self.analysis_mode, + Constant.DATA_TYPE: data_type + } + comm_data_dict = CommunicationGroupGenerator(params).generate() + params[Constant.COMM_DATA_DICT] = comm_data_dict + AnalysisFacade(params).cluster_analyze() -if __name__ == "__main__": +def cluster_analysis_main(args=None): parser = argparse.ArgumentParser(description="cluster analysis module") parser.add_argument('-d', '--collection_path', type=str, required=True, help="profiling data path") - parser.add_argument('-m', '--mode', choices=['all', 'communication_time', 'communication_matrix'], + parser.add_argument('-m', '--mode', choices=ALL_FEATURE_LIST, default='all', help="different analysis mode") - args_parsed = parser.parse_args() + args_parsed, args_remained = parser.parse_known_args(args=args) parameter = { Constant.COLLECTION_PATH: args_parsed.collection_path, Constant.ANALYSIS_MODE: args_parsed.mode } + if args_parsed.mode in COMM_FEATURE_LIST: + if args_remained: + print(f"[ERROR] The specific argument {args_remained} is not supported for communication analysis.") + return + else: + specific_params = parse_specific_params(args_parsed.mode, args_remained) + if not specific_params: + return + parameter.update(specific_params) Interface(parameter).run() + + +if __name__ == "__main__": + cluster_analysis_main() diff --git 
a/profiler/cluster_analyse/cluster_data_preprocess/data_preprocessor.py b/profiler/cluster_analyse/cluster_data_preprocess/data_preprocessor.py index ebc9647c208b05f51698563b8dabb7d13c28c7ec..72d65ae6571e68564e46f43463843d1f46a3a69e 100644 --- a/profiler/cluster_analyse/cluster_data_preprocess/data_preprocessor.py +++ b/profiler/cluster_analyse/cluster_data_preprocess/data_preprocessor.py @@ -12,15 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import os from abc import abstractmethod class DataPreprocessor: - def __init__(self, collection_path: str): - self.collection_path = collection_path + PROFILER_INFO_HEAD = 'profiler_info_' + PROFILER_INFO_EXTENSION = '.json' + + def __init__(self, path_list: list): + self.path_list = path_list self.data_map = {} @abstractmethod - def input_data(self): + def get_data_map(self): pass + + def get_rank_id(self, dir_name: str) -> int: + files = os.listdir(dir_name) + for file_name in files: + if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): + rank_id_str = file_name[len(self.PROFILER_INFO_HEAD): -1 * len(self.PROFILER_INFO_EXTENSION)] + try: + rank_id = int(rank_id_str) + except ValueError: + rank_id = -1 + return rank_id + return -1 diff --git a/profiler/cluster_analyse/cluster_data_preprocess/mindspore_data_preprocessor.py b/profiler/cluster_analyse/cluster_data_preprocess/mindspore_data_preprocessor.py index 85debdd31bb07cf96b91c12eb731cc00b00fcaa3..a3e09983ddb54b972a9e343c1661b5c8b2cbb8c8 100644 --- a/profiler/cluster_analyse/cluster_data_preprocess/mindspore_data_preprocessor.py +++ b/profiler/cluster_analyse/cluster_data_preprocess/mindspore_data_preprocessor.py @@ -14,17 +14,14 @@ # limitations under the License. 
from collections import defaultdict -import os -from common_func.file_manager import FileManager -from common_func.path_manager import PathManager +from cluster_data_preprocess.data_preprocessor import DataPreprocessor -class MindsporeDataPreprocessor: - PROFILER_INFO_HEAD = 'profiler_info_' - PROFILER_INFO_EXTENSION = '.json' - def __init__(self, path_list: str): - self.path_list = path_list +class MindsporeDataPreprocessor(DataPreprocessor): + + def __init__(self, path_list: list): + super().__init__(path_list) def get_data_map(self) -> dict: rank_id_map = defaultdict(list) @@ -35,23 +32,10 @@ class MindsporeDataPreprocessor: continue rank_id_map[rank_id].append(dir_name) - ret_dict = dict() try: for (rank_id, dir_list) in rank_id_map.items(): dir_list.sort(key=lambda x: x.split('_')[-3]) - ret_dict[rank_id] = dir_list[0] + self.data_map[rank_id] = dir_list[0] except Exception as e: raise RuntimeError("Found invalid directory name!") from e - return ret_dict - - def get_rank_id(self, dir_name: str) -> int: - files = os.listdir(dir_name) - for file_name in files: - if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): - rank_id_str = file_name[len(self.PROFILER_INFO_HEAD): -1 * len(self.PROFILER_INFO_EXTENSION)] - try: - rank_id = int(rank_id_str) - except ValueError: - rank_id = -1 - return rank_id - return -1 + return self.data_map diff --git a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py index f1e4c062a7c05656980f0767a3180154e91942ae..55c3d03958b97c427fe8fde0625e72ea4dee8997 100644 --- a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py +++ b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py @@ -12,19 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import glob from collections import defaultdict import os + +from cluster_data_preprocess.data_preprocessor import DataPreprocessor +from common_func.constant import Constant from common_func.file_manager import FileManager -from common_func.path_manager import PathManager -class PytorchDataPreprocessor: - PROFILER_INFO_HEAD = 'profiler_info_' - PROFILER_INFO_EXTENSION = '.json' +class PytorchDataPreprocessor(DataPreprocessor): - def __init__(self, path_list: str): - self.path_list = path_list + def __init__(self, path_list: list): + super().__init__(path_list) + self.data_type = set() def get_data_map(self) -> dict: rank_id_map = defaultdict(list) @@ -33,25 +34,23 @@ class PytorchDataPreprocessor: if rank_id < 0: print('[Error]fail to get rankid or rankid invalid.') continue + for file_name in os.listdir(dir_name): + if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): + file_path = os.path.join(dir_name, file_name) + config = FileManager.read_json_file(file_path) + self.data_type.add(config.get(Constant.CONFIG, {}).get(Constant.EXPER_CONFIG, {}). 
+ get(Constant.EXPORT_TYPE, Constant.TEXT)) rank_id_map[rank_id].append(dir_name) - ret_dict = dict() try: for (rank_id, dir_list) in rank_id_map.items(): dir_list.sort(key=lambda x: x.split('_')[-3]) - ret_dict[rank_id] = dir_list[0] + self.data_map[rank_id] = dir_list[0] except Exception as e: raise RuntimeError("Found invalid directory name!") from e - return ret_dict + return self.data_map - def get_rank_id(self, dir_name: str) -> int: - files = os.listdir(dir_name) - for file_name in files: - if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): - rank_id_str = file_name[len(self.PROFILER_INFO_HEAD): -1 * len(self.PROFILER_INFO_EXTENSION)] - try: - rank_id = int(rank_id_str) - except ValueError: - rank_id = -1 - return rank_id - return -1 + def get_data_type(self): + if len(self.data_type) == 1: + return self.data_type.pop() + return Constant.INVALID diff --git a/profiler/cluster_analyse/cluster_kernels_analysis/__init__.py b/profiler/cluster_analyse/cluster_kernels_analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/cluster_analyse/cluster_statistics_export/__init__.py b/profiler/cluster_analyse/cluster_statistics_export/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7101187a2c2619f3b1c20dded14b433950b4c662 --- /dev/null +++ b/profiler/cluster_analyse/cluster_statistics_export/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/profiler/cluster_analyse/cluster_statistics_export/cann_api_sum_export.py b/profiler/cluster_analyse/cluster_statistics_export/cann_api_sum_export.py new file mode 100644 index 0000000000000000000000000000000000000000..578ee937be57ff8615085bbe1e4ac6ccae81a4e9 --- /dev/null +++ b/profiler/cluster_analyse/cluster_statistics_export/cann_api_sum_export.py @@ -0,0 +1,65 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
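+ +# Note: median/stdev/lower_quartile/upper_quartile below are not built-in +# SQLite aggregates; the query assumes the connection created by DBManager +# registers them. durationRatio is each API's percentage of the total +# CANN_API time, e.g. 25.00 means a quarter of the summed duration.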
+ +from cluster_statistics_export.stats_export import StatsExport + +QUERY = """ +WITH + summary as ( + SELECT + name, + sum(endNs - startNs) AS duration, + count (*) AS num, + avg(endNs - startNs) AS avg_duration, + min(endNs - startNs) AS min_duration, + median(endNs - startNs) AS med_duration, + max(endNs - startNs) AS max_duration, + stdev(endNs - startNs) AS stdev_duration, + lower_quartile(endNs - startNs) AS lower_quartile_duration, + upper_quartile(endNs - startNs) AS upper_quartile_duration + FROM + CANN_API + GROUP BY name + ), + totals AS ( + SELECT sum(duration) AS total + FROM summary + ) +SELECT + ids.value AS "name", + round(summary.duration * 100.0 / (SELECT total FROM totals), 2) AS "durationRatio", + summary.duration AS "totalTimeNs", + summary.num AS "totalCount", + round(summary.avg_duration, 1) AS "averageNs", + round(summary.min_duration, 1) AS "minNs", + round(summary.lower_quartile_duration, 1) AS "Q1Ns", + round(summary.med_duration, 1) AS "medNs", + round(summary.upper_quartile_duration, 1) AS "Q3Ns", + round(summary.max_duration, 1) AS "maxNs", + round(summary.stdev_duration, 1) AS "stdev" +FROM + summary +LEFT JOIN + STRING_IDS AS ids + ON ids.id == summary.name +ORDER BY 2 DESC; + """ + + +class CannApiSumExport(StatsExport): + + def __init__(self, db_path, recipe_name): + super().__init__(db_path, recipe_name) + self._query = QUERY diff --git a/profiler/cluster_analyse/cluster_statistics_export/compute_op_sum_export.py b/profiler/cluster_analyse/cluster_statistics_export/compute_op_sum_export.py new file mode 100644 index 0000000000000000000000000000000000000000..d70c696100bc305f8b1e182f7b1f915cf58f274a --- /dev/null +++ b/profiler/cluster_analyse/cluster_statistics_export/compute_op_sum_export.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
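+ +# Shape of the result: one row per compute task, with the numeric string ids +# in COMPUTE_TASK_INFO resolved through STRING_IDS. An illustrative row +# (values are made up): +#     OpName="MatMul_1", OpType="MatMul", TaskType="AI_CORE", InputShapes="16,32;32,64", Duration=4096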
+ +from cluster_statistics_export.stats_export import StatsExport + + +QUERY = """ +SELECT + NAME_IDS.value AS "OpName", + OPTYPE_IDS.value AS "OpType", + TASKTYPE_IDS.value AS "TaskType", + INPUTSHAPES_IDS.value AS "InputShapes", + round(TASK.endNs - TASK.startNs) AS "Duration" +FROM + COMPUTE_TASK_INFO +LEFT JOIN TASK + ON TASK.globalTaskId == COMPUTE_TASK_INFO.globalTaskId +LEFT JOIN + STRING_IDS AS NAME_IDS + ON NAME_IDS.id == COMPUTE_TASK_INFO.name +LEFT JOIN + STRING_IDS AS OPTYPE_IDS + ON OPTYPE_IDS.id == COMPUTE_TASK_INFO.opType +LEFT JOIN + STRING_IDS AS TASKTYPE_IDS + ON TASKTYPE_IDS.id == COMPUTE_TASK_INFO.taskType +LEFT JOIN + STRING_IDS AS INPUTSHAPES_IDS + ON INPUTSHAPES_IDS.id == COMPUTE_TASK_INFO.inputShapes + """ + + +class ComputeOpSumExport(StatsExport): + + def __init__(self, db_path, recipe_name): + super().__init__(db_path, recipe_name) + self._query = QUERY diff --git a/profiler/cluster_analyse/cluster_statistics_export/hccl_sum_export.py b/profiler/cluster_analyse/cluster_statistics_export/hccl_sum_export.py new file mode 100644 index 0000000000000000000000000000000000000000..f695949de1a92e9a1faff593bc45e52f91582242 --- /dev/null +++ b/profiler/cluster_analyse/cluster_statistics_export/hccl_sum_export.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cluster_statistics_export.stats_export import StatsExport + + +QUERY = """ +SELECT + NAME_IDS.value AS "OpName", + TYPE_IDS.value AS "OpType", + round(endNs - startNs) AS "Duration" +FROM + COMMUNICATION_OP +LEFT JOIN + STRING_IDS AS TYPE_IDS + ON TYPE_IDS.id == COMMUNICATION_OP.opType +LEFT JOIN + STRING_IDS AS NAME_IDS + ON NAME_IDS.id == COMMUNICATION_OP.opName + """ + + +class HcclSumExport(StatsExport): + + def __init__(self, db_path, recipe_name): + super().__init__(db_path, recipe_name) + self._query = QUERY diff --git a/profiler/cluster_analyse/cluster_statistics_export/mstx_mark_export.py b/profiler/cluster_analyse/cluster_statistics_export/mstx_mark_export.py new file mode 100644 index 0000000000000000000000000000000000000000..ac5355c020042d474963296242b79eb3fd6a8c38 --- /dev/null +++ b/profiler/cluster_analyse/cluster_statistics_export/mstx_mark_export.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
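+ +# Correlation model: a single mark contributes up to three timestamps joined +# on connectionId -- framework_ts (PYTORCH_API), cann_ts (MSTX_EVENTS) and +# device_ts (TASK) -- which mstx_sum.py later differences into framework, +# cann and device durations.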
+ +from cluster_statistics_export.stats_export import StatsExport + + +QUERY = """ +WITH + FRAMEWORK_API AS ( + SELECT + PYTORCH_API.startNs, + CONNECTION_IDS.connectionId + FROM + PYTORCH_API + LEFT JOIN + CONNECTION_IDS + ON PYTORCH_API.connectionId == CONNECTION_IDS.id + ) +SELECT + MSG_IDS.value AS "msg", + MSTX_EVENTS.startNs AS "cann_ts", + TASK.startNs AS "device_ts", + FRAMEWORK_API.startNs AS "framework_ts", + MSTX_EVENTS.globalTid AS "tid" +FROM + MSTX_EVENTS +LEFT JOIN + TASK + ON MSTX_EVENTS.connectionId == TASK.connectionId +LEFT JOIN + FRAMEWORK_API + ON MSTX_EVENTS.connectionId == FRAMEWORK_API.connectionId +LEFT JOIN + STRING_IDS AS MSG_IDS + ON MSTX_EVENTS.message == MSG_IDS.id +ORDER BY + MSTX_EVENTS.startNs + """ + + +class MstxMarkExport(StatsExport): + + def __init__(self, db_path, recipe_name): + super().__init__(db_path, recipe_name) + self._query = QUERY diff --git a/profiler/cluster_analyse/cluster_statistics_export/mstx_step_export.py b/profiler/cluster_analyse/cluster_statistics_export/mstx_step_export.py new file mode 100644 index 0000000000000000000000000000000000000000..c257ce675fe46ea0f7eff2489dd2fe13c846564f --- /dev/null +++ b/profiler/cluster_analyse/cluster_statistics_export/mstx_step_export.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cluster_statistics_export.stats_export import StatsExport + + +QUERY = """ +SELECT + id AS "step_id", + startNs AS "start_ns", + endNs AS "end_ns" +FROM + STEP_TIME +ORDER BY + startNs + """ + + +class MstxStepExport(StatsExport): + + def __init__(self, db_path, recipe_name): + super().__init__(db_path, recipe_name) + self._query = QUERY diff --git a/profiler/cluster_analyse/cluster_statistics_export/stats_export.py b/profiler/cluster_analyse/cluster_statistics_export/stats_export.py new file mode 100644 index 0000000000000000000000000000000000000000..e6d98f48ef8c4e8032f7611dac163ead3cc5fbe0 --- /dev/null +++ b/profiler/cluster_analyse/cluster_statistics_export/stats_export.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
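+ +# Usage sketch (the db path is an example): subclasses only set self._query, +# then, e.g., +#     df = HcclSumExport("ascend_pytorch_profiler_0.db", "HcclSum").read_export_db() +# read_export_db runs the query with pd.read_sql on the analysis connection +# and returns a DataFrame.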
+ +import pandas as pd + +from common_func.db_manager import DBManager +from common_func.constant import Constant + + +class StatsExport: + + def __init__(self, db_path, analysis_class): + self._db_path = db_path + self._analysis_class = analysis_class + self._query = None + + def get_query(self): + return self._query + + def read_export_db(self): + query = self.get_query() + if query is None: + print(f"[ERROR] query is None.") + return + conn, cursor = DBManager.create_connect_db(self._db_path, Constant.ANALYSIS) + data = pd.read_sql(query, conn) + DBManager.destroy_db_connect(conn, cursor) + return data diff --git a/profiler/cluster_analyse/cluster_utils/__init__.py b/profiler/cluster_analyse/cluster_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/cluster_analyse/cluster_utils/data_transfer_adapter.py b/profiler/cluster_analyse/cluster_utils/data_transfer_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..1f306415fa789ae0dab7d8751b1c240b3433de0d --- /dev/null +++ b/profiler/cluster_analyse/cluster_utils/data_transfer_adapter.py @@ -0,0 +1,142 @@ +import copy + +from common_func.constant import Constant +from common_func.table_constant import TableConstant + + +class DataTransferAdapter(object): + COMM_TIME_TABLE_COLUMN = [TableConstant.START_TIMESTAMP, TableConstant.ELAPSED_TIME, TableConstant.TRANSIT_TIME, + TableConstant.WAIT_TIME, TableConstant.SYNCHRONIZATION_TIME, TableConstant.IDLE_TIME, + TableConstant.SYNCHRONIZATION_TIME_RATIO, TableConstant.WAIT_TIME_RATIO] + COMM_TIME_JSON_COLUMN = [Constant.START_TIMESTAMP, Constant.ELAPSE_TIME_MS, Constant.TRANSIT_TIME_MS, + Constant.WAIT_TIME_MS, Constant.SYNCHRONIZATION_TIME_MS, Constant.IDLE_TIME_MS, + Constant.SYNCHRONIZATION_TIME_RATIO, Constant.WAIT_TIME_RATIO] + MATRIX_TABLE_COLUMN = [TableConstant.TRANSIT_SIZE, TableConstant.TRANSIT_TIME, TableConstant.BANDWIDTH, + TableConstant.TRANSPORT_TYPE, TableConstant.OPNAME] + MATRIX_JSON_COLUMN = [Constant.TRANSIT_SIZE_MB, Constant.TRANSIT_TIME_MS, Constant.BANDWIDTH_GB_S, + Constant.TRANSPORT_TYPE, Constant.OP_NAME] + COMM_BD_TABLE_COLUMN = [TableConstant.TRANSIT_SIZE, TableConstant.TRANSIT_TIME, TableConstant.BANDWIDTH, + TableConstant.LARGE_PACKET_RATIO] + COMM_BD_JSON_COLUMN = [Constant.TRANSIT_SIZE_MB, Constant.TRANSIT_TIME_MS, Constant.BANDWIDTH_GB_S, + Constant.LARGE_PACKET_RATIO] + + def __init__(self): + super().__init__() + + def transfer_comm_from_db_to_json(self, time_info: list, bandwidth_info: list): + result = {} + if not time_info and not bandwidth_info: + return result + for time_data in time_info: + comm_time = dict() + hccl_name = time_data[TableConstant.HCCL_OP_NAME] + "@" + time_data[TableConstant.GROUP_NAME] + for key, value in dict(zip(self.COMM_TIME_JSON_COLUMN, self.COMM_TIME_TABLE_COLUMN)).items(): + if not key.endswith("ratio"): + comm_time[key] = time_data.get(value, 0) + result.setdefault(time_data[TableConstant.STEP], {}).setdefault(time_data[TableConstant.TYPE], {}). 
\ + setdefault(hccl_name, {})[Constant.COMMUNICATION_TIME_INFO] = comm_time + hccl_set = set() + for bd_data in bandwidth_info: + hccl_name = bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME] + hccl_set.add(hccl_name) + for hccl in hccl_set: + comm_bd = dict() + for bd_data in bandwidth_info: + if hccl == (bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME]): + temp_dict = dict() + key_dict = dict(zip(self.COMM_BD_JSON_COLUMN, self.COMM_BD_TABLE_COLUMN)) + self.set_value_by_key(temp_dict, bd_data, key_dict) + comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], temp_dict).setdefault( + Constant.SIZE_DISTRIBUTION, {})[bd_data[TableConstant.PACKAGE_SIZE]] = \ + [bd_data[TableConstant.COUNT], bd_data[TableConstant.TOTAL_DURATION]] + result.setdefault(bd_data[TableConstant.STEP], {}).setdefault(bd_data[TableConstant.TYPE], {}). \ + setdefault(hccl, {})[Constant.COMMUNICATION_BANDWIDTH_INFO] = comm_bd + return result + + def transfer_comm_from_json_to_db(self, res_data: dict): + res_comm_data, res_bd_data = list(), list() + + def split_comm_time(): + for rank_id, comm_data in op_data.items(): + time_data = comm_data.get(Constant.COMMUNICATION_TIME_INFO) + res_time = set_only_value(rank_id) + for key, value in dict(zip(self.COMM_TIME_TABLE_COLUMN, self.COMM_TIME_JSON_COLUMN)).items(): + res_time[key] = time_data.get(value, 0) + res_comm_data.append(res_time) + bd_data = comm_data.get(Constant.COMMUNICATION_BANDWIDTH_INFO, {}) + for transport_type, data in bd_data.items(): + res_bandwidth = set_only_value(rank_id) + key_dict = dict(zip(self.COMM_BD_TABLE_COLUMN, self.COMM_BD_JSON_COLUMN)) + res_bandwidth[TableConstant.TRANSPORT_TYPE] = transport_type + self.set_value_by_key(res_bandwidth, data, key_dict) + for key, value in data.get(Constant.SIZE_DISTRIBUTION, {}).items(): + res_bandwidth[TableConstant.PACKAGE_SIZE] = key + res_bandwidth[TableConstant.COUNT] = value[0] + res_bandwidth[TableConstant.TOTAL_DURATION] = value[1] + temp_dict = copy.deepcopy(res_bandwidth) + res_bd_data.append(temp_dict) + + def set_only_value(rank_id): + res_dict = dict() + res_dict[TableConstant.RANK_SET] = str(rank_set) + res_dict[TableConstant.STEP] = step + res_dict[TableConstant.RANK_ID] = rank_id + res_dict[TableConstant.HCCL_OP_NAME] = op_name.split("@")[0] if "@" in op_name else op_name + res_dict[TableConstant.GROUP_NAME] = op_name.split("@")[1] if "@" in op_name else "" + return res_dict + + for rank_set, step_dict in res_data.items(): + for step, op_dict in step_dict.items(): + for op_name, op_data in op_dict.items(): + split_comm_time() + return res_comm_data, res_bd_data + + def set_value_by_key(self, src_dict, dst_dict, key_dict): + for key, value in key_dict.items(): + src_dict[key] = dst_dict.get(value, 0) + + def transfer_matrix_from_db_to_json(self, matrix_data: list): + result = {} + if not matrix_data: + return result + hccl_set = set() + for data in matrix_data: + hccl = data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME] + hccl_set.add(hccl) + for hccl in hccl_set: + for data in matrix_data: + if hccl == (data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME]): + key = data[TableConstant.SRC_RANK] + '-' + data[TableConstant.DST_RANK] + temp_dict = dict() + key_dict = dict(zip(self.MATRIX_JSON_COLUMN, self.MATRIX_TABLE_COLUMN)) + self.set_value_by_key(temp_dict, data, key_dict) + result.setdefault(data[TableConstant.STEP], {}).setdefault(data[TableConstant.TYPE], {}). 
+                        setdefault(hccl, {}).setdefault(key, temp_dict)
+        return result
+
+    def transfer_matrix_from_json_to_db(self, res_data: dict):
+        result = list()
+
+        def split_matrix_data():
+            for op_name, op_data in op_dict.items():
+                for link_key, link_data in op_data.items():
+                    if "@" in op_name:
+                        hccl_op_name, group_name = op_name.split("@")[0], op_name.split("@")[1]
+                    else:
+                        hccl_op_name, group_name = op_name, ""
+                    matrix_data = {
+                        TableConstant.RANK_SET: str(rank_set),
+                        TableConstant.STEP: step,
+                        TableConstant.HCCL_OP_NAME: hccl_op_name,
+                        TableConstant.GROUP_NAME: group_name,
+                        TableConstant.SRC_RANK: link_key.split("-")[0],
+                        TableConstant.DST_RANK: link_key.split("-")[1]
+                    }
+                    key_dict = dict(zip(self.MATRIX_TABLE_COLUMN, self.MATRIX_JSON_COLUMN))
+                    self.set_value_by_key(matrix_data, link_data, key_dict)
+                    result.append(matrix_data)
+
+        for rank_set, step_dict in res_data.items():
+            for step, op_dict in step_dict.items():
+                split_matrix_data()
+        return result
diff --git a/profiler/cluster_analyse/common_func/analysis_loader.py b/profiler/cluster_analyse/common_func/analysis_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..55e7dbc6ea930de7a47799384ffad5daa1328da2
--- /dev/null
+++ b/profiler/cluster_analyse/common_func/analysis_loader.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+import sys
+
+from common_func.constant import Constant
+from analysis.base_analysis import BaseRecipeAnalysis
+
+def is_analysis_class(obj):
+    return inspect.isclass(obj) and issubclass(obj, BaseRecipeAnalysis) and obj != BaseRecipeAnalysis
+
+def get_class_from_name(analysis_name: str):
+    sys.path.append(Constant.ANALYSIS_PATH)
+    analysis_path = f"analysis.{analysis_name}.{analysis_name}"
+    module = None
+    try:
+        module = importlib.import_module(analysis_path)
+    except Exception as e:
+        print(f"[ERROR] {analysis_path} not found: {e}")
+
+    specific_analysis = inspect.getmembers(module, is_analysis_class)
+    if not specific_analysis:
+        print(f"[ERROR] {analysis_name} not found.")
+        return None
+    return specific_analysis[0]
diff --git a/profiler/cluster_analyse/common_func/constant.py b/profiler/cluster_analyse/common_func/constant.py
index e426a9d22567ae9e70411f709c1c09ce02cbdeca..80f0374c1d1d9a37204b9583112ce5baa4cf3e95 100644
--- a/profiler/cluster_analyse/common_func/constant.py
+++ b/profiler/cluster_analyse/common_func/constant.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os class Constant(object): # dir name @@ -30,6 +31,7 @@ class Constant(object): MAX_JSON_SIZE = 1024 * 1024 * 1024 * 10 MAX_CSV_SIZE = 1024 * 1024 * 1024 * 5 MAX_PATH_LENGTH = 4096 + MAX_READ_DB_FILE_BYTES = 1024 * 1024 * 1024 * 8 # communication P2P = "p2p" @@ -56,6 +58,9 @@ class Constant(object): OP_NAME = "Op Name" BANDWIDTH_GB_S = "Bandwidth(GB/s)" COMMUNICATION = "communication.json" + ELAPSE_TIME_MS = "Elapse Time(ms)" + IDLE_TIME_MS = "Idle Time(ms)" + LARGE_PACKET_RATIO = "Large Packet Ratio" # params DATA_MAP = "data_map" @@ -66,11 +71,12 @@ class Constant(object): COMMUNICATION_GROUP = "communication_group" TRANSPORT_TYPE = "Transport Type" COMM_DATA_DICT = "comm_data_dict" + DATA_TYPE = "data_type" ANALYSIS_MODE = "analysis_mode" # step time - RANK = 'rank' - STAGE = 'stage' + RANK = "rank" + STAGE = "stage" # epsilon EPS = 1e-15 @@ -78,3 +84,35 @@ class Constant(object): # file suffix JSON_SUFFIX = ".json" CSV_SUFFIX = ".csv" + + # result files type + TEXT = "text" + DB = "db" + INVALID = "invalid" + + # db name + DB_COMMUNICATION_ANALYZER = "analysis.db" + DB_CLUSTER_COMMUNICATION_ANALYZER = "cluster_analysis.db" + + # db tables + TABLE_COMM_ANALYZER_BANDWIDTH = "CommAnalyzerBandwidth" + TABLE_COMM_ANALYZER_TIME = "CommAnalyzerTime" + TABLE_COMM_ANALYZER_MATRIX = "CommAnalyzerMatrix" + TABLE_STEP_TRACE = "StepTraceTime" + TABLE_HOST_INFO = "HostInfo" + TABLE_RANK_DEVICE_MAP = "RankDeviceMap" + + # data config key + CONFIG = "config" + EXPER_CONFIG = "experimental_config" + EXPORT_TYPE = "_export_type" + + # recipe config + ANALYSIS = "analysis" + RECIPE_NAME = "recipe_name" + RECIPE_CLASS = "recipe_class" + PARALLEL_MODE = "parallel_mode" + CLUSTER_CUSTOM_ANALYSE_PATH = os.path.abspath(os.path.dirname(__file__)) + ANALYSIS_PATH = os.path.join(CLUSTER_CUSTOM_ANALYSE_PATH, 'analysis') + + CONCURRENT_MODE = "concurrent" \ No newline at end of file diff --git a/profiler/cluster_analyse/common_func/context.py b/profiler/cluster_analyse/common_func/context.py new file mode 100644 index 0000000000000000000000000000000000000000..4e3d544d3769e0c1360790dc1a4c57ca484687b8 --- /dev/null +++ b/profiler/cluster_analyse/common_func/context.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
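+# Execution contexts for the cluster recipe analyses; ConcurrentContext below fans work out to a process pool.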
+
+import os
+from functools import partial
+from concurrent import futures
+from common_func.constant import Constant
+
+
+class Context(object):
+    """abstract base class"""
+
+    ctx_map = None
+
+    @classmethod
+    def create_context(cls, mode=Constant.CONCURRENT_MODE):
+        if cls.ctx_map is None:
+            keys = [Constant.CONCURRENT_MODE]
+            values = [ConcurrentContext]
+            cls.ctx_map = dict(zip(keys, values))
+
+        if mode not in cls.ctx_map:
+            raise NotImplementedError("mode must be in {}".format(list(cls.ctx_map.keys())))
+
+        return cls.ctx_map[mode]()
+
+    def __init__(self):
+        print("[INFO] context {} initialized.".format(self._mode))
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        if exc_type is not None:
+            print(f"[ERROR] Failed to exit context: {exc_val}")
+
+    def launch(self, func, *args, **kwargs):
+        raise NotImplementedError
+
+    def map(self, func, *iterables, **kwargs):
+        raise NotImplementedError
+
+    def wait(self, waitable):
+        raise NotImplementedError
+
+class ConcurrentContext(Context):
+
+    def __init__(self, executor=None):
+        self._mode = Constant.CONCURRENT_MODE
+        super().__init__()
+        self._custom = executor is None
+        self._executor = executor or futures.ProcessPoolExecutor(max_workers=os.cpu_count())
+
+    def __enter__(self):
+        if self._executor is None:
+            raise RuntimeError("executor is None")
+        return self
+
+    def close(self):
+        if self._custom:
+            self._executor.shutdown(wait=True)
+            self._executor = None
+
+    def launch(self, func, *args, **kwargs):
+        return self._executor.submit(func, *args, **kwargs).result()
+
+    def map(self, func, *iterables, **kwargs):
+        partial_func = partial(func, **kwargs)
+        return list(self._executor.map(partial_func, *iterables))
+
+    def wait(self, waitable):
+        return waitable
\ No newline at end of file
diff --git a/profiler/cluster_analyse/common_func/db_manager.py b/profiler/cluster_analyse/common_func/db_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0d6ad89be8edd8bbb2a4ee8e0653141550b0129
--- /dev/null
+++ b/profiler/cluster_analyse/common_func/db_manager.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
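+# Thin wrapper around sqlite3 shared by the cluster analyses and the communication group builders.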
+ +import os +import sqlite3 + +from common_func.constant import Constant +from common_func.empty_class import EmptyClass +from common_func.file_manager import check_db_path_valid +from common_func.tables_config import TablesConfig +from common_func.sql_extention_func import SqlExtentionAggregateFunc + +class DBManager: + """ + class to manage DB operation + """ + FETCH_SIZE = 10000 + INSERT_SIZE = 10000 + MAX_ROW_COUNT = 100000000 + + @staticmethod + def create_connect_db(db_path: str, mode=None) -> tuple: + """ + create and connect database + """ + if check_db_path_valid(db_path, is_create=True): + try: + conn = sqlite3.connect(db_path) + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return EmptyClass("empty conn"), EmptyClass("empty curs") + try: + if mode == Constant.ANALYSIS: + try: + for func_name, params_count, class_name in SqlExtentionAggregateFunc: + conn.create_aggregate(func_name, params_count, class_name) + except sqlite3.Error as err: + print(f"[ERROR] {err}") + if isinstance(conn, sqlite3.Connection): + curs = conn.cursor() + os.chmod(db_path, Constant.FILE_AUTHORITY) + return conn, curs + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return EmptyClass("empty conn"), EmptyClass("empty curs") + return EmptyClass("empty conn"), EmptyClass("empty curs") + + @staticmethod + def destroy_db_connect(conn: any, curs: any) -> None: + """ + destroy db connection + """ + try: + if isinstance(curs, sqlite3.Cursor): + curs.close() + except sqlite3.Error as err: + print(f"[ERROR] {err}") + try: + if isinstance(conn, sqlite3.Connection): + conn.close() + except sqlite3.Error as err: + print(f"[ERROR] {err}") + + @staticmethod + def judge_table_exists(curs: any, table_name: str) -> any: + """ + judge table exists + """ + if not isinstance(curs, sqlite3.Cursor): + return False + try: + curs.execute("select count(*) from sqlite_master where type='table' and name=?", (table_name,)) + return curs.fetchone()[0] + except sqlite3.Error as err: + print("[ERROR] {}".format(err)) + return False + + @staticmethod + def sql_generate_table(table_map: str): + header_with_type_begin = "(" + header_with_type_end = ")" + header_with_type_list = [] + if table_map in TablesConfig.DATA: + items = TablesConfig.DATA[table_map] + for item in items: + if item[0] == "index": + header_with_type_list.append('"' + item[0] + '" ' + item[1].split(",")[0]) + else: + header_with_type_list.append(item[0] + ' ' + item[1].split(",")[0]) + header_with_type_begin += ",".join(header_with_type_list) + header_with_type_begin += header_with_type_end + return header_with_type_begin + return "" + + @classmethod + def check_tables_in_db(cls, db_path: any, *tables: any) -> bool: + if check_db_path_valid(db_path): + conn, curs = cls.create_connect_db(db_path) + if not (conn and curs): + return False + res = True + for table in tables: + if not cls.judge_table_exists(curs, table): + res = False + break + cls.destroy_db_connect(conn, curs) + return res + return False + + @classmethod + def create_tables(cls, db_path: any, *tables: any): + conn, curs = cls.create_connect_db(db_path) + if not (conn and curs): + return + for table_name in tables: + if cls.judge_table_exists(curs, table_name): + drop_sql = "drop table {0}".format(table_name) + cls.execute_sql(conn, drop_sql) + table_map = "{0}Map".format(table_name) + header_with_type = cls.sql_generate_table(table_map) + sql = "CREATE TABLE IF NOT EXISTS " + table_name + header_with_type + cls.execute_sql(conn, sql) + cls.destroy_db_connect(conn, curs) + + @classmethod 
+    def get_table_column_count(cls, db_path: any, table: any) -> int:
+        conn, curs = cls.create_connect_db(db_path)
+        if not (conn and curs):
+            return 0
+        sql = "SELECT COUNT(*) FROM pragma_table_info('{}')".format(table)
+        res = 0
+        try:
+            curs.execute(sql)
+            res = curs.fetchone()[0]
+        except sqlite3.Error as err:
+            print("[ERROR] {}".format(err))
+        finally:
+            cls.destroy_db_connect(conn, curs)
+        return res
+
+    @staticmethod
+    def execute_sql(conn: any, sql: str, params: any = None) -> bool:
+        """
+        execute sql
+        """
+        try:
+            if isinstance(conn, sqlite3.Connection):
+                if params:
+                    conn.cursor().execute(sql, params)
+                else:
+                    conn.cursor().execute(sql)
+                conn.commit()
+                return True
+        except sqlite3.Error as err:
+            print(f"[ERROR] {err}")
+            return False
+        print("[ERROR] conn is invalid param")
+        return False
+
+    @staticmethod
+    def executemany_sql(conn: any, sql: str, params: any) -> bool:
+        """
+        execute many sql once
+        """
+        try:
+            if isinstance(conn, sqlite3.Connection):
+                conn.cursor().executemany(sql, params)
+                conn.commit()
+                return True
+        except sqlite3.Error as err:
+            print(f"[ERROR] {err}")
+            return False
+        print("[ERROR] conn is invalid param")
+        return False
+
+    @classmethod
+    def fetch_all_data(cls: any, curs: any, sql: str, param: tuple = None, is_dict: bool = True) -> list:
+        """
+        fetch 10000 num of data from db each time to get all data
+        """
+        if not isinstance(curs, sqlite3.Cursor):
+            return []
+        data = []
+        try:
+            if param:
+                res = curs.execute(sql, param)
+            else:
+                res = curs.execute(sql)
+        except sqlite3.Error as err:
+            print(f"[ERROR] {err}")
+            curs.row_factory = None
+            return []
+        try:
+            description = res.description
+            while True:
+                res = curs.fetchmany(cls.FETCH_SIZE)
+                if is_dict:
+                    data += CustomizedDictFactory.generate_dict_from_db(res, description)
+                else:
+                    data += res
+                if len(data) > cls.MAX_ROW_COUNT:
+                    print("[WARNING] The records count in the table exceeds the limit!")
+                if len(res) < cls.FETCH_SIZE:
+                    break
+            return data
+        except sqlite3.Error as err:
+            print(f"[ERROR] {err}")
+            return []
+        finally:
+            curs.row_factory = None
+
+
+class CustomizedDictFactory:
+    @staticmethod
+    def generate_dict_from_db(data_result: any, description: any) -> any:
+        description_set = [i[0] for i in description]
+        res = []
+        for data in data_result:
+            data_dict = dict(zip(description_set, data))
+            res.append(data_dict)
+        return res
diff --git a/profiler/cluster_analyse/common_func/empty_class.py b/profiler/cluster_analyse/common_func/empty_class.py
new file mode 100644
index 0000000000000000000000000000000000000000..df100d156fa064cca4514260db0b2e843e217d09
--- /dev/null
+++ b/profiler/cluster_analyse/common_func/empty_class.py
@@ -0,0 +1,20 @@
+class EmptyClass:
+
+    def __init__(self: any, info: str = "") -> None:
+        self._info = info
+
+    @classmethod
+    def __bool__(cls: any) -> bool:
+        return False
+
+    @classmethod
+    def __str__(cls: any) -> str:
+        return ""
+
+    @property
+    def info(self: any) -> str:
+        return self._info
+
+    @staticmethod
+    def is_empty() -> bool:
+        return True
diff --git a/profiler/cluster_analyse/common_func/file_manager.py b/profiler/cluster_analyse/common_func/file_manager.py
index 3853c806f92de1d8da14e32105fcc869789a9a40..e7e2d5adca37faf5b377bcbe720fdfba84311eca 100644
--- a/profiler/cluster_analyse/common_func/file_manager.py
+++ b/profiler/cluster_analyse/common_func/file_manager.py
@@ -98,9 +98,13 @@
             raise RuntimeError(f"Can't create the file: {base_name}") from e
 
     @classmethod
-    def create_output_dir(cls, collection_path: str) -> None:
+ def create_output_dir(cls, collection_path: str, is_overwrite: bool = False) -> None: output_path = os.path.join( collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + if is_overwrite: + if not os.path.exists(output_path): + PathManager.make_dir_safety(output_path) + return PathManager.remove_path_safety(output_path) PathManager.make_dir_safety(output_path) @@ -115,3 +119,13 @@ class FileManager: file_size = os.path.getsize(file_path) if file_size > limit_size: raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.") + + +def check_db_path_valid(path: str, is_create: bool = False, max_size: int = Constant.MAX_READ_DB_FILE_BYTES) -> bool: + if os.path.islink(path): + print(f'[ERROR] The db file path: {path} is link. Please check the path') + return False + if not is_create and os.path.exists(path) and os.path.getsize(path) > max_size: + print(f'[ERROR] The db file: {path} is too large to read. Please check the file') + return False + return True diff --git a/profiler/cluster_analyse/common_func/sql_extention_func.py b/profiler/cluster_analyse/common_func/sql_extention_func.py new file mode 100644 index 0000000000000000000000000000000000000000..987a0d4365307704d6abf32575a48cc15c0fa33d --- /dev/null +++ b/profiler/cluster_analyse/common_func/sql_extention_func.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
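+# Aggregate functions registered on the sqlite connection by DBManager.create_connect_db.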
+ +import numpy as np + + +class Median: + + def __init__(self) -> None: + self.data = [] + + def step(self, value) -> None: + self.data.append(value) + + def finalize(self): + return np.median(self.data) + + +class LowerQuartile: + + def __init__(self) -> None: + self.data = [] + + def step(self, value) -> None: + self.data.append(value) + + def finalize(self): + return np.quantile(self.data, 0.25) + + +class UpperQuartile: + + def __init__(self) -> None: + self.data = [] + + def step(self, value) -> None: + self.data.append(value) + + def finalize(self): + return np.quantile(self.data, 0.75) + + +class StandardDeviation: + + def __init__(self) -> None: + self.data = [] + + def step(self, value) -> None: + self.data.append(value) + + def finalize(self): + return np.std(self.data) + + +# func_name, params_count, class +SqlExtentionAggregateFunc = [ + ('median', 1, Median), + ('lower_quartile', 1, LowerQuartile), + ('upper_quartile', 1, UpperQuartile), + ('stdev', 1, StandardDeviation) +] diff --git a/profiler/cluster_analyse/common_func/table_constant.py b/profiler/cluster_analyse/common_func/table_constant.py new file mode 100644 index 0000000000000000000000000000000000000000..de6d47e97e5683493905de5353a9978195e87b70 --- /dev/null +++ b/profiler/cluster_analyse/common_func/table_constant.py @@ -0,0 +1,27 @@ +class TableConstant: + + RANK_SET = "rank_set" + STEP = "step" + RANK_ID = "rank_id" + TYPE = "type" + HCCL_OP_NAME = "hccl_op_name" + GROUP_NAME = "group_name" + START_TIMESTAMP = "start_timestamp" + ELAPSED_TIME = "elapse_time" + TRANSIT_TIME = "transit_time" + WAIT_TIME = "wait_time" + SYNCHRONIZATION_TIME = "synchronization_time" + IDLE_TIME = "idle_time" + SYNCHRONIZATION_TIME_RATIO = "synchronization_time_ratio" + WAIT_TIME_RATIO = "wait_time_ratio" + BAND_TYPE = "band_type" + TRANSIT_SIZE = "transit_size" + BANDWIDTH = "bandwidth" + LARGE_PACKET_RATIO = "large_packet_ratio" + PACKAGE_SIZE = "package_size" + COUNT = "count" + TOTAL_DURATION = "total_duration" + SRC_RANK = "src_rank" + DST_RANK = "dst_rank" + TRANSPORT_TYPE = "transport_type" + OPNAME = "op_name" diff --git a/profiler/cluster_analyse/common_func/tables_config.py b/profiler/cluster_analyse/common_func/tables_config.py new file mode 100644 index 0000000000000000000000000000000000000000..f010014519f864e627f83b99ad0df26af98af3f9 --- /dev/null +++ b/profiler/cluster_analyse/common_func/tables_config.py @@ -0,0 +1,73 @@ +class TablesConfig: + DATA = { + "ClusterCommAnalyzerTimeMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("rank_id", "INTEGER, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("start_timestamp", "NUMERIC, null"), + ("elapsed_time", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("wait_time", "NUMERIC, null"), + ("synchronization_time", "NUMERIC, null"), + ("idle_time", "NUMERIC, null"), + ("synchronization_time_ratio", "NUMERIC, null"), + ("wait_time_ratio", "NUMERIC, null") + ], + "CommunicationGroupMap": [ + ("type", "TEXT, null"), + ("rank_set", "TEXT, null") + ], + "ClusterCommAnalyzerBandwidthMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("rank_id", "INTEGER, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("band_type", "TEXT, null"), + ("transit_size", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("bandwidth", "NUMERIC, null"), + ("large_packet_ratio", "NUMERIC, null"), + ("package_size", "NUMERIC, null"), + ("count", "NUMERIC, null"), + ("total_duration", "NUMERIC, null") + ], + 
"ClusterCommAnalyzerMatrixMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("src_rank", "TEXT, null"), + ("dst_rank", "TEXT, null"), + ("transit_size", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("bandwidth", "NUMERIC, null"), + ("transport_type", "TEXT, null"), + ("op_name", "TEXT, null") + ], + "ClusterStepTraceTimeMap": [ + ("step", "TEXT, null"), + ("type", "TEXT, null"), + ("index", "TEXT, null"), + ("computing", "NUMERIC, null"), + ("communication_not_overlapped", "NUMERIC, null"), + ("overlapped", "NUMERIC, null"), + ("communication", "NUMERIC, null"), + ("free", "NUMERIC, null"), + ("stage", "NUMERIC, null"), + ("bubble", "NUMERIC, null"), + ("communication_not_overlapped_and_exclude_receive", "NUMERIC, null"), + ("preparing", "NUMERIC, null") + ], + "HostInfoMap": [ + ("hostUid", "INTEGER, null"), + ("hostName", "TEXT, null") + ], + "RankDeviceMapMap": [ + ("rankId", "INTEGER, null"), + ("deviceId", "INTEGER, null"), + ("hostUid", "INTEGER, null") + ] + } diff --git a/profiler/cluster_analyse/common_func/utils.py b/profiler/cluster_analyse/common_func/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0a20a5c237f9f46e7b7425ef4b295dad4656174e --- /dev/null +++ b/profiler/cluster_analyse/common_func/utils.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + + +def format_columns(df: pd.DataFrame): + formatted_df = df.rename( + { + "25%": "Q1Ns", + "50%": "MedianNs", + "75%": "Q3Ns", + 0.25: "Q1Ns", + 0.5: "MedianNs", + 0.75: "Q3Ns", + "Q1": "Q1Ns", + "Q3": "Q3Ns", + "min": "MinNs", + "max": "MaxNs", + "median": "MedianNs", + "sum": "SumNs", + "std": "StdNs", + "mean": "MeanNs", + "count": "Count" + }, + axis="columns" + ) + + stats_cols = ["Count", "MeanNs", "StdNs", "MinNs", "Q1Ns", "MedianNs", "Q3Ns", "MaxNs", "SumNs"] + other_cols = [col for col in formatted_df.columns if col not in stats_cols] + return formatted_df[stats_cols + other_cols] + + +def describe_duration(series_groupby): + agg_df = series_groupby.agg(["min", "max", "count", "std", "mean", "sum"]) + quantile_df = series_groupby.quantile([0.25, 0.5, 0.75]) + + quantile_df = quantile_df.unstack() + quantile_df.columns = ["25%", "50%", "75%"] + + stats_df = pd.merge(agg_df, quantile_df, left_index=True, right_index=True) + formated_df = format_columns(stats_df) + formated_df.index.name = stats_df.index.name + return formated_df + + +def stdev(df, aggregated): + if len(df) <= 1: + return df["stdevNs"].iloc[0] + instance = aggregated["totalCount"].loc[df.name] + var_sum = np.dot(df["totalCount"] - 1, df["stdev"] ** 2) + deviation = df["averageNs"] - aggregated["averageNs"].loc[df.name] + dev_sum = np.dot(df["totalCount"], deviation ** 2) + return np.sqrt((var_sum + dev_sum) / (instance - 1)) + + +def convert_unit(df: pd.DataFrame, src_unit, dst_unit): + df.loc[:, df.columns.str.endswith(src_unit)] = df.loc[:, df.columns.str.endswith(src_unit)].apply(lambda x: x / 1000.0) + df = df.rename(columns=lambda x: x.replace(src_unit, "".join(["(", dst_unit, ")"]))) + return df diff --git a/profiler/cluster_analyse/communication_group/base_communication_group.py b/profiler/cluster_analyse/communication_group/base_communication_group.py new file mode 100644 index 0000000000000000000000000000000000000000..55f6801c2875698047849d39fbee3b9827c9ad28 --- /dev/null +++ b/profiler/cluster_analyse/communication_group/base_communication_group.py @@ -0,0 +1,228 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
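+# Shared machinery for building communication groups from per-rank profiling output (JSON or DB flavoured).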
+
+import os
+from abc import abstractmethod
+from collections import defaultdict
+from copy import deepcopy
+from multiprocessing import Pool
+
+from common_func.constant import Constant
+from cluster_utils.data_transfer_adapter import DataTransferAdapter
+
+
+class BaseCommunicationGroup:
+    def __init__(self, params: dict):
+        self.collection_path = params.get(Constant.COLLECTION_PATH)
+        self.data_map = params.get(Constant.DATA_MAP)
+        self.data_type = params.get(Constant.DATA_TYPE)
+        self.analysis_mode = params.get(Constant.ANALYSIS_MODE)
+        self.rank_comm_dir_dict = {}
+        self.p2p_link = []
+        self.collective_group_dict = defaultdict(set)
+        self.p2p_comm_group = []
+        self.communication_group = {}
+        self.communication_ops = []
+        self.matrix_ops = []
+        self.adapter = DataTransferAdapter()
+
+    def load_communication_data(self):
+        comm_op_dirs = []
+        for rank_id, profiling_dir_path in self.data_map.items():
+            if self.data_type == Constant.TEXT:
+                comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_JSON)
+                matrix_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_MATRIX_JSON)
+            else:
+                comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.DB_COMMUNICATION_ANALYZER)
+                matrix_dir = comm_dir
+            if os.path.exists(comm_dir) or os.path.exists(matrix_dir):
+                comm_op_dirs.append((rank_id, comm_dir, matrix_dir))
+            else:
+                print(
+                    f"[WARNING] Rank {rank_id} has no valid communication data or communication_matrix data.")
+        max_processes = max(1, os.cpu_count() // 2)
+        with Pool(processes=max_processes) as p:
+            self.rank_comm_dir_dict = p.map(self.read_communication_func, comm_op_dirs)
+
+    def set_p2p_groups(self):
+        self.p2p_link = sorted(self.p2p_link, key=lambda x: min(x))
+        while self.p2p_link:
+            union_set = deepcopy(self.p2p_link[0])
+            rm_list = [self.p2p_link[0]]
+            for idx, link_rank_set_x in enumerate(self.p2p_link[1:]):
+                if UnionFind.is_connected(link_rank_set_x, union_set):
+                    union_set = union_set.union(link_rank_set_x)
+                    rm_list.append(link_rank_set_x)
+            self.p2p_comm_group.append(union_set)
+            self.p2p_link = [element for element in self.p2p_link if element not in rm_list]
+
+    def generate_collective_communication_group(self):
+        self.communication_group[Constant.COLLECTIVE] = \
+            [list(group) for group_name, group in self.collective_group_dict.items()]
+
+    def generate_p2p_communication_group(self):
+        stage_group = {}
+        for group_name, rank_set in self.collective_group_dict.items():
+            if not self.whether_valid_comm_group(rank_set):
+                continue
+            unioned_set = set()
+            remove_key = []
+            for first_rank, stage in stage_group.items():
+                if UnionFind.is_connected(rank_set, stage):
+                    unioned_set = UnionFind.union(rank_set, stage, unioned_set)
+                    remove_key.append(first_rank)
+            if unioned_set:
+                for key in remove_key:
+                    del stage_group[key]
+                stage_group[min(unioned_set)] = unioned_set
+            else:
+                stage_group[min(rank_set)] = rank_set
+        first_rank_sort_list = sorted([first_rank for first_rank in stage_group])
+        self.communication_group[Constant.P2P] = \
+            [list(stage_group.get(first_rank, {})) for first_rank in first_rank_sort_list]
+
+    def whether_valid_comm_group(self, rank_set: set):
+        """
+        A communication group is only used to infer stage info when it satisfies:
+        1.
group can not include more than 1 rank in every single p2p group + """ + for p2p_rank_set in self.p2p_comm_group: + if len(rank_set.intersection(p2p_rank_set)) > 1: + return False + return True + + @abstractmethod + def read_communication_func(self, params: tuple): + pass + + def analyze_communication_data(self): + for rank_id, rank_id_comm_dict, rank_id_matrix_dict in self.rank_comm_dir_dict: + for step_id, step_id_dict in rank_id_comm_dict.items(): + if not isinstance(step_id_dict, dict): + print(f"[WARNING] rank{rank_id}'s communication.json has a wrong data struct.") + continue + self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) + for comm_op_type, comm_op_dict in step_id_dict.items(): + self.add_communication_ops(rank_id, step_id, comm_op_type, comm_op_dict) + + for step_id, step_id_dict in rank_id_matrix_dict.items(): + if not isinstance(step_id_dict, dict): + print(f"[WARNING] rank{rank_id}'s communication_matrix.json has a wrong data struct.") + continue + self.set_p2p_link(rank_id, step_id, rank_id_matrix_dict) + self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) + + @abstractmethod + def dump_data(self): + pass + + def collect_comm_data(self): + comm_data_dict = { + Constant.COLLECTIVE_GROUP: self.collective_group_dict, + Constant.COMMUNICATION_OPS: self.communication_ops, + Constant.MATRIX_OPS: self.matrix_ops, + Constant.COMMUNICATION_GROUP: self.communication_group + } + return comm_data_dict + + def generate(self): + self.load_communication_data() + self.analyze_communication_data() + self.set_p2p_groups() + self.generate_collective_communication_group() + self.generate_p2p_communication_group() + self.dump_data() + return self.collect_comm_data() + + def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict): + ops = rank_id_matrix_dict.get(step_id, {}) + self.add_matrix_ops(rank_id, step_id, ops) + if not ops: + print(f"[WARNING] rank{rank_id} {step_id} do not have communication matrix ops data.") + return + p2p_ops = ops.get(Constant.P2P, {}) + for op_name, link_dict in p2p_ops.items(): + self.append_p2p_link(op_name, link_dict) + + def append_p2p_link(self, op_name, link_dict): + for link in link_dict: + if '-' not in link: + print(f"[WARNING] {op_name} has an invalid link key {link}!") + break + src_rank = int(link.split('-')[0]) + dst_rank = int(link.split('-')[1]) + if src_rank != dst_rank: + rank_set = {src_rank, dst_rank} + if rank_set in self.p2p_link: + continue + self.p2p_link.append(rank_set) + + def get_collective_ops_name(self, rank_id: int, comm_op_dict: dict): + for comm_op in comm_op_dict: + if comm_op.startswith('Total'): + continue + group_name = comm_op.split('@')[-1] + self.collective_group_dict[group_name].add(rank_id) + + def add_communication_ops(self, rank_id: str, step_id: str, comm_op_type: str, comm_op_dict: dict): + for comm_op in comm_op_dict: + if comm_op.startswith('Total'): + continue + group_name = comm_op.split('@')[-1] + self.communication_ops.append({ + Constant.RANK_ID: rank_id, + Constant.STEP_ID: step_id, + Constant.COMM_OP_TYPE: comm_op_type, + Constant.COMM_OP_NAME: comm_op, + Constant.GROUP_NAME: group_name, + Constant.COMM_OP_INFO: comm_op_dict.get(comm_op) + }) + + def add_matrix_ops(self, rank_id: int, step_id: str, step_id_dict: dict): + for comm_op_type, comm_dict in step_id_dict.items(): + if comm_op_type != Constant.COLLECTIVE and comm_op_type != Constant.P2P: + print(f"[WARNING] Unknown communication operators type!") + continue + for op_name, 
op_link_info in comm_dict.items():
+                if op_name.startswith('Total'):
+                    continue
+                group_name = op_name.split('@')[-1]
+                self.matrix_ops.append({
+                    Constant.RANK_ID: rank_id,
+                    Constant.STEP_ID: step_id,
+                    Constant.COMM_OP_TYPE: comm_op_type,
+                    Constant.COMM_OP_NAME: op_name,
+                    Constant.GROUP_NAME: group_name,
+                    Constant.COMM_OP_INFO: op_link_info
+                })
+
+
+class UnionFind(object):
+    """Disjoint Set Union"""
+
+    @classmethod
+    def union(cls, first_set: set, second_set: set, third_set: set):
+        """merge the given rank sets into one"""
+        return first_set | second_set | third_set
+
+    @classmethod
+    def is_connected(cls, first_set: set, second_set: set):
+        """
+        check whether the two rank sets intersect
+        """
+        if first_set & second_set:
+            return True
+        else:
+            return False
diff --git a/profiler/cluster_analyse/communication_group/communication_db_group.py b/profiler/cluster_analyse/communication_group/communication_db_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..510dcd971357dfb4798e4d284a72fbb3f3a21859
--- /dev/null
+++ b/profiler/cluster_analyse/communication_group/communication_db_group.py
@@ -0,0 +1,57 @@
+import os
+
+from common_func.db_manager import DBManager
+from common_func.constant import Constant
+from communication_group.base_communication_group import BaseCommunicationGroup
+
+
+class CommunicationDBGroup(BaseCommunicationGroup):
+    COMMUNICATION_GROUP_TABLE = "CommunicationGroup"
+
+    def __init__(self, params: dict):
+        super().__init__(params)
+
+    def read_communication_func(self, params: tuple):
+        if len(params) < 3:
+            return -1, {}, {}
+        rank_id = params[0]
+        db_path = params[1]
+        time_data = []
+        bandwidth_data = []
+        matrix_data = []
+        if os.path.exists(db_path):
+            conn, cursor = DBManager.create_connect_db(db_path)
+            time_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_TIME)
+            bandwidth_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_BANDWIDTH)
+            matrix_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_MATRIX)
+            if (DBManager.check_tables_in_db(db_path, Constant.TABLE_COMM_ANALYZER_TIME,
+                                             Constant.TABLE_COMM_ANALYZER_BANDWIDTH)
+                    and self.analysis_mode in ["all", "communication_time"]):
+                time_data = DBManager.fetch_all_data(cursor, time_info_sql)
+                bandwidth_data = DBManager.fetch_all_data(cursor, bandwidth_info_sql)
+            if (DBManager.check_tables_in_db(db_path, Constant.TABLE_COMM_ANALYZER_MATRIX)
+                    and self.analysis_mode in ["all", "communication_matrix"]):
+                matrix_data = DBManager.fetch_all_data(cursor, matrix_info_sql)
+            DBManager.destroy_db_connect(conn, cursor)
+        comm_data = self.adapter.transfer_comm_from_db_to_json(time_data, bandwidth_data)
+        comm_matrix_data = self.adapter.transfer_matrix_from_db_to_json(matrix_data)
+        return rank_id, comm_data, comm_matrix_data
+
+    def dump_data(self):
+        output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT)
+        result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER)
+        res = []
+        for data_type, data_list in self.communication_group.items():
+            for data in data_list:
+                rank_set = "(" + ",".join(str(i) for i in data) + ")"
+                data = [data_type, rank_set]
+                res.append(data)
+        if res:
+            DBManager.create_tables(result_db, self.COMMUNICATION_GROUP_TABLE)
+            conn, cursor = DBManager.create_connect_db(result_db)
+            sql = "insert into {} values ({value})".format(self.COMMUNICATION_GROUP_TABLE,
+                                                           value="?," * (len(res[0]) - 1) + "?")
+            # Batch-insert the group rows, then release the shared connection.
+            DBManager.executemany_sql(conn, sql, res)
+
DBManager.destroy_db_connect(conn, cursor) + else: + print("[WARNING] The CommunicationGroup table won't be created because no data has been calculated.") diff --git a/profiler/cluster_analyse/communication_group/communication_group_generator.py b/profiler/cluster_analyse/communication_group/communication_group_generator.py index 4963bf95399fea29edf31be324a49801e7f485d1..3dca90454b608fe3ffb1c365854c2aa3950b6cee 100644 --- a/profiler/cluster_analyse/communication_group/communication_group_generator.py +++ b/profiler/cluster_analyse/communication_group/communication_group_generator.py @@ -13,211 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from copy import deepcopy -from multiprocessing import Pool -from collections import defaultdict from common_func.constant import Constant -from common_func.file_manager import FileManager +from communication_group.communication_db_group import CommunicationDBGroup +from communication_group.communication_json_group import CommunicationJsonGroup class CommunicationGroupGenerator: - COMMUNICATION_GROUP_JSON = "communication_group.json" + + GROUP_MAP = { + Constant.DB: CommunicationDBGroup, + Constant.TEXT: CommunicationJsonGroup + } def __init__(self, params: dict): - self.collection_path = params.get(Constant.COLLECTION_PATH) - self.data_map = params.get(Constant.DATA_MAP) - self.analysis_mode = params.get(Constant.ANALYSIS_MODE) - self.communication_group = {} - self.collective_group_dict = defaultdict(set) - self.p2p_group_dict = defaultdict(list) - self.rank_comm_dir_dict = {} - self.communication_ops = [] - self.p2p_comm_group = [] - self.p2p_link = [] - self.matrix_ops = [] + self.processor = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))(params) def generate(self): - self.load_communication_json() - self.analyze_communication_ops() - self.set_p2p_groups() - self.generate_collective_communication_group() - self.generate_p2p_communication_group() - FileManager.create_json_file(self.collection_path, self.communication_group, self.COMMUNICATION_GROUP_JSON) - comm_data_dict = { - Constant.COLLECTIVE_GROUP: self.collective_group_dict, - Constant.COMMUNICATION_OPS: self.communication_ops, - Constant.MATRIX_OPS: self.matrix_ops, - Constant.COMMUNICATION_GROUP: self.communication_group - } - return comm_data_dict - - def analyze_communication_ops(self): - for rank_id, rank_id_comm_dict, rank_id_matrix_dict in self.rank_comm_dir_dict: - for step_id, step_id_dict in rank_id_comm_dict.items(): - if not isinstance(step_id_dict, dict): - print(f"[WARNING] rank{rank_id}'s communication.json has a wrong data struct.") - continue - self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) - for comm_op_type, comm_op_dict in step_id_dict.items(): - self.add_communication_ops(rank_id, step_id, comm_op_type, comm_op_dict) - - for step_id, step_id_dict in rank_id_matrix_dict.items(): - if not isinstance(step_id_dict, dict): - print(f"[WARNING] rank{rank_id}'s communication_matrix.json has a wrong data struct.") - continue - self.set_p2p_link(rank_id, step_id, rank_id_matrix_dict) - self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) - - def read_comm_json_func(self: any, params: tuple): - if len(params) < 3: - return -1, {}, {} - rank_id = params[0] - comm_json_path = params[1] - matrix_json_path = params[2] - comm_data = {} - matrix_data = {} - if os.path.exists(comm_json_path) and self.analysis_mode in ['all', 'communication_time']: - comm_data 
= FileManager.read_json_file(comm_json_path) - if os.path.exists(matrix_json_path) and self.analysis_mode in ['all', 'communication_matrix']: - matrix_data = FileManager.read_json_file(matrix_json_path) - return rank_id, comm_data, matrix_data - - def load_communication_json(self): - comm_op_dirs = [] - for rank_id, profiling_dir_path in self.data_map.items(): - comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_JSON) - matrix_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_MATRIX_JSON) - if comm_dir and matrix_dir: - comm_op_dirs.append((rank_id, comm_dir, matrix_dir)) - else: - print(f"[WARNING] Rank {rank_id} does not have a valid communication.json or communication_matrix.json.") - with Pool() as p: - self.rank_comm_dir_dict = p.map(self.read_comm_json_func, comm_op_dirs) - - def generate_collective_communication_group(self): - self.communication_group[Constant.COLLECTIVE] = \ - [list(group) for group_name, group in self.collective_group_dict.items()] - - def whether_valid_comm_group(self, rank_set: set): - """ - while distinguish which communication group should be used to infer stage info, these group should be ignored: - 1. group can not include more than 1 rank in every single p2p group - """ - for p2p_rank_set in self.p2p_comm_group: - if len(rank_set.intersection(p2p_rank_set)) > 1: - return False - return True - - def generate_p2p_communication_group(self): - stage_group = {} - for group_name, rank_set in self.collective_group_dict.items(): - if not self.whether_valid_comm_group(rank_set): - continue - unioned_set = set() - remove_key = [] - for first_rank, stage in stage_group.items(): - if UnionFind.is_connected(rank_set, stage): - unioned_set = UnionFind.union(rank_set, stage, unioned_set) - remove_key.append(first_rank) - if unioned_set: - for key in remove_key: - del stage_group[key] - stage_group[min(unioned_set)] = unioned_set - else: - stage_group[min(rank_set)] = rank_set - first_rank_sort_list = sorted([first_rank for first_rank in stage_group]) - self.communication_group[Constant.P2P] = \ - [list(stage_group.get(first_rank, {})) for first_rank in first_rank_sort_list] - - def set_p2p_groups(self): - self.p2p_link = sorted(self.p2p_link, key=lambda x: min(x)) - while self.p2p_link: - union_set = deepcopy(self.p2p_link[0]) - rm_list = [self.p2p_link[0]] - for idx, link_rank_set_x in enumerate(self.p2p_link[1:]): - if UnionFind.is_connected(link_rank_set_x, union_set): - union_set = union_set.union(link_rank_set_x) - rm_list.append(link_rank_set_x) - self.p2p_comm_group.append(union_set) - self.p2p_link = [element for element in self.p2p_link if element not in rm_list] - - def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict): - ops = rank_id_matrix_dict.get(step_id, {}) - self.add_matrix_ops(rank_id, step_id, ops) - if not ops: - print(f"[WARNING] rank{rank_id} {step_id} do not have communication matrix ops data.") - return - p2p_ops = ops.get(Constant.P2P, {}) - for op_name, link_dict in p2p_ops.items(): - self.append_p2p_link(op_name, link_dict) - - def append_p2p_link(self, op_name, link_dict): - for link in link_dict: - if '-' not in link: - print(f"[WARNING] {op_name} has an invalid link key {link}!") - break - src_rank = int(link.split('-')[0]) - dst_rank = int(link.split('-')[1]) - if src_rank != dst_rank: - rank_set = set([src_rank, dst_rank]) - if rank_set in self.p2p_link: - continue - self.p2p_link.append(rank_set) - - def get_collective_ops_name(self, rank_id: int, 
comm_op_dict: dict): - for comm_op in comm_op_dict: - if comm_op.startswith('Total'): - continue - group_name = comm_op.split('@')[-1] - self.collective_group_dict[group_name].add(rank_id) - - def add_communication_ops(self, rank_id: str, step_id: str, comm_op_type: str, comm_op_dict: dict): - for comm_op in comm_op_dict: - if comm_op.startswith('Total'): - continue - group_name = comm_op.split('@')[-1] - self.communication_ops.append({ - Constant.RANK_ID: rank_id, - Constant.STEP_ID: step_id, - Constant.COMM_OP_TYPE: comm_op_type, - Constant.COMM_OP_NAME: comm_op, - Constant.GROUP_NAME: group_name, - Constant.COMM_OP_INFO: comm_op_dict.get(comm_op) - }) - - def add_matrix_ops(self, rank_id: int, step_id: str, step_id_dict: dict): - for comm_op_type, comm_dict in step_id_dict.items(): - if comm_op_type != Constant.COLLECTIVE and comm_op_type != Constant.P2P: - print(f"[WARNING] Unknown communication operators type!") - continue - for op_name, op_link_info in comm_dict.items(): - if op_name.startswith('Total'): - continue - group_name = op_name.split('@')[-1] - self.matrix_ops.append({ - Constant.RANK_ID: rank_id, - Constant.STEP_ID: step_id, - Constant.COMM_OP_TYPE: comm_op_type, - Constant.COMM_OP_NAME: op_name, - Constant.GROUP_NAME: group_name, - Constant.COMM_OP_INFO: op_link_info - }) - - -class UnionFind(object): - """Disjoint Set Union""" - @classmethod - def union(cls, p: set, q: set, o: set): - """make p and q the same set""" - return p | q | o - - @classmethod - def is_connected(cls, p: set, q: set): - """ - check whether set p and set q are connected - """ - if p & q: - return True - else: - return False + return self.processor.generate() diff --git a/profiler/cluster_analyse/communication_group/communication_json_group.py b/profiler/cluster_analyse/communication_group/communication_json_group.py new file mode 100644 index 0000000000000000000000000000000000000000..f6e01e3abfde4d8f180043a5bf9a50c6b5a4964c --- /dev/null +++ b/profiler/cluster_analyse/communication_group/communication_json_group.py @@ -0,0 +1,44 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
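+# JSON flavour of the communication group builder: reads each rank's communication(.json) and communication_matrix(.json) files.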
+
+import os
+
+from common_func.constant import Constant
+from common_func.file_manager import FileManager
+from communication_group.base_communication_group import BaseCommunicationGroup
+
+
+class CommunicationJsonGroup(BaseCommunicationGroup):
+    COMMUNICATION_GROUP_JSON = "communication_group.json"
+
+    def __init__(self, params: dict):
+        super().__init__(params)
+
+    def dump_data(self):
+        FileManager.create_json_file(self.collection_path, self.communication_group, self.COMMUNICATION_GROUP_JSON)
+
+    def read_communication_func(self: any, params: tuple):
+        if len(params) < 3:
+            return -1, {}, {}
+        rank_id = params[0]
+        comm_json_path = params[1]
+        matrix_json_path = params[2]
+        comm_data = {}
+        matrix_data = {}
+        if os.path.exists(comm_json_path) and self.analysis_mode in ["all", "communication_time"]:
+            comm_data = FileManager.read_json_file(comm_json_path)
+        if os.path.exists(matrix_json_path) and self.analysis_mode in ["all", "communication_matrix"]:
+            matrix_data = FileManager.read_json_file(matrix_json_path)
+        return rank_id, comm_data, matrix_data
diff --git a/profiler/compare_tools/README.md b/profiler/compare_tools/README.md
index 17d26d07e2074b4b50ddb2c27371770bc92da144..11ae2cee83429983dd5aaaf02a7baf3bb55de553 100644
--- a/profiler/compare_tools/README.md
+++ b/profiler/compare_tools/README.md
@@ -6,13 +6,15 @@ compare_tools (the performance comparison tool) supports comparison between GPU and NPU as well as between NPU
 
 Scenario 1: A PyTorch training project degrades in performance after migrating from GPU to NPU; the tool identifies the degradation points.
 
-Scenario 2: A PyTorch training project on NPU shows a performance gap between different versions; the tool locates the specific differences.
+Scenario 2: A PyTorch or MindSpore training project on NPU shows a performance gap between different versions; the tool locates the specific differences.
+
+Scenario 3: A PyTorch training project degrades in performance after migrating from GPU to MindSpore on NPU; the tool identifies the degradation points.
 
 ## Usage Guide
 
 ### Environment Dependencies
 
-Packages required before using this tool: prettytable, xlsxwriter, pandas, numpy
+Packages required before using this tool:
 
 ```bash
 pip3 install prettytable
@@ -21,9 +23,9 @@ pip3 install pandas
 pip3 install numpy
 ```
 
-### Collecting Performance Data
+### Collecting Performance Data (PyTorch)
 
-Before using this tool, collect GPU or NPU performance data, then run the comparison analysis.
+Before using this tool, collect GPU or NPU performance data (collecting only a single step is recommended), then run the comparison analysis.
 
 #### Collecting GPU Performance Data
 
@@ -58,7 +60,7 @@ for step in range(step_number):
     prof.stop()
 ```
 
-The pytorch profiler data directory structure is as follows:
+The PyTorch Profiler output directory structure is as follows:
 
 ```Python
 |- pytorch_profiling
@@ -66,13 +68,12 @@ The pytorch profiler data directory structure is as follows:
     |- *.pt.trace.json
 ```
 
 #### Collecting NPU Performance Data
-Collect NPU performance data with the Ascend PyTorch Profiler tool (the counterpart of the PyTorch Profiler); the collection parameters are the same as on GPU. See [Profiling data collection](https://gitee.com/ascend/att/tree/master/profiler).
-Replace torch.profiler with torch_npu.profiler in the GPU collection code.
+Collect NPU performance data with the Ascend PyTorch Profiler tool. The collection parameters are largely the same as on GPU: simply replace torch.profiler with torch_npu.profiler in the GPU collection code. See [Profiling data collection](https://gitee.com/ascend/att/tree/master/profiler).
 
-The ascend pytorch profiler data directory structure is as follows:
+The Ascend PyTorch Profiler output directory structure is as follows:
 
-```
+```bash
 |- ascend_pytorch_profiling
@@ -82,33 +83,77 @@ The ascend pytorch profiler data directory structure is as follows:
     |- * _ascend_pt
         |- ASCEND_PROFILER_OUTPUT
             |- kernel_details.csv
            |- trace_view.json
     |- * _ascend_pt
 ```
 
+### Collecting Performance Data (MindSpore)
+
+#### Collecting NPU Performance Data
+
+Currently the MindSpore scenario only supports comparing NPU performance data against PyTorch GPU performance data, and comparing performance data between different versions of a MindSpore training project on NPU.
+
+Collect NPU performance data with the MindSpore profiling tool; collecting or parsing only a single step is recommended. See [Performance Profiling (Ascend)](https://www.mindspore.cn/mindinsight/docs/zh-CN/r2.3/performance_profiling_ascend.html).
+
+The MindSpore profiling tool output directory structure is as follows:
+
+```
+|- profiler/{rank-*}_{timestamps}_ascend_ms
+    |- ASCEND_PROFILER_OUTPUT
+        |- kernel_details.csv
+        |- trace_view.json
+```
+
+For comparison, the MindSpore performance data must be specified down to the `profiler/{rank-*}_{timestamps}_ascend_ms` or `ASCEND_PROFILER_OUTPUT` level.
+
 ### Comparing Performance Data
 
-Clone the att repository locally and run the following commands:
+The comparison tool breaks overall performance down into training time and memory usage; training time is further split into three dimensions: operators (operators plus nn.Module), communication, and scheduling. Overall metrics are printed to the screen to help users narrow down the direction of the degradation. The tool also generates performance_comparison_result_*.xlsx, which shows for each operator whether execution time, communication time and memory usage improved or degraded; degraded operators can be filtered out via DIFF columns greater than 0. See "**Comparison Results**" for details.
+
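+As a quick way to consume that workbook, degraded rows can be pulled out with pandas. A minimal sketch (the file name, sheet name and "Diff Duration(ms)" column here are illustrative; match them to the actual output described under "**Comparison Results**"):
+
+```python
+import pandas as pd
+
+# Load one statistics sheet from the generated workbook (names are examples).
+df = pd.read_excel("performance_comparison_result_20240101000000.xlsx",
+                   sheet_name="OperatorCompareStatistic")
+
+# A positive diff means the operator is slower than the baseline.
+degraded = df[df["Diff Duration(ms)"] > 0].sort_values("Diff Duration(ms)", ascending=False)
+print(degraded.head(10))
+```
+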
+
+The comparison tool can be run from the **command line** or as a **script**; both modes support the **general parameters** and the **operator-comparison-specific parameters**.
+
+#### Command Line
+
+1. Install the tool as described in [Performance Tools](../README.md).
+
+2. Run the following command to compare performance data:
+
+   ```
+   msprof-analyze compare -d [path to the comparison performance data] -bp [path to the baseline performance data] --output_path=[path for the comparison result]
+   ```
+
+   - -d (required): path to the comparison performance data. It can be a directory ending with "ascend_pt", an ASCEND_PROFILER_OUTPUT directory, or a trace_view.json file; with trace_view.json, operator memory usage cannot be shown.
+   - -bp (required): path to the baseline performance data. With a GPU baseline, point to the json file ending with ".pt.trace"; with another NPU version as the baseline, specify it the same way as -d.
+   - --output_path (optional): path where the comparison result is stored; defaults to the current directory.
+
+#### Script
+
+Clone the att repository locally and run the following commands:
 
 ```bash
 # enter the compare_tools directory in the att repository
 cd att/profiler/compare_tools
 # run the simplest comparison command
-python performance_compare.py [baseline performance data] [comparison performance data] --output_path=./result_dir
+python performance_compare.py [path to the baseline performance data] [path to the comparison performance data] --output_path=[path for the comparison result]
 ```
 
-- baseline performance data (required): with a GPU baseline, point to the json file ending with ".pt.trace"; with another NPU version as the baseline, specify the file as described for **comparison performance data**.
-- comparison performance data (required): a directory ending with "ascend_pt", an ASCEND_PROFILER_OUTPUT directory, or a trace_view.json file; with trace_view.json, operator memory usage cannot be shown.
+- path to the baseline performance data (required): with a GPU baseline, point to the json file ending with ".pt.trace"; with another NPU version as the baseline, specify it as described for **path to the comparison performance data**.
+- path to the comparison performance data (required): a directory ending with "ascend_pt", an ASCEND_PROFILER_OUTPUT directory, or a trace_view.json file; with trace_view.json, operator memory usage cannot be shown.
 - --output_path (optional): path where the comparison result is stored; defaults to the current directory.
 
-The tool breaks overall performance into training time and memory usage; training time splits into operator, communication and scheduling dimensions. Overall metrics are printed to the screen to help users narrow down the degradation. The tool also generates performance_comparison_result_*.xlsx, showing for each operator whether execution time, communication time and memory usage improved or degraded; degraded operators can be filtered via DIFF columns greater than 0. See "**Comparison Results**" for details.
-
 #### General Parameters
 
-| Parameter | Description | Required |
-|--------------------------------|-----------|------|
-| --enable_profiling_compare | Enable overall performance comparison. | No |
-| --enable_operator_compare | Enable operator performance comparison. | No |
-| --enable_communication_compare | Enable communication performance comparison. | No |
-| --enable_memory_compare | Enable operator memory comparison. | No |
+| Parameter | Description | Required |
+| ------------------------------ | ------------------------------------------------------------ | -------- |
+| --enable_profiling_compare | Enable overall performance comparison. | No |
+| --enable_operator_compare | Enable operator performance comparison. Not yet supported for MindSpore. This switch is time-consuming; collecting only one step of data is recommended. | No |
+| --enable_communication_compare | Enable communication performance comparison. | No |
+| --enable_memory_compare | Enable operator memory comparison. Not yet supported for MindSpore. This switch is time-consuming; collecting only one step of data is recommended. | No |
+| --disable_details | Hide the detailed comparison and only run the statistics-level comparison. | No |
 
-Note: when none of the four switches above is set, **the tool enables all comparisons by default**; when any switch is set, only the selected comparisons are run. Example:
+Note: when none of the switches above is set, **the tool enables all comparisons by default**; when any switch is set, only the selected comparisons are run. Example:
+
+```bash
+msprof-analyze compare -d [path to the comparison performance data] -bp [path to the baseline performance data] --output_path=./result_dir --enable_profiling_compare
+```
+
+or
 
 ```bash
 python performance_compare.py [baseline performance data] [comparison performance data] --output_path=./result_dir --enable_profiling_compare
@@ -120,30 +165,36 @@ python performance_compare.py [baseline performance data] [comparison performance data] --output_path=./result_dir --enable_profiling_compare
 
 | Parameter | Description | Required |
 | ----------------- | ------------------------------------------------------------ | -------- |
-| --gpu_flow_cat | Sets the flow-event category linking CPU-side operators to device kernels in the GPU trace; set it when all GPU kernels are empty. Configure it from the Flow events options of the timeline json in chrome://tracing. Example: --gpu_flow_cat=async_gpu | No |
+| --gpu_flow_cat | Sets the flow-event category linking CPU-side operators to device kernels in the GPU trace; set it when Device Duration(us) is 0 for all GPU rows. Open the GPU json in chrome://tracing, find the link category under Flow events in the upper right corner, and pass it to this parameter. Example: --gpu_flow_cat=async_gpu | No |
 | --use_input_shape | Enables exact operator matching; off by default. Example: --use_input_shape | No |
-| --max_kernel_num | Sets the maximum number of kernels a CPU-side operator may dispatch; above the limit the tool descends into child operators until the condition is met. By default only top-level operators are compared. Example: --max_kernel_num=10 | No |
-| --op_name_map | Mapping between equivalent GPU and NPU operator names, given as a dictionary. Example: --op_name_map='{"Optimizer.step#SGD.step":"Optimizer.step#NpuFusedSGD.step"}' | No |
+| --max_kernel_num | Sets the maximum number of kernels a CPU-side operator may dispatch; above the limit the tool descends into child operators until the condition is met. By default only top-level operators are compared, which is coarse; for a finer-grained comparison set this parameter (no smaller than 4; the smaller the value, the finer the granularity). Example: --max_kernel_num=10 | No |
+| --op_name_map | Mapping between equivalent GPU and NPU operator names, given as a dictionary. Example: --op_name_map={'Optimizer.step#SGD.step':'Optimizer.step#NpuFusedSGD.step'} | No |
 
 ## Comparison Results
 
+Only **overall performance** and **communication performance** comparison are supported in the MindSpore scenario.
+
 ### Overall Performance
 
 The overall performance comparison result is printed to the screen.
 
-| Field | Description |
-| ------------------------------- | ------------------------------------------------------------ |
-| Cube Time(Num) | Total time of Cube operators; Num is the number of calls. |
-| Vector Time(Num) | Total time of Vector operators; Num is the number of calls. |
-| Other Time | Time of other non-cube, non-vector operators such as AI CPU and DSA. |
-| Flash Attention Time(Forward) | Forward time of Flash Attention operators. |
-| Flash Attention Time(Backward) | Backward time of Flash Attention operators. |
-| Computing Time | Compute-stream time, the sum of all events on the compute stream. With concurrent computation, overlapping parts are counted only once. |
-| Mem Usage | Memory usage. GPU memory can be checked with nvidia-smi and NPU memory with npu-smi; with profile_memory=True enabled during collection, mem usage shows the maximum reserved value in memory_record, generally process-level memory. |
-| Uncovered Communication Time | Communication time not overlapped by computation. |
-| SDMA Time(Num) | Time of copy tasks; Num is the number of calls. |
-| Free Time | Scheduling time = E2E time - operator time - un-overlapped communication time. Free is device-side time spent neither communicating nor computing, so it includes copy time (SDMA Time). |
-| E2E Time(Not minimal profiling) | Total E2E time, the end-to-end time of the compute stream. "Not minimal profiling" indicates the time is inflated, which affects communication and scheduling time. |
+| Field | Description |
+| --------------------------------------- | ------------------------------------------------------------ |
+| Cube Time(Num) | Total time of Cube operators; Num is the number of calls. |
+| Vector Time(Num) | Total time of Vector operators; Num is the number of calls. |
+| Conv Time(Forward)(Num) | Forward time of conv operators; Num is the number of calls. |
+| Conv Time(Backward)(Num) | Backward time of conv operators; Num is the number of calls. |
+| Flash Attention Time(Forward)(Num) | Forward time of Flash Attention operators; Num is the number of calls. |
+| Flash Attention Time(Backward)(Num) | Backward time of Flash Attention operators; Num is the number of calls. |
+| Paged Attention Time(Num) | Time of Paged Attention operators; Num is the number of calls. |
+| Lccl Time(Num) | Time of Lccl operators; Num is the number of calls. |
+| Computing Time | Compute-stream time, the sum of all events on the compute stream. With concurrent computation, overlapping parts are counted only once. |
+| Mem Usage | Memory usage. GPU memory can be checked with nvidia-smi and NPU memory with npu-smi; with profile_memory=True enabled during collection, mem usage shows the maximum reserved value in memory_record, generally process-level memory. |
+| Uncovered Communication Time(Wait Time) | Un-overlapped communication time, including Wait Time (synchronization time; present only when the collection level is L1 or above and NPU data is collected). |
+| SDMA Time(Num) | Time of copy tasks; Num is the number of calls. |
+| Free Time | Scheduling time = E2E time - operator time - un-overlapped communication time. Free is device-side time spent neither communicating nor computing, so it includes copy time (SDMA Time). |
+| E2E Time(Not minimal profiling) | Total E2E time, the end-to-end time of the compute stream. "Not minimal profiling" indicates the time is inflated, which affects communication and scheduling time. |
+| Other Time | Time of other operators such as AI CPU, DSA and TensorMove. |
 
 Minimal-profiling collection can reduce the inflation of the E2E time; sample code:
 
@@ -160,40 +211,92 @@ with torch_npu.profiler.profile(
 
 The activities setting collects NPU data only; do not set the experimental_config parameter or other optional switches.
 
+- If Computing Time increases, analyze **operator performance**.
+- If Uncovered Communication Time increases, analyze **communication performance**; if no degraded communication operator is found there, communication and computation overlap poorly, so continue with NPU cluster performance analysis.
+- If Mem Usage increases, analyze **operator memory**; if no operator clearly dominates, operator memory allocation does not differ and the problem lies in memory release (memory held for too long); continue NPU memory analysis with tensorboard or ascend insight.
+
 ### Operator Performance
 
-The operator comparison results are shown in the OperatorCompare and OperatorCompare(TOP) sheets of performance_comparison_result_*.xlsx.
+Not yet supported in the MindSpore scenario.
 
-- OperatorCompare(TOP): statistics at operator granularity, sorted descending by the gap between the operator's total device time and the baseline (Diff Duration(ms) column).
+#### Comparison Data without Python Function
+
+The operator comparison results are shown in the OperatorCompare and OperatorCompareStatistic sheets of performance_comparison_result_*.xlsx.
+
+- OperatorCompareStatistic: statistics at operator granularity, sorted descending by the gap between the operator's total device time and the baseline (Diff Duration(ms) column).
 - OperatorCompare: detailed operator comparison; the kernels of each operator can be inspected.
 - Diff Ratio: the operator's total device execution time / the baseline operator's total device execution time; red indicates degradation.
+- Device Duration(us): the total time of all kernels dispatched to the device by this operator.
 
-#### Device Duration(us)
+Step 1: In the OperatorCompareStatistic sheet, find the operators with the largest time gaps.
+Step 2: In the OperatorCompare sheet, search for those operators and inspect the individual kernel times to find optimization opportunities.
 
-```
-The total time of all kernels dispatched to the device by this operator
-```
+#### Comparison Data with Python Function
+
+#### 比对数据有Python Function
+
+算子性能比对结果在performance_comparison_result_*.xlsx中ModuleCompareStatistic、ModuleCompare的sheet页呈现。
+
+当用户采集时开启with_stack开关,会上报python function事件;当比对双方的数据都存在python function事件时,可进行模块级别的比对。
+
+ModuleCompareStatistic:模块级别比对的统计呈现,字段说明如下:
+
+- Module Class:Module名,如nn.Module: Linear。
+- Module Level:Module的层级。
+- Module Name:Module唯一标识名,如/DynamicNet_0/Linear_0。
+- Operator Name:框架侧算子名,如aten::add。字段为[ TOTAL ]代表该module的总体情况。
+- Kernel Detail:算子详细信息。
+- Device Self Time(ms):该模块调用的算子(排除子模块)在device侧执行的总耗时,单位ms。
+- Number:该Module或算子被调用的次数。
+- Device Total Time(ms):该模块调用的算子(包含子模块)在device侧执行的总耗时,单位ms。
+- Device Total Time Diff(ms):GPU与NPU的Device Total Time(ms)差值。
+- Device Self Time Diff(ms):GPU与NPU的Device Self Time(ms)差值。
+- Total Time Ratio:GPU与NPU的Device Total Time(ms)比值。
+- Base Call Stack:基准文件模块的调用栈。
+- Comparison Call Stack:比较文件模块的调用栈。
+
+ModuleCompare:模块及模块下算子比对的明细展示,可以查看每一个算子对应的kernel详情。字段说明如下:
+
+- Module Class:Module名,如nn.Module: Linear。
+- Module Level:Module的层级。
+- Module Name:Module唯一标识名,如/DynamicNet_0/Linear_0。
+- Operator Name:框架侧算子名,如aten::add。字段为[ TOTAL ]代表该module的总体情况。
+- Kernel Detail:算子详细信息。
+- Device Self Time(us):该模块调用的算子(排除子模块)在device侧执行的总耗时,单位us。
+- Device Total Time(us):该模块调用的算子(包含子模块)在device侧执行的总耗时,单位us。
+- Device Total Time Diff(us):GPU与NPU的Device Total Time(us)差值。
+- Device Self Time Diff(us):GPU与NPU的Device Self Time(us)差值。
+- Total Time Ratio:GPU与NPU的Device Total Time(us)比值。
+- Base Call Stack:有劣化的模块或算子,基准文件模块的调用栈。
+- Comparison Call Stack:有劣化的模块或算子,比较文件模块的调用栈。
+
+步骤1:查看ModuleCompareStatistic页,找出耗时差距TOP的模块。
+
+  筛选Operator Name字段为[ TOTAL ],将模块总体情况按照Device Self Time(ms)字段逆序排列,可识别出耗时差距TOP的模块。
+
+  若要恢复原始顺序,可按照Order Id字段升序排列。
+
+步骤2:查看ModuleCompare页,查找耗时差距TOP模块下的劣化算子。
+
+步骤3:通过调用栈找到对应的代码行。
 
 ### 通信性能
 
-通信性能比对结果在performance_comparison_result_*.xlsl中CommunicationCompare的sheet页呈现。
+通信性能比对结果在performance_comparison_result_*.xlsx中CommunicationCompare的sheet页呈现。
 
-- 淡蓝色背景的记录行:通信算子的summary信息,包括通信算子名称、调用总次数、通信算子总耗时(单位:us)、通信算子平均耗时(单位:us)、通信算子最大耗时(单位:us)、通信算子最小耗时(单位:us)。
+- 第二行表头:通信算子的summary信息,包括通信算子名称、调用总次数、通信算子总耗时(单位:us)、通信算子平均耗时(单位:us)、通信算子最大耗时(单位:us)、通信算子最小耗时(单位:us)。
 - 无背景色的记录行:通信算子的detail信息,仅支持NPU,包含了该通信算子下的所有Task信息,包括Task名称、Task调用次数、Task总耗时(单位:us)、Task平均耗时(单位:us)、Task最大耗时(单位:us)、Task最小耗时(单位:us)。
 - Diff Ratio: 比较通信算子的总耗时 / 基准通信算子的总耗时,红色代表劣化。
 
 ### 算子内存
 
-算子内存比对结果在performance_comparison_result_*.xlsl中MemoryCompare和MemoryCompare(TOP)的sheet页呈现。
+MindSpore场景暂不支持。
 
-- MemoryCompare(TOP):算子为粒度的统计呈现,按照算子占用的总内存与基准算子的差距值(Diff Memory(MB))进行逆序。
+算子内存比对结果在performance_comparison_result_*.xlsx中MemoryCompare和MemoryCompareStatistic的sheet页呈现。
+
+- MemoryCompareStatistic:算子为粒度的统计呈现,按照算子占用的总内存与基准算子的差距值(Diff Memory(MB))进行逆序。
 - MemoryCompare:算子内存比对的明细展示,可以查看每一个算子申请内存的详情。
 - Diff Ratio: 比较算子占用的总内存 / 基准算子占用的总内存,红色代表劣化。
 
-#### Size(KB)
+- Size(KB):该算子占用的device内存大小,单位KB。
 
-```
-该算子占用的device内存大小,单位KB
-```
\ No newline at end of file
+步骤1:查看MemoryCompareStatistic页,找出内存占用差距TOP的算子。
+步骤2:查看MemoryCompare页,搜索内存占用差距TOP的算子,查看具体占用的子算子。
diff --git a/profiler/compare_tools/compare_backend/comparator/module_comparetor.py b/profiler/compare_tools/compare_backend/comparator/module_comparetor.py
new file mode 100644
index 0000000000000000000000000000000000000000..49c50b53c5a1b00bd17b7281d80b61d5011cb59a
--- /dev/null
+++ b/profiler/compare_tools/compare_backend/comparator/module_comparetor.py
@@ -0,0 +1,36 @@
+from compare_backend.comparator.base_comparator import BaseComparator
+from compare_backend.utils.common_func import update_order_id
+from compare_backend.utils.constant import Constant
+
+
+class ModuleComparator(BaseComparator):
+    def __init__(self, origin_data: any, bean: any): +
super().__init__(origin_data, bean) + + def _compare(self): + if not self._origin_data: + return + base_all_data = [data for data in self._origin_data if data[0]] # index 0 for base module + base_all_data.sort(key=lambda x: x[0].start_time) + base_none_data = [data for data in self._origin_data if not data[0]] # index 0 for base module + base_none_data.sort(key=lambda x: x[1].start_time) + index = 0 + for base_module, comparison_module in base_all_data: + if not comparison_module: + self._rows.extend(self._bean(base_module, comparison_module).rows) + continue + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + if module.start_time < comparison_module.start_time: + self._rows.extend(self._bean(None, module).rows) + index += 1 + else: + break + self._rows.extend(self._bean(base_module, comparison_module).rows) + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + self._rows.extend(self._bean(None, module).rows) + index += 1 + update_order_id(self._rows) + if not any(row[-1] != Constant.NA for row in self._rows): + print(f"[WARNING] If you want to see the operator's call stack, you must enable with_stack switch.") diff --git a/profiler/compare_tools/compare_backend/comparator/module_statistic_comparator.py b/profiler/compare_tools/compare_backend/comparator/module_statistic_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..e09108f3cbe3744068daf6c5316dc318aea53177 --- /dev/null +++ b/profiler/compare_tools/compare_backend/comparator/module_statistic_comparator.py @@ -0,0 +1,45 @@ +from collections import OrderedDict + +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.common_func import update_order_id + + +class ModuleStatisticComparator(BaseComparator): + def __init__(self, origin_data: list, bean: any): + super().__init__(origin_data, bean) + + def _compare(self): + if not self._origin_data: + return + base_module_dict, comparison_module_dict = self._group_by_module_name() + for module_name, base_data in base_module_dict.items(): + comparison_data = comparison_module_dict.pop(module_name, []) + self._rows.extend(self._bean(module_name, base_data, comparison_data).rows) + for module_name, comparison_data in comparison_module_dict.items(): + self._rows.extend(self._bean(module_name, [], comparison_data).rows) + update_order_id(self._rows) + + def _group_by_module_name(self): + base_module_dict, comparison_module_dict = OrderedDict(), OrderedDict() + base_all_data = [data for data in self._origin_data if data[0]] # index 0 for base module + base_all_data.sort(key=lambda x: x[0].start_time) + base_none_data = [data for data in self._origin_data if not data[0]] # index 0 for base module + base_none_data.sort(key=lambda x: x[1].start_time) + index = 0 + for base_module, comparison_module in base_all_data: + base_module_dict.setdefault(base_module.module_name, []).append(base_module) + if not comparison_module: + continue + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + if module.start_time < comparison_module.start_time: + comparison_module_dict.setdefault(module.module_name, []).append(module) + index += 1 + else: + break + comparison_module_dict.setdefault(comparison_module.module_name, []).append(comparison_module) + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + 
comparison_module_dict.setdefault(module.module_name, []).append(module) + index += 1 + return base_module_dict, comparison_module_dict diff --git a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py index bfc631c66c86f061b10445e117e9f947d7ebdbc5..7283c17b47dea78058d0541c1332df0fa45e90d9 100644 --- a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py +++ b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py @@ -18,10 +18,14 @@ class OverallPerformanceComparator(BaseComparator): f'{base_profiling_info.vec_time:.3f}s({base_profiling_info.vec_num})']) comp_col.extend([f'{comp_profiling_info.cube_time:.3f}s({comp_profiling_info.cube_num})', f'{comp_profiling_info.vec_time:.3f}s({comp_profiling_info.vec_num})']) - if base_profiling_info.other_time or comp_profiling_info.other_time: - self._headers.append('Other Time') - base_col.append(f'{base_profiling_info.other_time:.3f}s') - comp_col.append(f'{comp_profiling_info.other_time:.3f}s') + if base_profiling_info.conv_time_fwd or comp_profiling_info.conv_time_fwd: + self._headers.append('Conv Time(Forward)(Num)') + base_col.append(f'{base_profiling_info.conv_time_fwd:.3f}s({base_profiling_info.conv_num_fwd})') + comp_col.append(f'{comp_profiling_info.conv_time_fwd:.3f}s({comp_profiling_info.conv_num_fwd})') + if base_profiling_info.conv_time_bwd or comp_profiling_info.conv_time_bwd: + self._headers.append('Conv Time(Backward)(Num)') + base_col.append(f'{base_profiling_info.conv_time_bwd:.3f}s({base_profiling_info.conv_num_bwd})') + comp_col.append(f'{comp_profiling_info.conv_time_bwd:.3f}s({comp_profiling_info.conv_num_bwd})') if base_profiling_info.fa_time_fwd or comp_profiling_info.fa_time_fwd: self._headers.append('Flash Attention Time(Forward)(Num)') base_col.append(f'{base_profiling_info.fa_time_fwd:.3f}s({base_profiling_info.fa_num_fwd})') @@ -30,6 +34,18 @@ class OverallPerformanceComparator(BaseComparator): self._headers.append('Flash Attention Time(Backward)(Num)') base_col.append(f'{base_profiling_info.fa_time_bwd:.3f}s({base_profiling_info.fa_num_bwd})') comp_col.append(f'{comp_profiling_info.fa_time_bwd:.3f}s({comp_profiling_info.fa_num_bwd})') + if base_profiling_info.pa_time or comp_profiling_info.pa_time: + self._headers.append('Paged Attention Time(Num)') + base_col.append(f'{base_profiling_info.pa_time:.3f}s({base_profiling_info.pa_num})') + comp_col.append(f'{comp_profiling_info.pa_time:.3f}s({comp_profiling_info.pa_num})') + if base_profiling_info.lccl_time or comp_profiling_info.lccl_time: + self._headers.append('Lccl Time(Num)') + base_col.append(f'{base_profiling_info.lccl_time:.3f}s({base_profiling_info.lccl_num})') + comp_col.append(f'{comp_profiling_info.lccl_time:.3f}s({comp_profiling_info.lccl_num})') + if base_profiling_info.other_time or comp_profiling_info.other_time: + self._headers.append('Other Time') + base_col.append(f'{base_profiling_info.other_time:.3f}s') + comp_col.append(f'{comp_profiling_info.other_time:.3f}s') self._headers.extend(['Computing Time']) base_col.extend([f'{base_profiling_info.compute_time:.3f}s']) comp_col.extend([f'{comp_profiling_info.compute_time:.3f}s']) @@ -37,9 +53,17 @@ class OverallPerformanceComparator(BaseComparator): self._headers.append('Mem Usage') base_col.append(f'{base_profiling_info.memory_used:.2f}G') comp_col.append(f'{comp_profiling_info.memory_used:.2f}G') - self._headers.extend(['Uncovered 
Communication Time']) - base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s']) - comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s']) + self._headers.extend(['Uncovered Communication Time(Wait Time)']) + if base_profiling_info.wait_time: + base_col.extend( + [f'{base_profiling_info.communication_not_overlapped: .3f}s({base_profiling_info.wait_time:.3f}s)']) + else: + base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s( / )']) + if comp_profiling_info.is_level0: + comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s( / )']) + else: + comp_col.extend( + [f'{comp_profiling_info.communication_not_overlapped: .3f}s({comp_profiling_info.wait_time:.3f}s)']) if base_profiling_info.sdma_time or comp_profiling_info.sdma_time: self._headers.append('SDMA Time(Num)') base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})') diff --git a/profiler/compare_tools/compare_backend/compare_bean/module_compare_bean.py b/profiler/compare_tools/compare_backend/compare_bean/module_compare_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..abfce00d83d6c1a914aa71481277e2dc1c195f17 --- /dev/null +++ b/profiler/compare_tools/compare_backend/compare_bean/module_compare_bean.py @@ -0,0 +1,83 @@ +from compare_backend.utils.common_func import longest_common_subsequence_matching, calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig +from compare_backend.utils.module_node import ModuleNode +from compare_backend.utils.name_function import NameFunction +from compare_backend.utils.torch_op_node import TorchOpNode + + +class ModuleCompareBean: + TABLE_NAME = Constant.MODULE_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, base_module: ModuleNode, comparison_module: ModuleNode): + self._base_module = ModuleInfo(base_module) + self._comparison_module = ModuleInfo(comparison_module) + self.module_class = self._base_module.module_class if base_module else self._comparison_module.module_class + self.module_level = self._base_module.module_level if base_module else self._comparison_module.module_level + self.module_name = self._base_module.module_name if base_module else self._comparison_module.module_name + + @property + def rows(self): + return [self.get_total_row(), *self.get_detail_rows()] + + def get_total_row(self): + total_diff, total_ratio = calculate_diff_ratio(self._base_module.device_total_time, + self._comparison_module.device_total_time) + self_diff, _ = calculate_diff_ratio(self._base_module.device_self_time, + self._comparison_module.device_self_time) + return [None, self.module_class, self.module_level, self.module_name, "TOTAL", None, + self._base_module.device_self_time, self._base_module.device_total_time, "TOTAL", None, + self._comparison_module.device_self_time, self._comparison_module.device_total_time, total_diff, + self_diff, total_ratio, self._base_module.call_stack, self._comparison_module.call_stack] + + def get_detail_rows(self): + rows = [] + matched_ops = longest_common_subsequence_matching(self._base_module.top_layer_ops, + self._comparison_module.top_layer_ops, NameFunction.get_name) + for base_op, comparison_op in matched_ops: + base_op = OpInfo(base_op) + comparison_op = OpInfo(comparison_op) + self_diff, self_ratio = calculate_diff_ratio(base_op.device_self_time, 
comparison_op.device_self_time) + base_call_stack = base_op.call_stack if self_diff > 0 else None + comparison_call_stack = comparison_op.call_stack if self_diff > 0 else None + rows.append( + [None, self.module_class, self.module_level, self.module_name, base_op.operator_name, + base_op.kernel_details, base_op.device_self_time, None, comparison_op.operator_name, + comparison_op.kernel_details, comparison_op.device_self_time, None, None, self_diff, self_ratio, + base_call_stack, comparison_call_stack]) + return rows + + +class ModuleInfo: + def __init__(self, module: ModuleNode): + self.module_class = "" + self.module_level = "" + self.module_name = "" + self.device_self_time = 0 + self.device_total_time = 0 + self.top_layer_ops = [] + self.call_stack = "" + if module: + self.module_class = module.module_class + self.module_level = module.module_level + self.module_name = module.module_name.replace("nn.Module:", "") + self.device_self_time = module.device_self_dur + self.device_total_time = module.device_total_dur + self.top_layer_ops = module.toy_layer_api_list + self.call_stack = module.call_stack + + +class OpInfo: + def __init__(self, operator: TorchOpNode): + self.operator_name = "" + self.kernel_details = "" + self.device_self_time = 0 + self.call_stack = "" + if operator: + self.operator_name = operator.name + for kernel in operator.kernel_list: + self.device_self_time += kernel.device_dur + self.kernel_details += kernel.kernel_details + self.call_stack = operator.call_stack diff --git a/profiler/compare_tools/compare_backend/compare_bean/module_statistic_bean.py b/profiler/compare_tools/compare_backend/compare_bean/module_statistic_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..97fc98bdd354e1ebe1fbb3fc44def4eaf3059235 --- /dev/null +++ b/profiler/compare_tools/compare_backend/compare_bean/module_statistic_bean.py @@ -0,0 +1,98 @@ +import re + +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig + + +class ModuleStatisticBean: + TABLE_NAME = Constant.MODULE_TOP_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, name: str, base_data: list, comparison_data: list): + self._module_name = name.replace("nn.Module:", "") + pattern = re.compile('_[0-9]+$') + self._module_class = pattern.sub('', name.split("/")[-1]) + self._module_level = name.count("/") + self._base_info = ModuleStatisticInfo(base_data) + self._comparison_info = ModuleStatisticInfo(comparison_data) + + @property + def rows(self): + rows = [self.get_total_row()] + rows.extend(self.get_detail_rows()) + return rows + + @staticmethod + def _get_kernel_detail_rows(base_kernel_dict, com_kernel_dict): + base_kernel_detals = "" + com_kernel_details = "" + for kernel_name, base_dur_list in base_kernel_dict.items(): + base_dur = "%.3f" % sum(base_dur_list) + base_kernel_detals += f"{kernel_name}, [number: {len(base_dur_list)}], [duration_ms: {base_dur}]\n" + for kernel_name, com_dur_list in com_kernel_dict.items(): + com_dur = "%.3f" % sum(com_dur_list) + com_kernel_details += f"{kernel_name}, [number: {len(com_dur_list)}], [duration_ms: {com_dur}]\n" + return [base_kernel_detals, com_kernel_details] + + def get_total_row(self): + total_diff, total_ratio = calculate_diff_ratio(self._base_info.device_total_dur_ms, + self._comparison_info.device_total_dur_ms) + self_diff, _ = 
calculate_diff_ratio(self._base_info.device_self_dur_ms, + self._comparison_info.device_self_dur_ms) + row = [None, self._module_class, self._module_level, self._module_name, "[ TOTAL ]", None, + self._base_info.device_self_dur_ms, self._base_info.number, self._base_info.device_total_dur_ms, + None, self._comparison_info.device_self_dur_ms, self._comparison_info.number, + self._comparison_info.device_total_dur_ms, total_diff, self_diff, + total_ratio, self._base_info.call_stack, self._comparison_info.call_stack] + return row + + def get_detail_rows(self): + rows = [] + for op_name, base_dur_dict in self._base_info.api_dict.items(): + base_dur_list = base_dur_dict.get("total", []) + com_dur_dict = self._comparison_info.api_dict.pop(op_name, {}) + com_dur_list = com_dur_dict.get("total", []) + base_kernel_detals, com_kernel_details = self._get_kernel_detail_rows(base_dur_dict.get("detail", {}), + com_dur_dict.get("detail", {})) + self_diff, self_ratio = calculate_diff_ratio(sum(base_dur_list), sum(com_dur_list)) + row = [None, self._module_class, self._module_level, self._module_name, op_name, base_kernel_detals, + sum(base_dur_list), len(base_dur_list), None, com_kernel_details, sum(com_dur_list), + len(com_dur_list), None, None, self_diff, self_ratio, None, None] + rows.append(row) + + for op_name, com_dur_dict in self._comparison_info.api_dict.items(): + com_dur_list = com_dur_dict.get("total", []) + base_kernel_detals, com_kernel_details = self._get_kernel_detail_rows({}, com_dur_dict.get("detail", {})) + self_diff, self_ratio = calculate_diff_ratio(0, sum(com_dur_list)) + row = [None, self._module_class, self._module_level, self._module_name, op_name, base_kernel_detals, 0, 0, + None, com_kernel_details, sum(com_dur_list), len(com_dur_list), None, None, self_diff, + self_ratio, None, None] + rows.append(row) + return rows + + +class ModuleStatisticInfo: + def __init__(self, data_list: list): + self._data_list = data_list + self.device_self_dur_ms = 0 + self.device_total_dur_ms = 0 + self.call_stack = "" + self.number = len(data_list) + self.api_dict = {} + self._get_info() + + def _get_info(self): + if self._data_list: + self.call_stack = self._data_list[0].call_stack + for module in self._data_list: + self.device_self_dur_ms += module.device_self_dur / Constant.US_TO_MS + self.device_total_dur_ms += module.device_total_dur / Constant.US_TO_MS + for torch_op in module.toy_layer_api_list: + self.api_dict.setdefault(torch_op.name, {}).setdefault("total", []).append( + torch_op.device_dur / Constant.US_TO_MS) + for kernel in torch_op.kernel_list: + self.api_dict.setdefault(torch_op.name, {}).setdefault("detail", {}).setdefault(kernel.kernel_name, + []).append( + kernel.device_dur / Constant.US_TO_MS) diff --git a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py index ef5e59c555507a9e97d6a2b0c7824110c4b3fce7..122009b9045074c908c33dc50fffd36f03eb4ff9 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py +++ b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/kernel_details_bean.py @@ -3,6 +3,7 @@ import math import pandas as pd from compare_backend.utils.common_func import convert_to_float +from compare_backend.utils.constant import Constant class KernelDetailsBean: @@ -68,6 +69,16 @@ class KernelDetailsBean: def is_cube(self): return "matmul" in self.op_type.lower() + def is_conv(self): 
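+        # a kernel whose Type starts with "conv" (e.g. Conv2D) is treated as a convolution kernel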
+ return self.op_type.lower().startswith("conv") + + def is_conv_bwd(self): + lower_op_type = self.op_type.lower() + return any(bwd in lower_op_type for bwd in Constant.BWD_LIST) + + def is_page_attention(self): + return "pagedattention" in self.op_type.lower() + def init(self): self._op_type = self._data.get('Type', "") self._name = self._data.get('Name', "") diff --git a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py index 6ce91ba53c8f2a9286319f35f76b62773743bc49..cef6bb071243264c792e74f562e058ca1d8df7a1 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py +++ b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py @@ -181,11 +181,28 @@ class TraceEventBean: return self.task_type == 'EVENT_WAIT_SQE' def is_backward(self): - bwd_list = ["bwd", "backward"] - for bwd in bwd_list: - if bwd in self.lower_name: - return True - return False + return any(bwd in self.lower_name for bwd in Constant.BWD_LIST) + + def is_python_function(self): + return self.lower_cat == "python_function" + + def is_optimizer(self): + return self.lower_name.startswith("optimizer") + + def is_fwdbwd(self): + return self.lower_cat == "fwdbwd" + + def is_step_profiler(self): + return self.name.find("ProfilerStep#") != -1 + + def reset_name(self, name): + self._name = name + + def is_conv(self): + return self.name.lower().startswith("aten::conv") + + def is_lccl(self): + return self.lower_name == "kernel_aivec" def init(self): if isinstance(self._event, dict): diff --git a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py index 9184c790b7ea59246b602442a13e7e533d921bc8..e5d9bf26e985330d830ba6e01f62525fe88e43ea 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py +++ b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py @@ -16,16 +16,26 @@ class ProfilingInfo: self.sdma_num = 0 self.fa_num_fwd = 0 self.fa_num_bwd = 0 + self.pa_num = 0 + self.lccl_num = 0 + self.conv_time_fwd = 0.0 + self.conv_time_bwd = 0.0 + self.conv_num_fwd = 0 + self.conv_num_bwd = 0 self.compute_time = 0.0 self.communication_not_overlapped = 0.0 + self.wait_time = 0.0 self.memory_used = 0.0 self.e2e_time = 0.0 self.sdma_time = 0.0 self.scheduling_time = 0.0 self.fa_time_bwd = 0.0 + self.pa_time = 0.0 + self.lccl_time = 0.0 self.fa_time_fwd = 0.0 self.minimal_profiling = False self.hide_op_details = False + self.is_level0 = False def trans_time_to_s(self): self.cube_time = self.cube_time / 10 ** 6 @@ -33,21 +43,29 @@ class ProfilingInfo: self.vec_time = self.vec_time / 10 ** 6 self.compute_time = self.compute_time / 10 ** 6 self.communication_not_overlapped = self.communication_not_overlapped / 10 ** 6 + self.wait_time = self.wait_time / 10 ** 6 self.e2e_time = self.e2e_time / 10 ** 6 self.sdma_time = self.sdma_time / 10 ** 6 self.scheduling_time = self.scheduling_time / 10 ** 6 self.fa_time_bwd = self.fa_time_bwd / 10 ** 6 self.fa_time_fwd = self.fa_time_fwd / 10 ** 6 + self.pa_time = self.pa_time / 10 ** 6 + self.lccl_time = self.lccl_time / 10 ** 6 + self.conv_time_fwd = self.conv_time_fwd / 10 ** 6 + self.conv_time_bwd = self.conv_time_bwd / 10 ** 6 def calculate_other_time(self): self.other_time = max( - [0, self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd - self.vec_time]) 
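+            # other time = compute time minus cube/FA/PA/vector/conv time, clamped to be non-negative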
+ [0, self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd - + self.pa_time - self.vec_time - self.conv_time_fwd - self.conv_time_bwd]) def calculate_vec_time(self): - self.vec_time = self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd + self.vec_time = self.compute_time - self.cube_time - self.fa_time_fwd - self.fa_time_bwd \ + - self.conv_time_fwd - self.conv_time_bwd def calculate_schedule_time(self): - self.scheduling_time = self.e2e_time - self.compute_time - self.communication_not_overlapped + self.scheduling_time = (self.e2e_time - self.compute_time - self.lccl_time \ + - self.communication_not_overlapped) def update_fa_fwd_info(self, time: float): self.fa_time_fwd += time @@ -57,6 +75,22 @@ class ProfilingInfo: self.fa_time_bwd += time self.fa_num_bwd += 1 + def update_pa_info(self, time: float): + self.pa_time += time + self.pa_num += 1 + + def update_lccl_info(self, time: float): + self.lccl_time += time + self.lccl_num += 1 + + def update_conv_fwd_info(self, time: float): + self.conv_time_fwd += time + self.conv_num_fwd += 1 + + def update_conv_bwd_info(self, time: float): + self.conv_time_bwd += time + self.conv_num_bwd += 1 + def update_sdma_info(self, time: float, num: int = 1): self.sdma_time += time self.sdma_num += num @@ -84,6 +118,9 @@ class ProfilingInfo: def update_comm_not_overlap(self, time: float): self.communication_not_overlapped += time + def update_comm_not_overlap_wait_time(self, time: float): + self.wait_time = time + def set_memory_used(self, memory: float): self.memory_used = memory diff --git a/profiler/compare_tools/compare_backend/data_prepare/__init__.py b/profiler/compare_tools/compare_backend/data_prepare/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/compare_tools/compare_backend/data_prepare/module_data_prepare.py b/profiler/compare_tools/compare_backend/data_prepare/module_data_prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..84932366dd9252bf2df068a6d9cc1cf1d0f9c440 --- /dev/null +++ b/profiler/compare_tools/compare_backend/data_prepare/module_data_prepare.py @@ -0,0 +1,99 @@ +import copy +from queue import Queue + +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult +from compare_backend.utils.constant import Constant +from compare_backend.utils.module_node import ModuleNode +from compare_backend.utils.tree_builder import TreeBuilder + + +class ModuleDataPrepare: + def __init__(self, profiling_data: ProfilingResult): + self.profiling_data = profiling_data + self._nn_module_list = [] + self._call_function = [] + for event in profiling_data.python_function_data: + if event.lower_name.startswith("nn.module:"): + self._nn_module_list.append(event) + else: + self._call_function.append(event) + self._bwd_dict = {} + self._bwd_pid = self._get_bwd_pid() + + @staticmethod + def update_module_node_info(fwd_root_node, bwd_root_node, func_root_node): + queue = Queue() + queue.put(fwd_root_node) + queue.put(bwd_root_node) + while not queue.empty(): + module_node = queue.get() + module_node.update_torch_op_kernel_list() + call_function = func_root_node.find_module_call(module_node.start_time) + if call_function: + module_node.reset_call_stack(call_function.call_stack) + for sub_module_node in module_node.child_nodes: + queue.put(sub_module_node) + + def 
build_module_tree(self): + if not self._nn_module_list: + return [None, None] + self._dispatch_torch_op() + event_list = [TraceEventBean({"ts": ts}) for ts in self.profiling_data.kernel_dict.keys()] + self._nn_module_list.extend(event_list) + root_node = TreeBuilder.build_module_tree(self._nn_module_list, self.profiling_data.kernel_dict) + func_root_node = TreeBuilder.build_module_tree(self._call_function, {}) + bwd_module_list = self.get_bwd_module(root_node) + if bwd_module_list: + bwd_module_list.extend(event_list) + bwd_root_node = TreeBuilder.build_module_tree(bwd_module_list, self.profiling_data.kernel_dict) + self.match_torch_op(root_node, bwd_root_node) + self.update_module_node_info(root_node, bwd_root_node, func_root_node) + return [root_node, bwd_root_node] + + def get_bwd_module(self, root_node: ModuleNode): + bwd_module_list = [] + for flow in self.profiling_data.fwdbwd_dict.values(): + start_point = flow.get("start") + end_point = flow.get("end") + if not start_point or not end_point: + continue + end_event = self._bwd_dict.get(end_point.start_time) + if not end_event: + continue + call_module = root_node.find_module_call(start_point.start_time) + if call_module: + bwd_event = copy.deepcopy(end_event) + bwd_event.reset_name(f"[ BACKWARD ]{call_module.module_name}") + bwd_module_list.append(bwd_event) + return bwd_module_list + + def match_torch_op(self, fwd_root_node, bwd_root_node): + torch_op_list = sorted(self.profiling_data.torch_op_data, key=lambda x: x.start_time) + for torch_op in torch_op_list: + if torch_op.is_optimizer(): + continue + if torch_op.is_step_profiler(): + continue + matched_module = fwd_root_node.find_module_call(torch_op.start_time) + if matched_module: + matched_module.find_torch_op_call(torch_op) + continue + matched_module = bwd_root_node.find_module_call(torch_op.start_time) + if matched_module: + matched_module.find_torch_op_call(torch_op) + + def _dispatch_torch_op(self): + for torch_op in self.profiling_data.torch_op_data: + if torch_op.is_optimizer(): + self._nn_module_list.append(torch_op) + continue + if torch_op.pid == self._bwd_pid: + self._bwd_dict[torch_op.start_time] = torch_op + + def _get_bwd_pid(self): + for flow in self.profiling_data.fwdbwd_dict.values(): + end_point = flow.get("end") + if end_point: + return end_point.pid + return Constant.INVALID_VALUE diff --git a/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py b/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..fdce23c6ab4ff7f9f6f7d6bc1442063c57cb6098 --- /dev/null +++ b/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py @@ -0,0 +1,19 @@ +from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult +from compare_backend.utils.tree_builder import TreeBuilder + + +class OperatorDataPrepare: + def __init__(self, profiling_data: ProfilingResult): + self.profiling_data = profiling_data + + def get_top_layer_ops(self) -> any: + root_node = TreeBuilder.build_tree(self.profiling_data.torch_op_data, self.profiling_data.kernel_dict, + self.profiling_data.memory_list) + level1_child_nodes = root_node.child_nodes + result_data = [] + for level1_node in level1_child_nodes: + if level1_node.is_step_profiler(): + result_data.extend(level1_node.child_nodes) + else: + result_data.append(level1_node) + return result_data diff --git a/profiler/compare_tools/compare_backend/disaggregate/__init__.py 
b/profiler/compare_tools/compare_backend/disaggregate/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py b/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..c89e84519302781a590523bc7fdaaf9e1254acf5 --- /dev/null +++ b/profiler/compare_tools/compare_backend/disaggregate/overall_perf_interface.py @@ -0,0 +1,34 @@ +from common_func.path_manager import PathManager +from compare_backend.profiling_parser.gpu_profiling_parser import GPUProfilingParser +from compare_backend.profiling_parser.npu_profiling_parser import NPUProfilingParser +from compare_backend.utils.args_manager import ArgsManager +from compare_backend.utils.compare_args import Args +from compare_backend.utils.constant import Constant + + +class OverallPerfInterface: + PARSER_DICT = {Constant.NPU: NPUProfilingParser, Constant.GPU: GPUProfilingParser} + + def __init__(self, profiling_path: str): + self._profiling_path = profiling_path + self._profiling_path_dict = {} + self._result_data = {} + + def run(self): + self._check_path() + self._load_data() + self._generate_result() + return self._result_data + + def _check_path(self): + profiling_path = PathManager.get_realpath(self._profiling_path) + self._profiling_path_dict = ArgsManager().parse_profiling_path(profiling_path) + + def _load_data(self): + args = Args(enable_profiling_compare=True) + profiling_type = self._profiling_path_dict.get(Constant.PROFILING_TYPE, Constant.NPU) + self._profiling_data = self.PARSER_DICT.get(profiling_type)(args, self._profiling_path_dict).load_data() + + def _generate_result(self): + overall_data = self._profiling_data.overall_metrics + self._result_data = getattr(overall_data, "__dict__", {}) diff --git a/profiler/compare_tools/compare_backend/generator/base_generator.py b/profiler/compare_tools/compare_backend/generator/base_generator.py index c472bc9922e6febf118f62a66424056243156c07..e77071b5998a9915d09c54f8b4c811d434555167 100644 --- a/profiler/compare_tools/compare_backend/generator/base_generator.py +++ b/profiler/compare_tools/compare_backend/generator/base_generator.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from collections import OrderedDict from multiprocessing import Process @@ -7,7 +8,7 @@ class BaseGenerator(Process, ABC): super(BaseGenerator, self).__init__() self._profiling_data_dict = profiling_data_dict self._args = args - self._result_data = {} + self._result_data = OrderedDict() def run(self): self.compare() diff --git a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py index 72ce3ba86893b08ffdd8deff5c586731db4b84f5..5b93d888a4b093a6509438ec6a3c916a50b48e9a 100644 --- a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py +++ b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py @@ -1,23 +1,28 @@ import os from collections import deque from datetime import datetime - -import numpy as np +from queue import Queue from compare_backend.comparator.communication_comparator import CommunicationComparator +from compare_backend.comparator.module_comparetor import ModuleComparator +from compare_backend.comparator.module_statistic_comparator import ModuleStatisticComparator from 
compare_backend.comparator.operator_comparator import OperatorComparator from compare_backend.comparator.operator_statistic_comparator import OperatorStatisticComparator from compare_backend.compare_bean.communication_bean import CommunicationBean from compare_backend.compare_bean.memory_compare_bean import MemoryCompareBean from compare_backend.compare_bean.memory_statistic_bean import MemoryStatisticBean +from compare_backend.compare_bean.module_compare_bean import ModuleCompareBean +from compare_backend.compare_bean.module_statistic_bean import ModuleStatisticBean from compare_backend.compare_bean.operator_compare_bean import OperatorCompareBean from compare_backend.compare_bean.operator_statistic_bean import OperatorStatisticBean +from compare_backend.data_prepare.module_data_prepare import ModuleDataPrepare +from compare_backend.data_prepare.operator_data_prepare import OperatorDataPrepare from compare_backend.generator.base_generator import BaseGenerator -from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult +from compare_backend.utils.common_func import longest_common_subsequence_matching from compare_backend.utils.constant import Constant +from compare_backend.utils.module_node import ModuleNode from compare_backend.utils.name_function import NameFunction from compare_backend.utils.torch_op_node import TorchOpNode -from compare_backend.utils.tree_builder import TreeBuilder from compare_backend.view.excel_view import ExcelView @@ -25,6 +30,12 @@ class DetailPerformanceGenerator(BaseGenerator): def __init__(self, profiling_data_dict: dict, args: any): super().__init__(profiling_data_dict, args) + @classmethod + def _match_none_subsequence(cls, base_ops: list, comparison_ops: list) -> list: + op_compare_result = [[op, None] for op in iter(base_ops)] + op_compare_result.extend([[None, op] for op in iter(comparison_ops)]) + return op_compare_result + def compare(self): if self._args.enable_operator_compare or self._args.enable_memory_compare or \ self._args.enable_communication_compare: @@ -44,7 +55,16 @@ class DetailPerformanceGenerator(BaseGenerator): def _create_comparator(self): comparator_list = [] - if self._args.enable_operator_compare or self._args.enable_memory_compare: + + op_compare_result = [] + if self._args.enable_operator_compare: + module_compare_result = self.match_nn_module() if self._profiling_data_dict.get( + Constant.BASE_DATA).python_function_data and self._profiling_data_dict.get( + Constant.COMPARISON_DATA).python_function_data else [] + if not module_compare_result: + op_compare_result = self.match_torch_op() + + if self._args.enable_memory_compare and not op_compare_result: op_compare_result = self.match_torch_op() if self._args.enable_communication_compare: @@ -54,89 +74,32 @@ class DetailPerformanceGenerator(BaseGenerator): comparator_list.append(CommunicationComparator(communication_data, CommunicationBean)) if self._args.enable_operator_compare: - comparator_list.append(OperatorComparator(op_compare_result, OperatorCompareBean)) - comparator_list.append(OperatorStatisticComparator(op_compare_result, OperatorStatisticBean)) - + if module_compare_result: + comparator_list.append(ModuleStatisticComparator(module_compare_result, ModuleStatisticBean)) + if not self._args.disable_details: + comparator_list.append(ModuleComparator(module_compare_result, ModuleCompareBean)) + else: + comparator_list.append(OperatorStatisticComparator(op_compare_result, OperatorStatisticBean)) + if not self._args.disable_details: + 
comparator_list.append(OperatorComparator(op_compare_result, OperatorCompareBean)) if self._args.enable_memory_compare: - comparator_list.append(OperatorComparator(op_compare_result, MemoryCompareBean)) comparator_list.append(OperatorStatisticComparator(op_compare_result, MemoryStatisticBean)) + if not self._args.disable_details: + comparator_list.append(OperatorComparator(op_compare_result, MemoryCompareBean)) return comparator_list def match_torch_op(self) -> list: - base_ops = self._get_top_layer_ops(self._profiling_data_dict.get(Constant.BASE_DATA)) - comparison_ops = self._get_top_layer_ops(self._profiling_data_dict.get(Constant.COMPARISON_DATA)) + base_ops = OperatorDataPrepare(self._profiling_data_dict.get(Constant.BASE_DATA)).get_top_layer_ops() + comparison_ops = OperatorDataPrepare( + self._profiling_data_dict.get(Constant.COMPARISON_DATA)).get_top_layer_ops() if not base_ops and not comparison_ops: return [] name_func = NameFunction(self._args).get_name_func() - compare_result_data = self._matching_op(base_ops, comparison_ops, name_func) + op_compare_result = longest_common_subsequence_matching(base_ops, comparison_ops, name_func) \ + if not self._args.disable_details else self._match_none_subsequence(base_ops, comparison_ops) if self._args.max_kernel_num is not None: - compare_result_data = self._drill_down(compare_result_data, name_func) - return compare_result_data - - @classmethod - def _matching_op(cls, base_ops: list, comparison_ops: list, name_func: any) -> list: - if not comparison_ops: - result_data = [None] * len(base_ops) - for index, value in enumerate(base_ops): - result_data[index] = [value, None] - return result_data - - result_data = [] - comparison_len, base_len = len(comparison_ops), len(base_ops) - dp = [[0] * (base_len + 1) for _ in range(comparison_len + 1)] - for comparison_index in range(1, comparison_len + 1): - for base_index in range(1, base_len + 1): - if name_func(base_ops[base_index - 1]) == name_func( - comparison_ops[comparison_index - 1]): - dp[comparison_index][base_index] = dp[comparison_index - 1][base_index - 1] + 1 - else: - dp[comparison_index][base_index] = max(dp[comparison_index][base_index - 1], - dp[comparison_index - 1][base_index]) - matched_op = [] - comparison_index, base_index = comparison_len, base_len - while comparison_index > 0 and base_index > 0: - if name_func(base_ops[base_index - 1]) == name_func( - comparison_ops[comparison_index - 1]): - matched_op.append([comparison_index - 1, base_index - 1]) - comparison_index -= 1 - base_index -= 1 - continue - if dp[comparison_index][base_index - 1] > dp[comparison_index - 1][base_index]: - base_index -= 1 - else: - comparison_index -= 1 - if not matched_op: - matched_base_index_list = [] - else: - matched_op.reverse() - matched_op = np.array(matched_op) - matched_base_index_list = list(matched_op[:, 1]) - curr_comparison_index = 0 - for base_index, base_api_node in enumerate(base_ops): - if base_index not in matched_base_index_list: - result_data.append([base_api_node, None]) - continue - matched_comparison_index = matched_op[matched_base_index_list.index(base_index), 0] - for comparison_index in range(curr_comparison_index, matched_comparison_index): - result_data.append([None, comparison_ops[comparison_index]]) - result_data.append([base_api_node, comparison_ops[matched_comparison_index]]) - curr_comparison_index = matched_comparison_index + 1 - if curr_comparison_index < len(comparison_ops): - for comparison_index in range(curr_comparison_index, len(comparison_ops)): - 
result_data.append([None, comparison_ops[comparison_index]]) - return result_data - - def _get_top_layer_ops(self, profiling_data: ProfilingResult) -> any: - root_node = TreeBuilder.build_tree(profiling_data.torch_op_data, profiling_data.kernel_dict, - profiling_data.memory_list) - level1_child_nodes = root_node.child_nodes - result_data = [] - for level1_node in level1_child_nodes: - if level1_node.is_step_profiler(): - result_data.extend(level1_node.child_nodes) - else: - result_data.append(level1_node) - return result_data + op_compare_result = self._drill_down(op_compare_result, name_func) + return op_compare_result def _drill_down(self, compare_result_data: list, name_func: any) -> list: drill_down_result = [] @@ -152,9 +115,47 @@ class DetailPerformanceGenerator(BaseGenerator): if max(base_op.kernel_num, comparison_op.kernel_num) <= self._args.max_kernel_num: drill_down_result.append(match_data) continue - match_list = self._matching_op(base_op.child_nodes, comparison_op.child_nodes, name_func) + match_list = longest_common_subsequence_matching(base_op.child_nodes, + comparison_op.child_nodes, + name_func) \ + if not self._args.disable_details else self._match_none_subsequence(base_op.child_nodes, + comparison_op.child_nodes) match_list.reverse() for data in match_list: op_deque.append(data) return drill_down_result + + def match_nn_module(self) -> list: + module_compare_result = [] + base_root_node = ModuleDataPrepare(self._profiling_data_dict.get(Constant.BASE_DATA)).build_module_tree() + comparison_root_node = ModuleDataPrepare( + self._profiling_data_dict.get(Constant.COMPARISON_DATA)).build_module_tree() + for index, base_node in enumerate(base_root_node): + comparison_node = comparison_root_node[index] if index < len(comparison_root_node) else None + if not base_node or not comparison_node: + continue + module_compare_result.extend(self._matching_all_modules(base_node, comparison_node)) + return module_compare_result + + def _matching_all_modules(self, base_node: ModuleNode, comparison_node: ModuleNode): + all_matched_modules = [] + matched_queue = Queue() + matched_queue.put([base_node, comparison_node]) + while not matched_queue.empty(): + matched_base_node, matched_comparison_node = matched_queue.get() + matched_node_list = self._matching_common_subsequence(matched_base_node, matched_comparison_node) + all_matched_modules.extend(matched_node_list) + for matched_node in matched_node_list: + matched_queue.put(matched_node) + return all_matched_modules + + def _matching_common_subsequence(self, base_node: ModuleNode, comparison_node: ModuleNode): + base_modules = base_node.child_nodes if base_node else [] + comparison_modules = comparison_node.child_nodes if comparison_node else [] + if not base_modules and not comparison_modules: + return [] + name_func = NameFunction(self._args).get_module_name + result = longest_common_subsequence_matching(base_modules, comparison_modules, name_func) \ + if not self._args.disable_details else self._match_none_subsequence(base_modules, comparison_modules) + return result diff --git a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py index 4c0b51272b0bbf71f6632a7b28005bae2298d056..2127ff5e75e23e98f0debb0dfdafbeb01930c082 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py @@ -4,7 +4,6 @@ from decimal 
import Decimal from compare_backend.compare_bean.origin_data_bean.compare_event import KernelEvent, MemoryEvent from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean from compare_backend.compare_bean.profiling_info import ProfilingInfo -from compare_backend.utils.args_manager import ArgsManager from compare_backend.utils.constant import Constant from compare_backend.utils.file_reader import FileReader @@ -18,11 +17,19 @@ class ProfilingResult: self.memory_list = [] self.communication_dict = {} self.overall_metrics = ProfilingInfo(profiling_type) + self.python_function_data = [] + self.fwdbwd_dict = {} def update_torch_op_data(self, event: TraceEventBean): event.is_torch_op = True self.torch_op_data.append(event) + def update_python_function_data(self, event: TraceEventBean): + self.python_function_data.append(event) + + def update_fwdbwd_data(self, flow_type: str, event: TraceEventBean): + self.fwdbwd_dict.setdefault(event.id, {})[flow_type] = event + def update_kernel_dict(self, start_time: Decimal, kernel_event: TraceEventBean): self.kernel_dict.setdefault(start_time, []).append(KernelEvent(kernel_event, self._profiling_type)) @@ -45,14 +52,15 @@ class BaseProfilingParser(ABC): self._profiling_path = path_dict.get(Constant.PROFILING_PATH) self._json_path = path_dict.get(Constant.TRACE_PATH) self._trace_events = [] if self._profiling_path == Constant.NPU else {} - self._enable_profiling_compare = ArgsManager().enable_profiling_compare - self._enable_operator_compare = ArgsManager().enable_operator_compare - self._enable_memory_compare = ArgsManager().enable_memory_compare - self._enable_communication_compare = ArgsManager().enable_communication_compare + self._enable_profiling_compare = args.enable_profiling_compare + self._enable_operator_compare = args.enable_operator_compare + self._enable_memory_compare = args.enable_memory_compare + self._enable_communication_compare = args.enable_communication_compare self._dispatch_func = self._get_dispatch_func() self._result_data = ProfilingResult(self._profiling_type) self._memory_events = [] self._flow_dict = {} + self._fwdbwd_dict = {} self._all_kernels = {} self._comm_task_list = [] self._comm_list = [] @@ -134,6 +142,21 @@ class BaseProfilingParser(ABC): return True return False + def _picking_python_function_event(self, event: TraceEventBean): + if event.is_python_function(): + self._result_data.update_python_function_data(event) + return True + return False + + def _picking_fwdbwd_flow_event(self, event: TraceEventBean): + if event.is_fwdbwd(): + if event.is_flow_start(): + self._result_data.update_fwdbwd_data("start", event) + elif event.is_flow_end(): + self._result_data.update_fwdbwd_data("end", event) + return True + return False + def _update_kernel_dict(self): if self._profiling_type == Constant.NPU: for comm in self._comm_list: diff --git a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py index 2ad2e1a557fad7095bea642892c64f32363182e9..c4089aec9bdcb35b80ae9ff9121fcd75bde3a63e 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py @@ -3,23 +3,23 @@ from collections import defaultdict, Counter from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean from compare_backend.profiling_parser.base_profiling_parser import BaseProfilingParser 
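+# NOTE: compare switches are now read from the args passed in, not the ArgsManager singleton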
-from compare_backend.utils.args_manager import ArgsManager from compare_backend.utils.constant import Constant class GPUProfilingParser(BaseProfilingParser): - CUBE_MARK = 'gemm' - FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel']] + CUBE_MARK = ['gemm', 'conv', 'cutlass', 'wgrad'] + FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel'], ['attention', 'kernel']] SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)'] FLOW_CAT = ("async_gpu", "async_cpu_to_gpu", "ac2g", "async") - TORCH_OP_CAT = ("cpu_op", "user_annotation", "cuda_runtime", "operator") + TORCH_OP_CAT = ("cpu_op", "user_annotation", "cuda_runtime", "operator", "runtime") def __init__(self, args: any, path_dict: dict): super().__init__(args, path_dict) self._trace_events = [TraceEventBean(event) for event in self._trace_events.get("traceEvents", [])] - self._flow_cat = (ArgsManager().args.gpu_flow_cat,) if ArgsManager().args.gpu_flow_cat else self.FLOW_CAT + self._flow_cat = (args.gpu_flow_cat,) if args.gpu_flow_cat else self.FLOW_CAT self._compute_stream_id = self._infer_compute_stream_id() self._marks = defaultdict(int) + self._aten_index = 0 @classmethod def __is_flash_attention(cls, name: str): @@ -67,6 +67,14 @@ class GPUProfilingParser(BaseProfilingParser): def _calculate_performance_time(self): min_ts = sys.float_info.max max_ts = sys.float_info.min + self._trace_events.sort(key=lambda x: x.start_time) + aten_events = list(filter(lambda x: x.name.startswith("aten::"), self._trace_events)) + flow_dict_new = {} + for flow_event in self._flow_dict.values(): + start_event = flow_event.get("start") + end_event = flow_event.get("end") + if start_event and end_event: + flow_dict_new[end_event.start_time] = start_event.start_time for event in self._trace_events: if event.stream: min_ts = min(event.start_time, min_ts) @@ -79,7 +87,8 @@ class GPUProfilingParser(BaseProfilingParser): self.__add_marks(event) if event.is_nccl_name(): continue - self.__add_compute_time(event) + self.__add_compute_time(event, aten_events, flow_dict_new) + self._aten_events = None self._result_data.overall_metrics.set_e2e_time(float(max_ts - min_ts)) self.__add_compute_and_overlap_time() @@ -97,17 +106,38 @@ class GPUProfilingParser(BaseProfilingParser): for timestep in range(int(event.start_time + 1), int(event.end_time + 1)): self._marks[str(timestep)] += -100 # mark this timestep in compute stream - def __add_compute_time(self, event: TraceEventBean): + def __add_compute_time(self, event: TraceEventBean, aten_events: list, flow_dict_new: dict): if self.__is_flash_attention(event.name): if event.is_backward(): self._result_data.overall_metrics.update_fa_bwd_info(event.dur) else: self._result_data.overall_metrics.update_fa_fwd_info(event.dur) - elif self.CUBE_MARK in event.lower_name: - self._result_data.overall_metrics.update_cube_info(event.dur) + elif any(cube_mark in event.lower_name for cube_mark in self.CUBE_MARK): + is_conv = self.__check_is_conv(event, aten_events, flow_dict_new) + if is_conv == "conv_fwd": + self._result_data.overall_metrics.update_conv_fwd_info(event.dur) + elif is_conv == "conv_bwd": + self._result_data.overall_metrics.update_conv_bwd_info(event.dur) + else: + self._result_data.overall_metrics.update_cube_info(event.dur) else: self._result_data.overall_metrics.update_vec_info(event.dur) + def __check_is_conv(self, event: TraceEventBean, aten_events: list, flow_dict_new: dict) -> str: + flow_start_time = flow_dict_new.get(event.start_time) + if not flow_start_time: + return "" + aten_len = len(aten_events) + 
while self._aten_index < aten_len: + cur_aten = aten_events[self._aten_index] + if cur_aten.end_time < flow_start_time: + self._aten_index += 1 + continue + if cur_aten.start_time < flow_start_time: + if cur_aten.is_conv(): + return "conv_bwd" if cur_aten.is_backward() else "conv_fwd" + return "" + def _picking_memory_event(self, event: TraceEventBean): if event.is_memory_event(): self._memory_events.append(event) @@ -136,6 +166,9 @@ class GPUProfilingParser(BaseProfilingParser): func_set.add(self._picking_torch_op_event) if self._enable_communication_compare: func_set.add(self._picking_kernel_event) + if self._enable_operator_compare: + func_set.add(self._picking_python_function_event) + func_set.add(self._picking_fwdbwd_flow_event) if self._enable_operator_compare or self._args.max_kernel_num: func_set.add(self._picking_kernel_event) func_set.add(self._picking_flow_event) diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py index f872e52a5314a40dbc2e0d4ff7868e875986b809..70ce44b44eb419196dc479dc30ae0b1e4a1136cb 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py @@ -41,6 +41,9 @@ class NPUProfilingParser(BaseProfilingParser): if self._enable_operator_compare or self._args.max_kernel_num: func_list.add(self._picking_kernel_event) func_list.add(self._picking_flow_event) + if self._enable_operator_compare: + func_list.add(self._picking_python_function_event) + func_list.add(self._picking_fwdbwd_flow_event) if self._enable_memory_compare: func_list.add(self._picking_task_queue_data) if self._enable_communication_compare: @@ -48,6 +51,7 @@ class NPUProfilingParser(BaseProfilingParser): if self._enable_profiling_compare: func_list.add(self._picking_overlap_analysis_data) func_list.add(self._picking_kernel_event) + func_list.add(self._picking_hccl_event) return list(func_list) def _update_memory_list(self): @@ -96,12 +100,76 @@ class NPUProfilingParser(BaseProfilingParser): self.__parse_info_json() self.__parse_mem_csv() self.__parse_kernel_csv() + self.__add_lccl_time() self.__add_sdma_time() self.__add_overlap_analysis_time() + self._picking_notify_wait_event_and_not_overlap_event() + self.__add_overlap_wait_time() self._result_data.overall_metrics.calculate_other_time() self._result_data.overall_metrics.calculate_schedule_time() self._result_data.overall_metrics.trans_time_to_s() + def _picking_notify_wait_event_and_not_overlap_event(self): + self.notify_event_cache = [] + self._not_overlaped_commu_event = [] + for event in self._comm_task_list: + if event.name == 'Notify_Wait' and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_CHECK' \ + and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_ACK': + self.notify_event_cache.append(event) + for event in self._overlap_analysis: + if event.is_comm_not_overlap(): + self._not_overlaped_commu_event.append(event) + self._not_overlaped_commu_event.sort(key=lambda x: x.start_time) + + def __add_overlap_wait_time(self): + notify_wait_event_dict = dict() + for notify_event in self.notify_event_cache: + if notify_event.tid in notify_wait_event_dict: + notify_wait_event_dict[notify_event.tid].append(notify_event) + else: + notify_wait_event_dict[notify_event.tid] = [notify_event] + + if self._result_data.overall_metrics.is_level0: + return + + total_time = 0 + for commu_event in self._not_overlaped_commu_event: 
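+            # for each uncovered communication interval, add the largest Notify_Wait overlap across all planes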
+ wait_time_list = [0] + commu_event_start_time = float(commu_event.start_time) + commu_event_end_time = float(commu_event.start_time) + commu_event.dur + + for plane_id, events in notify_wait_event_dict.items(): + wait_time = 0 + idx = 0 + for notify_event in events: + notify_event_start_time = float(notify_event.start_time) + notify_event_end_time = float(notify_event.start_time) + notify_event.dur + if notify_event_start_time < commu_event_start_time and notify_event_end_time > \ + commu_event_end_time: + wait_time = commu_event_end_time - commu_event_start_time + break + elif notify_event_start_time < commu_event_start_time <= notify_event_end_time <= \ + commu_event_end_time: + wait_time += notify_event_end_time - commu_event_start_time + idx += 1 + elif commu_event_start_time <= notify_event_start_time <= commu_event_end_time < \ + notify_event_end_time: + wait_time += commu_event_end_time - notify_event_start_time + break + elif notify_event_start_time >= commu_event_start_time and notify_event_end_time <= \ + commu_event_end_time: + wait_time += notify_event_end_time - notify_event_start_time + idx += 1 + elif notify_event_end_time < commu_event_start_time: + idx += 1 + else: + break + + wait_time_list.append(wait_time) + notify_wait_event_dict[plane_id] = notify_wait_event_dict[plane_id][idx:] + total_time += max(wait_time_list) + self._result_data.overall_metrics.update_comm_not_overlap_wait_time(total_time) + def _picking_hccl_event(self, event: TraceEventBean): if event.pid != self._hccl_pid or not event.is_x_mode(): return False @@ -162,12 +230,19 @@ class NPUProfilingParser(BaseProfilingParser): if not isinstance(json_data, dict) or not json_data: print('[WARNING] Invalid profiler info.') return - if self.ACTIVE_CPU in json_data.get('config', {}).get('common_config', {}).get('activities', []): + level = json_data.get('config', {}).get('experimental_config', {}).get('_profiler_level', '') + if self.LEVEL_0 != level: return - if self.LEVEL_0 != json_data.get('config', {}).get('experimental_config', {}).get('_profiler_level', ''): + self._result_data.overall_metrics.is_level0 = True + if self.ACTIVE_CPU in json_data.get('config', {}).get('common_config', {}).get('activities', []): return self._result_data.overall_metrics.minimal_profiling = True + def __add_lccl_time(self): + for event in self._all_kernels.values(): + if event.is_lccl(): + self._result_data.overall_metrics.update_lccl_info(event.dur) + def __parse_kernel_csv(self): try: kernel_details = FileReader.read_csv_file(self._kernel_detail_path, KernelDetailsBean) @@ -185,10 +260,17 @@ class NPUProfilingParser(BaseProfilingParser): self._result_data.overall_metrics.update_fa_bwd_info(kernel.duration) else: self._result_data.overall_metrics.update_fa_fwd_info(kernel.duration) + elif kernel.is_conv(): + if kernel.is_conv_bwd(): + self._result_data.overall_metrics.update_conv_bwd_info(kernel.duration) + else: + self._result_data.overall_metrics.update_conv_fwd_info(kernel.duration) elif kernel.is_cube(): self._result_data.overall_metrics.update_cube_info(kernel.duration) elif kernel.is_sdma(): self._result_data.overall_metrics.update_sdma_info(kernel.duration) + elif kernel.is_page_attention(): + self._result_data.overall_metrics.update_pa_info(kernel.duration) elif kernel.is_vector(): self._result_data.overall_metrics.update_vec_info(kernel.duration) else: @@ -235,7 +317,7 @@ class NPUProfilingParser(BaseProfilingParser): sdma_dict.setdefault(stream_id, []).append(event.dur) elif event.is_compute_event(): 
ai_core_stream.add(stream_id) - compute_stream = event_wait_stream & ai_core_stream + compute_stream = event_wait_stream & ai_core_stream if event_wait_stream else ai_core_stream for stream in compute_stream: dur_list = sdma_dict.get(stream, []) self._result_data.overall_metrics.update_sdma_info(sum(dur_list), len(dur_list)) diff --git a/profiler/compare_tools/compare_backend/utils/args_manager.py b/profiler/compare_tools/compare_backend/utils/args_manager.py index f09a794a25bf4ccb4e64b2e7dd55732771a0e6bd..4b5947fa7bccc32277bb9d18d97ab71249c66941 100644 --- a/profiler/compare_tools/compare_backend/utils/args_manager.py +++ b/profiler/compare_tools/compare_backend/utils/args_manager.py @@ -95,7 +95,8 @@ class ArgsManager: profiler_output = ascend_output if os.path.isdir(ascend_output) else file_path json_path = os.path.join(profiler_output, "trace_view.json") if not os.path.isfile(json_path): - msg = f"Invalid profiling path: {file_path}" + msg = (f"The data is not collected by PyTorch Adaptor mode or the data is not parsed. " + f"Invalid profiling path: {profiler_output}") raise RuntimeError(msg) path_dict = {Constant.PROFILING_TYPE: Constant.NPU, Constant.PROFILING_PATH: file_path, Constant.TRACE_PATH: json_path, Constant.ASCEND_OUTPUT_PATH: profiler_output} diff --git a/profiler/compare_tools/compare_backend/utils/common_func.py b/profiler/compare_tools/compare_backend/utils/common_func.py index 26584626cd1786d32d4e7f5fcaef1a09d8726852..68a1ab584f1514980bc784f4a55152efffe698cf 100644 --- a/profiler/compare_tools/compare_backend/utils/common_func.py +++ b/profiler/compare_tools/compare_backend/utils/common_func.py @@ -1,5 +1,7 @@ from decimal import Decimal +import numpy + def calculate_diff_ratio(base_value: float, comparison_value: float): if not base_value and not comparison_value: @@ -31,3 +33,63 @@ def convert_to_decimal(data: any) -> Decimal: print('[ERROR] Invalid profiling data which failed to convert data to decimal.') return 0.0 return decimal_value + + +def longest_common_subsequence_matching(base_ops: list, comparison_ops: list, name_func: any) -> list: + if not comparison_ops: + result_data = [None] * len(base_ops) + for index, value in enumerate(base_ops): + result_data[index] = [value, None] + return result_data + + comparison_len, base_len = len(comparison_ops), len(base_ops) + if comparison_len * base_len > 50 * 10 ** 8: + print('[WARNING] The comparison time is expected to exceed 30 minutes, if you want to see the results quickly, ' + 'you can restart comparison task and turn on the switch --disable_details.') + dp_flag = set() # flag for only comparison op + pre_list = [0] * (base_len + 1) + cur_list = [0] * (base_len + 1) + + comparison_index = 1 + iter_comparison_data = iter(comparison_ops) + for comparison_data in iter_comparison_data: + base_index = 1 + iter_base_data = iter(base_ops) + for base_data in iter_base_data: + if name_func(comparison_data) == name_func(base_data): + cur_list[base_index] = pre_list[base_index - 1] + 1 + else: + only_base = cur_list[base_index - 1] + only_comparison = pre_list[base_index] + if only_base < only_comparison: + dp_flag.add(comparison_index * base_len + base_index) + cur_list[base_index] = only_comparison + else: + cur_list[base_index] = only_base + base_index += 1 + pre_list = cur_list + comparison_index += 1 + + matched_op = [] + comparison_index, base_index = comparison_len, base_len + while comparison_index > 0 and base_index > 0: + base_data = base_ops[base_index - 1] + comparison_data = comparison_ops[comparison_index - 1] 
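+        # Backtrack through the DP table: equal names pair up; otherwise
+        # dp_flag marks cells where the optimal alignment dropped the current
+        # comparison op, and the base op is dropped everywhere else.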
+ if name_func(base_data) == name_func(comparison_data): + matched_op.append([base_data, comparison_data]) + comparison_index -= 1 + base_index -= 1 + elif (comparison_index * base_len + base_index) in dp_flag: + matched_op.append([None, comparison_data]) + comparison_index -= 1 + else: + matched_op.append([base_data, None]) + base_index -= 1 + while comparison_index > 0: + matched_op.append([None, comparison_ops[comparison_index - 1]]) + comparison_index -= 1 + while base_index > 0: + matched_op.append([base_ops[base_index - 1], None]) + base_index -= 1 + matched_op.reverse() + return matched_op diff --git a/profiler/compare_tools/compare_backend/utils/compare_args.py b/profiler/compare_tools/compare_backend/utils/compare_args.py new file mode 100644 index 0000000000000000000000000000000000000000..ab9bc364f440ca8412a6e40d67ca74b7c897cbd9 --- /dev/null +++ b/profiler/compare_tools/compare_backend/utils/compare_args.py @@ -0,0 +1,24 @@ +class Args: + def __init__(self, + base_profiling_path: str = "", + comparison_profiling_path: str = "", + enable_profiling_compare: bool = False, + enable_operator_compare: bool = False, + enable_memory_compare: bool = False, + enable_communication_compare: bool = False, + output_path: str = "", + max_kernel_num: int = None, + op_name_map: dict = {}, + use_input_shape: bool = False, + gpu_flow_cat: str = ""): + self.base_profiling_path = base_profiling_path + self.comparison_profiling_path = comparison_profiling_path + self.enable_profiling_compare = enable_profiling_compare + self.enable_operator_compare = enable_operator_compare + self.enable_memory_compare = enable_memory_compare + self.enable_communication_compare = enable_communication_compare + self.output_path = output_path + self.max_kernel_num = max_kernel_num + self.op_name_map = op_name_map + self.use_input_shape = use_input_shape + self.gpu_flow_cat = gpu_flow_cat diff --git a/profiler/compare_tools/compare_backend/utils/constant.py b/profiler/compare_tools/compare_backend/utils/constant.py index d44f9fea93649f5301fa436a1dcac6a39702112a..1b77b214c85f6733e36298e119e43a778fd7969f 100644 --- a/profiler/compare_tools/compare_backend/utils/constant.py +++ b/profiler/compare_tools/compare_backend/utils/constant.py @@ -53,6 +53,8 @@ class Constant(object): MEMORY_TOP_TABLE = "MemoryCompareStatistic" COMMUNICATION_TABLE = "CommunicationCompare" PERFORMANCE_TABLE = "Model Profiling Time Distribution" + MODULE_TABLE = "ModuleCompare" + MODULE_TOP_TABLE = "ModuleCompareStatistic" # memory SIZE = "Size(KB)" @@ -74,3 +76,5 @@ class Constant(object): #compare type OVERALL_COMPARE = "overall" + + BWD_LIST = ["bwd", "backward", "back"] diff --git a/profiler/compare_tools/compare_backend/utils/excel_config.py b/profiler/compare_tools/compare_backend/utils/excel_config.py index 50b2e6329e3b450fc85caca1c0b0d8ab8895a522..306abcdfec6e62f24977b989258ad190a90c9bd7 100644 --- a/profiler/compare_tools/compare_backend/utils/excel_config.py +++ b/profiler/compare_tools/compare_backend/utils/excel_config.py @@ -14,6 +14,10 @@ class CellFormatType: 'bold': True} # 字符串,无背景色,字体加粗 BLUE_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.BLUE_COLOR, 'align': 'left', 'valign': 'vcenter', 'bold': True, 'border': True} # 蓝色背景,加粗 + GREEN_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.GREEN_COLOR, 'align': 'left', + 'valign': 'vcenter', 'bold': True, 'border': True} # 绿色背景,加粗 + YELLOW_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.YELLOW_COLOR, 'align': 'left', + 'valign': 
'vcenter', 'bold': True, 'border': True} # 黄色背景,加粗 class ExcelConfig(object): @@ -46,6 +50,21 @@ class ExcelConfig(object): AVG_DURATION = "Avg Duration(us)" MAX_DURATION = "Max Duration(us)" MIN_DURATION = "Min Duration(us)" + MODULE_CLASS = "Module Class" + MODULE_NAME = "Module Name" + DEVICE_SELF_TIME = "Device Self Time(ms)" + DEVICE_TOTAL_TIME = "Device Total Time(ms)" + DIFF_SELF_TIME = "Device Self Time Diff(ms)" + DIFF_TOTAL_RATIO = "Total Diff Ratio" + DIFF_TOTAL_TIME = "Device Total Time Diff(ms)" + DEVICE_SELF_TIME_US = "Device Self Time(us)" + DEVICE_TOTAL_TIME_US = "Device Total Time(us)" + DIFF_SELF_TIME_US = "Device Self Time Diff(us)" + DIFF_TOTAL_TIME_US = "Device Total Time Diff(us)" + NUMBER = "Number" + MODULE_LEVEL = "Module Level" + BASE_CALL_STACK = "Base Call Stack" + COMPARISON_CALL_STACK = "Comparison Call Stack" HEADERS = { Constant.OPERATOR_TABLE: [ @@ -118,9 +137,49 @@ class ExcelConfig(object): {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, {"name": DIFF_DUR, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20} + ], + Constant.MODULE_TOP_TABLE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": MODULE_CLASS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": MODULE_LEVEL, "type": CellFormatType.DEFAULT, "width": 15}, + {"name": MODULE_NAME, "type": CellFormatType.DEFAULT, "width": 35}, + {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25}, + {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": DEVICE_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": DEVICE_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": DEVICE_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": DEVICE_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 15}, + {"name": BASE_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30}, + {"name": COMPARISON_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30} + ], + Constant.MODULE_TABLE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": MODULE_CLASS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": MODULE_LEVEL, "type": CellFormatType.DEFAULT, "width": 15}, + {"name": MODULE_NAME, "type": CellFormatType.DEFAULT, "width": 35}, + {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25}, + {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": DEVICE_SELF_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DEVICE_TOTAL_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25}, + {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": DEVICE_SELF_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DEVICE_TOTAL_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_TOTAL_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, 
+ {"name": DIFF_SELF_TIME_US, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 15}, + {"name": BASE_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30}, + {"name": COMPARISON_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30} ] } OVERHEAD = {Constant.OPERATOR_TABLE: ["B1:F1", "G1:K1"], Constant.MEMORY_TABLE: ["B1:F1", "G1:K1"], Constant.COMMUNICATION_TABLE: ["B1:H1", "I1:O1"], Constant.OPERATOR_TOP_TABLE: ["C1:D1", "E1:F1"], - Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"]} + Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"], Constant.MODULE_TOP_TABLE: ["F1:I1", "J1:M1"], + Constant.MODULE_TABLE: ["E1:H1", "I1:L1"]} diff --git a/profiler/compare_tools/compare_backend/utils/module_node.py b/profiler/compare_tools/compare_backend/utils/module_node.py new file mode 100644 index 0000000000000000000000000000000000000000..f85606094ede7abc378c1b3d017b4a98c8800107 --- /dev/null +++ b/profiler/compare_tools/compare_backend/utils/module_node.py @@ -0,0 +1,171 @@ +import re +from math import ceil + +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.utils.torch_op_node import TorchOpNode + + +class ModuleNode: + ts = "ts" + kernels = "kernels" + + def __init__(self, event: TraceEventBean, parent_node=None): + self._event = event + self._parent_node = parent_node + self._child_nodes = [] + self._module_name = f"{parent_node.module_name}/{event.name}" if parent_node else event.name + self._module_level = parent_node.module_level + 1 if parent_node else 1 + self._kernel_self_list = [] + self._kernel_total_list = [] + self._call_stack = f"{parent_node.call_stack};\n{event.name}" if parent_node and parent_node.call_stack \ + else event.name + self._root_torch_op_node = TorchOpNode() + self._cur_torch_op_node = self._root_torch_op_node + + @property + def module_name(self): + return self._module_name + + @property + def module_class(self): + pattern = re.compile('_[0-9]+$') + return pattern.sub('', self.name.split("/")[-1]) + + @property + def module_level(self): + return self._module_level + + @property + def name(self): + return self._event.name + + @property + def parent_node(self): + return self._parent_node + + @property + def child_nodes(self): + return self._child_nodes + + @property + def dur(self): + return self._event.dur + + @property + def start_time(self): + return self._event.start_time + + @property + def end_time(self): + return self._event.end_time + + @property + def host_self_dur(self): + return self.dur - sum([node.dur for node in self.child_nodes]) + + @property + def device_self_dur(self): + dur = 0 + for kernel_dict in self._kernel_self_list: + kernel_list = kernel_dict.get(self.kernels, []) + dur += sum([kernel.device_dur for kernel in kernel_list]) + return dur + + @property + def device_total_dur(self): + dur = 0 + for kernel_dict in self._kernel_total_list: + kernel_list = kernel_dict.get(self.kernels, []) + dur += sum([kernel.device_dur for kernel in kernel_list]) + return dur + + @property + def kernel_details(self): + kernel_details = "" + for kernel_dict in self._kernel_self_list: + kernel_list = kernel_dict.get(self.kernels, []) + for kernel in kernel_list: + kernel_details += kernel.kernel_details + return kernel_details + + @property + def toy_layer_api_list(self): + return self._root_torch_op_node.child_nodes + + @property + def call_stack(self): + return self._call_stack + + @staticmethod + def _binary_search(ts_time, 
parent_node):
+        if not parent_node.child_nodes:
+            return None
+        right = len(parent_node.child_nodes) - 1
+        left = 0
+        while right > left:
+            mid = left + ceil((right - left) / 2)
+            if ts_time >= parent_node.child_nodes[mid].start_time:
+                left = mid
+            else:
+                right = mid - 1
+        if parent_node.child_nodes[left].start_time < ts_time < parent_node.child_nodes[left].end_time:
+            return parent_node.child_nodes[left]
+        return None
+
+    def reset_call_stack(self, call_stack):
+        self._call_stack = call_stack
+
+    def update_child_nodes(self, node):
+        self._child_nodes.append(node)
+
+    def update_kernel_list(self, ts, kernel_list: list):
+        self._update_kernel_self_list(ts, kernel_list)
+        node = self
+        while node.parent_node:
+            node._update_kernel_total_list(ts, kernel_list)
+            node = node.parent_node
+
+    def _update_kernel_self_list(self, ts, kernel_list: list):
+        self._kernel_self_list.append({self.ts: ts, self.kernels: kernel_list})
+
+    def _update_kernel_total_list(self, ts, kernel_list: list):
+        self._kernel_total_list.append({self.ts: ts, self.kernels: kernel_list})
+
+    def find_module_call(self, ts_time):
+        call_module = self._binary_search(ts_time, self)
+        while call_module:
+            module = self._binary_search(ts_time, call_module)
+            if not module:
+                return call_module
+            call_module = module
+        return call_module
+
+    def find_torch_op_call(self, event):
+        while self._cur_torch_op_node:
+            if self._cur_torch_op_node != self._root_torch_op_node and \
+                    event.start_time > self._cur_torch_op_node.end_time:
+                self._cur_torch_op_node = self._cur_torch_op_node.parent
+                continue
+            tree_node = TorchOpNode(event, self._cur_torch_op_node)
+            self._cur_torch_op_node.add_child_node(tree_node)
+            self._cur_torch_op_node = tree_node
+            break
+
+    def update_torch_op_kernel_list(self):
+        top_node_list = self._root_torch_op_node.child_nodes
+        if not top_node_list:
+            return
+        top_node_list.sort(key=lambda x: x.start_time)
+        cur_index = 0
+        self._kernel_self_list.sort(key=lambda x: x.get(self.ts, 0))
+        for kernel_dict in self._kernel_self_list:
+            ts = kernel_dict.get(self.ts, 0)
+            kernel_list = kernel_dict.get(self.kernels, [])
+            while cur_index < len(top_node_list):
+                if ts > top_node_list[cur_index].end_time:
+                    cur_index += 1
+                    continue
+                if ts < top_node_list[cur_index].start_time:
+                    break
+                top_node_list[cur_index].update_kernel_list(kernel_list)
+                break
diff --git a/profiler/compare_tools/compare_backend/utils/name_function.py b/profiler/compare_tools/compare_backend/utils/name_function.py
index d83f9e4291c9c1afbcbc1e398741d2bdbedd8df8..cd79e8a03fa7a970ce97ad59f14fae12766f096b 100644
--- a/profiler/compare_tools/compare_backend/utils/name_function.py
+++ b/profiler/compare_tools/compare_backend/utils/name_function.py
@@ -1,3 +1,4 @@
+from compare_backend.utils.module_node import ModuleNode
 from compare_backend.utils.torch_op_node import TorchOpNode
 
 
@@ -41,3 +42,11 @@ class NameFunction:
             input_shape = ';\r\n'.join(data)
             return f'{self.args.op_name_map.get(op_node.name, op_node.name)}{input_shape}'
         return f'{self.args.op_name_map.get(op_node.name, op_node.name)}{op_node.input_shape}'
+
+    def get_module_name(self, module: ModuleNode) -> str:
+        if not self.args.op_name_map:
+            return module.module_name
+        module_name = module.module_name
+        for old_name, new_name in self.args.op_name_map.items():
+            module_name = module_name.replace(old_name, new_name)
+        return module_name
diff --git a/profiler/compare_tools/compare_backend/utils/torch_op_node.py b/profiler/compare_tools/compare_backend/utils/torch_op_node.py
index
45b9299ba0a23fcc0072546f73cec125890d2e21..690c46cd51c1e2991b0bfaf44e9af431cdad5151 100644 --- a/profiler/compare_tools/compare_backend/utils/torch_op_node.py +++ b/profiler/compare_tools/compare_backend/utils/torch_op_node.py @@ -60,6 +60,10 @@ class TorchOpNode: def memory_allocated(self): return self._memory_allocated_list + @property + def device_dur(self): + return sum([kernel.device_dur for kernel in self._kernel_list]) + def add_child_node(self, child_node): self._child_nodes.append(child_node) @@ -73,11 +77,16 @@ class TorchOpNode: cur_node._kernel_num += kernel_num cur_node = cur_node._parent_node + def update_kernel_list(self, kernel_list: list): + if not kernel_list: + return + self._kernel_list.extend(kernel_list) + def set_memory_allocated(self, memory_allocated: MemoryEvent): self._memory_allocated_list.append(memory_allocated) def is_step_profiler(self) -> bool: - return self.name.find("ProfilerStep#") != -1 + return self._event.is_step_profiler() def get_op_info(self) -> list: return [self.name, self.input_shape, self.input_type, self.call_stack] diff --git a/profiler/compare_tools/compare_backend/utils/tree_builder.py b/profiler/compare_tools/compare_backend/utils/tree_builder.py index f621453d1a5a2281425a01e93b3f89b012f35b88..34c1fe1a1f4046d1e60af107f5ee74484424174a 100644 --- a/profiler/compare_tools/compare_backend/utils/tree_builder.py +++ b/profiler/compare_tools/compare_backend/utils/tree_builder.py @@ -1,5 +1,7 @@ from queue import Queue +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.utils.module_node import ModuleNode from compare_backend.utils.torch_op_node import TorchOpNode @@ -7,10 +9,12 @@ class TreeBuilder: @classmethod def build_tree(cls, event_list: list, kernel_dict: dict, memory_list: list) -> TorchOpNode: root_node = TorchOpNode() - event_list.extend(memory_list) - event_list.sort(key=lambda x: x.start_time) + all_event_list = [] + all_event_list.extend(event_list) + all_event_list.extend(memory_list) + all_event_list.sort(key=lambda x: x.start_time) last_node = root_node - for event in event_list: + for event in all_event_list: while last_node: if last_node != root_node and event.start_time > last_node.end_time: last_node = last_node.parent @@ -53,3 +57,26 @@ class TreeBuilder: for child_node in tree_node.child_nodes: node_queue.put(child_node) return result_list + + @classmethod + def build_module_tree(cls, event_list: list, kernel_dict: dict): + root_node = ModuleNode(TraceEventBean({})) + event_list.sort(key=lambda x: x.start_time) + last_node = root_node + for event in event_list: + while last_node: + if last_node != root_node and event.start_time > last_node.end_time: + last_node = last_node.parent_node + continue + if event.is_x_mode(): + tree_node = ModuleNode(event, last_node) + last_node.update_child_nodes(tree_node) + last_node = tree_node + break + if last_node == root_node: + break + kernel_list = kernel_dict.get(event.start_time, []) + if kernel_list: + last_node.update_kernel_list(event.start_time, kernel_list) + break + return root_node diff --git a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py index c5e56c2f8b9a7ae0c1d1a596dbe81e3541f6ce73..7a33168da377ae77ab64fff0886e09eef065b4e2 100644 --- a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py +++ b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py @@ -23,20 +23,28 @@ class WorkSheetCreator: 
self._write_data() def _write_headers(self): - header_format = self._work_book.add_format(CellFormatType.BLUE_BOLD) + base_header_format = self._work_book.add_format(CellFormatType.GREEN_BOLD) + com_header_format = self._work_book.add_format(CellFormatType.YELLOW_BOLD) + com_index_range = [-1, -1] overhead = self._data.get("overhead", []) if overhead: base_path = f"Base Profiling: {self._args.base_profiling_path}" - self._work_sheet.merge_range(overhead[0], base_path, header_format) + self._work_sheet.merge_range(overhead[0], base_path, base_header_format) + com_index_range = [self._col_ids.index(overhead[1].split(":")[0][0]), + self._col_ids.index(overhead[1].split(":")[1][0])] comparison_path = f"Comparison Profiling: {self._args.comparison_profiling_path}" - self._work_sheet.merge_range(overhead[1], comparison_path, header_format) + self._work_sheet.merge_range(overhead[1], comparison_path, com_header_format) self._row_id += 2 for index, header in enumerate(self._data.get("headers")): + if index in range(com_index_range[0], com_index_range[1] + 1): + header_format = com_header_format + else: + header_format = base_header_format col_id = self._col_ids[index] self._work_sheet.set_column(f"{col_id}:{col_id}", header.get("width")) self._work_sheet.write(f"{col_id}{self._row_id}", header.get("name"), header_format) self._field_format[index] = self._work_book.add_format(header.get("type")) - if header.get("name") == ExcelConfig.DIFF_RATIO: + if header.get("name") in (ExcelConfig.DIFF_RATIO, ExcelConfig.DIFF_TOTAL_RATIO): self._diff_ratio_index = index self._row_id += 1 diff --git a/profiler/compare_tools/compare_interface/comparison_interface.py b/profiler/compare_tools/compare_interface/comparison_interface.py index b3ba5f63c8260aabe132955746184c7c28edbc2a..919095b310126f2ce0c9c3e6912fb10f24d149e9 100644 --- a/profiler/compare_tools/compare_interface/comparison_interface.py +++ b/profiler/compare_tools/compare_interface/comparison_interface.py @@ -1,39 +1,31 @@ -from compare_backend.comparison_generator import ComparisonGenerator -from compare_backend.utils.constant import Constant +import sys +import os +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "cluster_analyse")) +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -class Args: - def __init__(self, - base_profiling_path: str, - comparison_profiling_path: str, - enable_profiling_compare: bool = False, - enable_operator_compare: bool = False, - enable_memory_compare: bool = False, - enable_communication_compare: bool = False, - output_path: str = "", - max_kernel_num: int = None, - op_name_map: dict = None, - use_input_shape: bool = False, - gpu_flow_cat: str = ""): - self.base_profiling_path = base_profiling_path - self.comparison_profiling_path = comparison_profiling_path - self.enable_profiling_compare = enable_profiling_compare - self.enable_operator_compare = enable_operator_compare - self.enable_memory_compare = enable_memory_compare - self.enable_communication_compare = enable_communication_compare - self.output_path = output_path - self.max_kernel_num = max_kernel_num - self.op_name_map = op_name_map or {} - self.use_input_shape = use_input_shape - self.gpu_flow_cat = gpu_flow_cat +from compare_backend.comparison_generator import ComparisonGenerator +from compare_backend.disaggregate.overall_perf_interface import OverallPerfInterface +from compare_backend.utils.compare_args import Args +from compare_backend.utils.constant import Constant 
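The `_write_headers` rewrite above colors the merged base header and all base-side columns green, and the comparison half yellow, by deriving the comparison column span from the `overhead` merge ranges. A minimal, self-contained sketch of that two-tone header technique, using the real xlsxwriter API but with an illustrative file name, colors, and column set:

```python
import xlsxwriter

# Illustrative only: green block for base columns, yellow for comparison
# columns, mirroring the GREEN_BOLD / YELLOW_BOLD formats defined earlier.
workbook = xlsxwriter.Workbook("compare_demo.xlsx")
sheet = workbook.add_worksheet("OperatorCompare")
green = workbook.add_format({"fg_color": "#C6EFCE", "bold": True, "border": True})
yellow = workbook.add_format({"fg_color": "#FFEB9C", "bold": True, "border": True})
sheet.merge_range("B1:F1", "Base Profiling: /path/to/base", green)
sheet.merge_range("G1:K1", "Comparison Profiling: /path/to/comparison", yellow)
for col, name in zip("BCDEF", ("Order Id", "Operator Name", "Input Shape", "Input Type", "Kernel Details")):
    sheet.write(f"{col}2", name, green)   # base-side column headers stay green
for col, name in zip("GHIJK", ("Operator Name", "Input Shape", "Input Type", "Kernel Details", "Diff Ratio")):
    sheet.write(f"{col}2", name, yellow)  # comparison-side headers turn yellow
workbook.close()
```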
 class ComparisonInterface:
-    def __init__(self, base_profiling_path: str, comparison_profiling_path: str):
-        self._args = Args(base_profiling_path, comparison_profiling_path)
+    def __init__(self, base_profiling_path: str, comparison_profiling_path: str = ""):
+        self.base_profiling_path = base_profiling_path
+        if comparison_profiling_path:
+            self._args = Args(base_profiling_path=base_profiling_path,
+                              comparison_profiling_path=comparison_profiling_path)
 
     def compare(self, compare_type: str) -> dict:
         if compare_type == Constant.OVERALL_COMPARE:
             self._args.enable_profiling_compare = True
 
         return ComparisonGenerator(self._args).run_interface(compare_type)
+
+    def disaggregate_perf(self, compare_type: str) -> dict:
+        if compare_type != Constant.OVERALL_COMPARE:
+            print(f'[ERROR] Invalid compare_type value: {compare_type}, which is not supported.')
+            return {}
+        return OverallPerfInterface(self.base_profiling_path).run()
diff --git a/profiler/compare_tools/performance_compare.py b/profiler/compare_tools/performance_compare.py
index 4676355a28de80d73f01f75b23b102ebf4ff1a79..8de0a72cbdcfe958f465d338ded44bd0077c4bc0 100644
--- a/profiler/compare_tools/performance_compare.py
+++ b/profiler/compare_tools/performance_compare.py
@@ -18,6 +18,7 @@ def main():
     parser.add_argument("--enable_operator_compare", default=False, action='store_true', help="开启算子性能比较")
     parser.add_argument("--enable_memory_compare", default=False, action='store_true', help="开启算子内存比较")
     parser.add_argument("--enable_communication_compare", default=False, action='store_true', help="开启通信性能比较")
+    parser.add_argument("--disable_details", default=False, action='store_true', help="不展示比对明细")
     parser.add_argument("--output_path", type=str, default='', help="性能数据比对结果的存放路径")
     parser.add_argument("--max_kernel_num", type=int, help="每个torch op的kernel数量限制")
     parser.add_argument("--op_name_map", type=ast.literal_eval, default={},
diff --git a/profiler/prof_common/__init__.py b/profiler/prof_common/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/prof_common/analyze_dict.py b/profiler/prof_common/analyze_dict.py
new file mode 100644
index 0000000000000000000000000000000000000000..a06577e8fb49436f7b867e8e74495cc76a6a58b2
--- /dev/null
+++ b/profiler/prof_common/analyze_dict.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
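With the rewritten `ComparisonInterface` above, the overall-performance breakdown can run from a single collection. A hedged usage sketch; the import assumes `profiler/compare_tools` is on `sys.path` (the module now appends its own parents), and the data paths are hypothetical:

```python
from compare_interface.comparison_interface import ComparisonInterface

# Two-sided overall comparison: both collections are parsed and compared.
result = ComparisonInterface("/data/base_ascend_pt", "/data/new_ascend_pt").compare("overall")

# One-sided breakdown: only the base path is needed, no Args object is built.
perf_data = ComparisonInterface("/data/base_ascend_pt").disaggregate_perf("overall")
```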
+class AnalyzeDict(dict): + def __getstate__(self): + return self.__dict__ + + def __setstate__(self, d): + self.__dict__.update(d) + + def __getattr__(self, key: str): + if key not in self: + return {} + + value = self[key] + if isinstance(value, dict): + value = AnalyzeDict(value) + return value diff --git a/profiler/prof_common/constant.py b/profiler/prof_common/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..5789b89cb1a248977b64839339395acc5288b2ab --- /dev/null +++ b/profiler/prof_common/constant.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +class Constant(object): + COLLECTION_PATH = "collection_path" + ANALYSIS_MODE = "analysis_mode" + CONTEXT_SETTINGS = dict(help_option_names=['-H', '-h', '--help']) \ No newline at end of file diff --git a/profiler/requirements.txt b/profiler/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bf2069a60788587f8e4e0129bbc031b1d1daaaf --- /dev/null +++ b/profiler/requirements.txt @@ -0,0 +1,2 @@ +-r requirements/build.txt +-r requirements/tests.txt \ No newline at end of file diff --git a/profiler/requirements/build.txt b/profiler/requirements/build.txt new file mode 100644 index 0000000000000000000000000000000000000000..c750ff83dedf2f6b6823f45a747c95d395e1ccb5 --- /dev/null +++ b/profiler/requirements/build.txt @@ -0,0 +1,12 @@ +click +tabulate +networkx +jinja2 +PyYaml +tqdm +prettytable +ijson +requests +xlsxwriter +sqlalchemy +urllib3<2.0 \ No newline at end of file diff --git a/profiler/requirements/tests.txt b/profiler/requirements/tests.txt new file mode 100644 index 0000000000000000000000000000000000000000..8313304e687428a406a5962ff5aef4d16620c167 --- /dev/null +++ b/profiler/requirements/tests.txt @@ -0,0 +1,17 @@ +pytest==6.2.4 +pytest-cov==2.12.0 +pytest-mock==3.6.1 +pytest-cookies==0.6.1 +mock==4.0.3 +click +tabulate +networkx +jinja2 +PyYaml +tqdm +prettytable +ijson +requests +xlsxwriter +sqlalchemy +urllib3<2.0 \ No newline at end of file diff --git a/profiler/setup.cfg b/profiler/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..26f243faa5199ade53fc6f42959d475ac5539f29 --- /dev/null +++ b/profiler/setup.cfg @@ -0,0 +1,32 @@ +[isort] +line_length = 120 +multi_line_output = 0 +known_standard_library = setuptools +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true + +[flake8] +exclude = tests/* +max-line-length = 120 + +[pycodestyle] +max-line-length = 120 +exclude = tests/* + +[yapf] +BASED_ON_STYLE = pep8 +BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true +SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true +COLUMN_LIMIT = 120 + +[aliases] +test=pytest + +[mypy] +ignore_missing_imports = True + +[mypy-tests.*] +ignore_errors = True diff --git a/profiler/setup.py b/profiler/setup.py new file mode 100644 index 
0000000000000000000000000000000000000000..19c8729d91a0e8bad55f35c9eda34ae6f1ebd918 --- /dev/null +++ b/profiler/setup.py @@ -0,0 +1,44 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +import os.path + +from setuptools import find_packages, setup # type: ignore + +extras = { + "test": [ + "pytest==6.2.4", + "pytest-cookies==0.6.1", + "pytest-cov==2.12.0", + "mock==4.0.3", + ] +} + +with open('requirements/build.txt', 'r') as f: + requires = f.read().splitlines() + +with open('requirements/tests.txt', 'r') as f: + tests_requires = f.read().splitlines() +tests_requires.extend(set(requires)) + +with open('version.txt', 'r') as f: + version = f.read().strip() + +root_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) +setup( + name="msprof-analyze", + version=version, + description="MindStudio Profiler Analyze Tools", + package_dir={"": root_path}, + packages=find_packages(root_path), + include_package_data=False, + python_requires='>=3.7', + install_requires=requires, + package_data={'': ['*.json', '*.ini', '*.txt', '*.yaml', '*.html', '*.ipynb']}, + tests_require=tests_requires, + entry_points=""" + [console_scripts] + msprof-analyze=profiler.cli.entrance:msprof_analyze_cli + """ +) + +# build cmd: pip install --editable . diff --git a/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py b/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py new file mode 100644 index 0000000000000000000000000000000000000000..8830d495992cfcd2c26024863f8b644d5b4c6902 --- /dev/null +++ b/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py @@ -0,0 +1,223 @@ +import json +import os +import shutil +import stat +import csv +import unittest + +from advisor_backend.interface import Interface +from advisor_backend.compute_advice.npu_slow_advice import NpuSlowAdvice + + +class TestNpuSlowAdvice(unittest.TestCase): + ASCEND_PT_DIR = "./ascend_pt" + OUTPUT_DIR = "./ascend_pt/ASCEND_PROFILER_OUTPUT" + interface = None + err_interface = None + + def tearDown(self): + if os.path.exists(TestNpuSlowAdvice.ASCEND_PT_DIR): + shutil.rmtree(TestNpuSlowAdvice.ASCEND_PT_DIR) + + def setUp(self): + if os.path.exists(TestNpuSlowAdvice.ASCEND_PT_DIR): + shutil.rmtree(TestNpuSlowAdvice.ASCEND_PT_DIR) + if not os.path.exists(TestNpuSlowAdvice.ASCEND_PT_DIR): + os.makedirs(TestNpuSlowAdvice.ASCEND_PT_DIR) + if not os.path.exists(TestNpuSlowAdvice.OUTPUT_DIR): + os.makedirs(TestNpuSlowAdvice.OUTPUT_DIR) + + @classmethod + def get_basic_trace_view(cls): + # Python pid + py_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 1, "args": {"name": "Python"}} + # ascend pid + ascend_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 4, "args": {"name": "Ascend Hardware"}} + # ascend pid + cann_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 5, "args": {"name": "CANN"}} + # ascend hardware ops + ah_event1 = {"ph": "X", "name": "Slice1", "ts": "1699529623106750", "dur": 100, "tid": 3, "pid": 4, "args": {}} + ah_event2 = {"ph": "X", "name": "Slice2", "ts": "1699529623106751", "dur": 80, "tid": 3, "pid": 4, "args": {}} + # flow event + flow_event_s = {"ph": "s", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "200", "args": {}} + flow_event_e = {"ph": "f", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "1699529623106750", "args": {}} + return [py_pid_data, ascend_pid_data, cann_pid_data, ah_event1, ah_event2, flow_event_s, flow_event_e] + + @classmethod + def create_profiler_info_json(cls): + info = { + 
"config": { + "common_config": { + "with_stack": True, + "activities": ["ProfilerActivity.CPU", "ProfilerActivity.NPU"] + } + } + } + with os.fdopen(os.open(f"{TestNpuSlowAdvice.ASCEND_PT_DIR}/profiler_info_0.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(info)) + + @classmethod + def create_old_version_trace_view(cls): + basic_info = cls.get_basic_trace_view() + + # python ops + py_event1 = {"ph": "X", "cat": "python_function", "name": "aten::slice", "ts": "200", "dur": 100, "tid": 2, + "pid": 1, + "args": {"Call stack": "/root/test/slice.py(116);\r\n/root/torch/module.py"}} + py_event2 = {"ph": "X", "cat": "python_function", "name": "slice", "ts": "199", "dur": 200, "tid": 2, "pid": 1, + "args": {"Call stack": "/root/test/slice.py(116);\r\n/root/torch/module.py"}} + raw_data = [ + *basic_info, py_event1, py_event2 + ] + + with os.fdopen(os.open(f"{TestNpuSlowAdvice.OUTPUT_DIR}/trace_view.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(raw_data)) + + @classmethod + def create_new_version_trace_view(cls): + basic_info = cls.get_basic_trace_view() + # python ops + py_event1 = {"ph": "X", "name": "aten::slice", "ts": "200", "dur": 100, "tid": 2, "pid": 1, "args": {}} + py_event2 = {"ph": "X", "name": "slice", "ts": "199", "dur": 105, "tid": 2, "pid": 1, "args": {}} + py_event3 = {"ph": "X", "cat": "python_function", "name": "/root/test/slice.py(116)", "ts": "198", "dur": 120, + "tid": 2, "pid": 1, + "args": {}} + py_event4 = {"ph": "X", "cat": "python_function", "name": "/root/torch/module.py", "ts": "197", "dur": 150, + "tid": 2, "pid": 1, "args": {}} + + raw_data = [ + *basic_info, py_event1, py_event2, py_event3, py_event4 + ] + + with os.fdopen(os.open(f"{TestNpuSlowAdvice.OUTPUT_DIR}/trace_view.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(raw_data)) + + @classmethod + def create_kernel_details(cls): + # create csv files + csv_header = ['Step Id', 'Model ID', 'Task ID', 'Stream ID', 'Name', 'Type', 'Accelerator Core', + 'Start Time(us)', + 'Duration(us)', 'Wait Time(us)', 'Block Dim', 'Mix Block Dim', 'Input Shapes', 'Input Data Types', + 'Input Formats', 'Output Shapes', 'Output Data Types', 'Output Formats', 'Context ID', + 'aicore_time(us)', + 'aic_total_cycles', 'aic_mac_ratio', 'aic_mac_int8_ratio', 'aic_cube_fops', + 'aic_vector_fops', + 'aiv_time(us)', 'aiv_total_cycles', 'aiv_vec_fp32_ratio', 'aiv_vec_fp16_ratio', + 'aiv_vec_int32_ratio', + 'aiv_vec_misc_ratio', 'aiv_cube_fops', 'aiv_vector_fops'] + # RED: size=0.0492 MB, throughput=2.32 GB/s, task_duration=21.2us + csv_row1 = [1, 4294967295, 1265, 16, 'Slice1', 'Slice', 'AI_VECTOR_CORE', "1699529623106750\t", 21.2, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 0, 0, 0, 0, 0, 0, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # YELLOW: size=0.0492 MB, throughput=984 GB/s, task_duration=0.05us + csv_row2 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'AI_VECTOR_CORE', "1699529623106751\t", 0.05, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 0, 0, 0, 0, 0, 0, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # WHITE: AI_CPU + csv_row3 = [1, 4294967295, 1265, 16, 'Swish1', 'Swish', 'AI_CPU', "1699529623106752\t", 3.14, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 
'N/A'] + # GREEN: size=0.0492 MB, throughput=15.67 GB/s, task_duration = 3.14us + csv_row4 = [1, 4294967295, 1265, 16, 'Mul1', 'Mul', 'AI_VECTOR_CORE', "1699529623106753\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 0, 0, 0, 0, 0, 0, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # RED: aic_mac_ratio=0.2 + csv_row5 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'AI_CORE', "1699529623106754\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 28888, 0.2, 0.1, 0.1, 0.7, + 0, 0, 0, 0, 0, 0, 0, 0] + # GREEN: aic_mac_ratio=0.85 + csv_row6 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'AI_CORE', "1699529623106754\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 38888, 0.85, 0.1, 0.1, 0.7, + 0, 0, 0, 0, 0, 0, 0, 0] + # YELLOW: aic_mac_ratio=0.64 + csv_row7 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'AI_CORE', "1699529623106754\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 48888, 0.64, 0.1, 0.1, 0.7, + 0, 0, 0, 0, 0, 0, 0, 0] + # WHITE: MIX_AIC + csv_row8 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'MIX_AIC', "1699529623106751\t", 0.05, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 28888, 0.4, 0.1, 0.1, 0.7, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # WHITE: MIX_AIV + csv_row9 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'MIX_AIV', "1699529623106751\t", 0.05, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 28888, 0.4, 0.1, 0.1, 0.7, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + with os.fdopen(os.open(f"{TestNpuSlowAdvice.OUTPUT_DIR}/kernel_details.csv", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + csv_writer = csv.writer(fp) + csv_writer.writerow(csv_header) + csv_writer.writerow(csv_row1) + csv_writer.writerow(csv_row2) + csv_writer.writerow(csv_row3) + csv_writer.writerow(csv_row4) + csv_writer.writerow(csv_row5) + csv_writer.writerow(csv_row6) + csv_writer.writerow(csv_row7) + csv_writer.writerow(csv_row8) + csv_writer.writerow(csv_row9) + + def test_run_should_return_empty_when_ascend_pt_path_not_exist(self): + interface = Interface("") + data = interface.get_data('compute', 'npu_slow') + self.assertEqual(0, len(data)) + + def test_run_should_return_empty_when_there_is_no_kernel_details(self): + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + self.assertEqual(0, len(data)) + + def test_run_should_return_7_data_without_call_stack_when_json_not_exist(self): + self.create_kernel_details() + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") + self.assertEqual(9, len(data)) + self.assertEqual("", call_stack) + + def test_run_should_return_7_data_with_call_stack_when_new_trace_view_exists(self): + self.create_profiler_info_json() + self.create_kernel_details() + self.create_new_version_trace_view() + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + slow_op_data = data[data["color"] == "RED"] + NpuSlowAdvice.save_to_excel(data, file_path=os.path.join(self.ASCEND_PT_DIR, "slow_op.xlsx")) + call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") + self.assertEqual(9, len(data)) + 
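+        # kernel_details.csv is written with 9 kernel rows above, hence 9
+        # records; only the two RED rows (the low-throughput Slice1 and the
+        # aic_mac_ratio=0.2 Add1) should be flagged as slow ops.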
self.assertEqual(2, len(slow_op_data)) + print(call_stack) + call_stack_res = "/root/torch/module.py\n" \ + "/root/test/slice.py(116)" + self.assertEqual(call_stack_res, call_stack) + + def test_run_should_return_7_data_with_call_stack_when_old_trace_view_exists(self): + self.create_profiler_info_json() + self.create_kernel_details() + self.create_old_version_trace_view() + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + slow_op_data = data[data["color"] == "RED"] + NpuSlowAdvice.save_to_excel(data, file_path=os.path.join(self.ASCEND_PT_DIR, "slow_op.xlsx")) + call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") + self.assertEqual(9, len(data)) + self.assertEqual(2, len(slow_op_data)) + print(call_stack) + call_stack_res = "/root/test/slice.py(116)\n\r\n" \ + "/root/torch/module.py" + self.assertEqual(call_stack_res, call_stack) diff --git a/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_timeline_op_compile_checker.py b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_timeline_op_compile_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..9060bfb8d30d62c6ce357a1ff8b9d4cf281a4442 --- /dev/null +++ b/profiler/test/ut/advisor/advisor_backend/timeline_advice/test_timeline_op_compile_checker.py @@ -0,0 +1,46 @@ +import unittest +import os +import sys + +work_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))))) +sys.path.insert(0, work_path) +from unittest.mock import patch +from profiler.advisor.analyzer.schedule import dispatch +from profiler.advisor.analyzer.schedule.dispatch.timeline_op_dispatch_analyzer import OpDispatchAnalyzer +from profiler.advisor.dataset.timeline_event_dataset import TimelineEventDataset +from profiler.advisor.display.html.render import HTMLRender +from profiler.test.ut.advisor.advisor_backend.tools.tool import recover_env + + +class TestOperatorDispatchAnalyzer(unittest.TestCase): + @classmethod + def tearDownClass(cls) -> None: + recover_env() + + @patch("profiler.advisor.common.constant.MAX_OP_COMPILE_NUM", 5) + def test_ops_dispatch_analyzer(self): + kwargs = {"analysis_mode": "all"} + data_root_dir = os.path.dirname(os.path.realpath(__file__)) + op_dispatch_analyzer = OpDispatchAnalyzer(data_root_dir, **kwargs) + + results = op_dispatch_analyzer.optimize(**kwargs) + self.assertTrue(results.page_dict) + self.assertIsNotNone(results.sheet_recorder.sheet_data.get("operator dispatch")) + + @patch("profiler.advisor.common.constant.MAX_OP_COMPILE_NUM", 5) + def test_ops_dispatch_make_render(self): + kwargs = {"analysis_mode": "timeline"} + data_root_dir = os.path.dirname(os.path.realpath(__file__)) + op_dispatch = OpDispatchAnalyzer(data_root_dir, **kwargs) + event_dataset = op_dispatch.get_first_data_by_key(op_dispatch.dataset_list, TimelineEventDataset.get_key()) + + op_dispatch.get_op_compile_info(event_dataset) + html_render = HTMLRender() + op_dispatch.make_render(html_render) + self.assertTrue(len(html_render.render_list) >= 1) + + +if __name__ == '__main__': + tester = TestOperatorDispatchAnalyzer() + tester.test_ops_dispatch_analyzer() + tester.test_ops_dispatch_make_render() diff --git a/profiler/test/ut/advisor/advisor_backend/tools/__init__.py b/profiler/test/ut/advisor/advisor_backend/tools/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/test/ut/advisor/advisor_backend/tools/tool.py b/profiler/test/ut/advisor/advisor_backend/tools/tool.py new file mode 100644 index 0000000000000000000000000000000000000000..6c6f690d3a1b091a67babbd1708dc8076fbb26a3 --- /dev/null +++ b/profiler/test/ut/advisor/advisor_backend/tools/tool.py @@ -0,0 +1,29 @@ +import os +import re +import shutil +import shlex +from subprocess import Popen, PIPE + + +def delete_file(pattern, work_path): + file_list = os.listdir(work_path) + for file_name in file_list: + if re.match(pattern, file_name): + + os.remove(os.path.join(work_path, file_name)) + + +def recover_env(work_path="./"): + if os.path.exists("./log"): + shutil.rmtree("./log") + + if os.path.exists("./tune_ops_file.cfg"): + os.remove("./tune_ops_file.cfg") + + delete_file(r"ma_advisor_+", work_path) + + +def run_command(cmd): + # Make sure the process output can be displayed on the console + p = Popen(shlex.split(cmd, posix=False), stdout=PIPE, bufsize=0, universal_newlines=False) + p.wait() diff --git a/profiler/test/ut/compare_tools/profiling_parser/test_gpu_profiling_parser.py b/profiler/test/ut/compare_tools/profiling_parser/test_gpu_profiling_parser.py index 388b92ec4167821aeae03799b173ac226d4dd1d9..04468721504b1e1133b659a4d497c4ef86ed0414 100644 --- a/profiler/test/ut/compare_tools/profiling_parser/test_gpu_profiling_parser.py +++ b/profiler/test/ut/compare_tools/profiling_parser/test_gpu_profiling_parser.py @@ -71,6 +71,7 @@ class TestGpuProfilingParser(unittest.TestCase): res._trace_events = [TraceEventBean(event) for event in self.trace_events] res._result_data = ProfilingResult("GPU") res._compute_stream_id = 3 + res._flow_dict = {} res._marks = defaultdict(int) res._calculate_performance_time() self.assertEqual(res._result_data.overall_metrics.e2e_time, 98) diff --git a/profiler/version.txt b/profiler/version.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cfbc905b39f65131ba18e561d236557fbdc52cc --- /dev/null +++ b/profiler/version.txt @@ -0,0 +1 @@ +1.1.1 \ No newline at end of file diff --git a/sample/README.md b/sample/README.md index 167b1a01cbd87c75eb6a6479a39fc198360a402f..1bc2a76161ed8b83312bb3965d5f8d672b4e9d75 100644 --- a/sample/README.md +++ b/sample/README.md @@ -5,12 +5,61 @@ 如果考虑商用集成,推荐使用CANN软件包中的AscendC样例工程,比如:ascendc_kernel_cmake目录。本项目中的工程就是基于其进行简化仅用于快速验证。 +说明:该sample目录中,每个最小目录就是一个完整的样例工程。这些样例工程本身可能以为依赖的不同存在差异。 + ## 依赖说明 安装CANN包,并使能环境变量,并确保```ASCEND_HOME_PATH```生效,可以在CANN包安装目录下使能: ``` source set_env.sh ``` +## 目录介绍 +整体目录结构如下: +``` +- sample + |- build # 编译并运行所有样例内容(建议按需使用,此处命令可以参考 + |- normal_sample # 纯C/C++的AscendC单算子极简工程,可配合msdebug和msprof工具 + |- cube_only # 仅含aic的AscendC单算子极简工程 + |- mix # mix算子的AscendC单算子极简工程 + |- vec_only # 仅含aiv的AscendC单算子极简工程 + |- pytorch_adapter # 适配pytorch的AscendC单算子极简工程,可配合msdebug和msprof工具 + |- jit_compile # jit模式,运行时编译使用 + |- with_setuptools # 编译成wheel包安装使用 + |- sanitizer_sample # 异常样例,用于配合mssanitizer工具 + |- racecheck # 含竞争问题的样例 + |- xx # 其他异常样例 +``` + +如果你关注自定义算子的pytorch框架适配,详见[此处](./pytorch_adapter/README.md) + + +## 算子调试 msdebug +若使用msdebug进行上板调试,还需要额外调整,具体如下: +1. 编译阶段:在```sample\normal_sample\vec_only```相对路径下的```Makefile```文件中修改如下内容: + + 调试信息增强,并扩大栈空间: + ``` + COMPILER_FLAG := -xcce -O2 -std=c++17 + 修改为: + COMPILER_FLAG := -xcce -O0 -std=c++17 -g --cce-ignore-always-inline=true + ``` + +2. 运行阶段: +``` +msdebug ./*.fatbin +``` + +## 内存检测 sanitizer +1. 
编译阶段:在编译过程中添加```--cce-enable-sanitizer -g```参数, 在链接过程中添加```--cce-enable-sanitizer```参数。(现样例中已在Makefile中添加),执行如下命令: +``` +make +``` + +2. 运行阶段: +``` +mssanitizer ./*.fatbin # 默认进行memcheck检查 +``` + + ## 算子调优 算子调优工具可以支持上板和仿真算子的调优,下面将以vec_only中的算子为例,进行工具使用的实战命令讲解 @@ -84,30 +133,3 @@ source set_env.sh └── trace.json # 算子所有核的流水图 ``` 4. 更多指标信息请参考算子开发工具使用手册。 - -## 算子调试msdebug -若使用msdebug进行上板调试,还需要额外调整,具体如下: -1. 编译阶段:在```sample\normal_sample\vec_only```相对路径下的```Makefile```文件中修改如下内容: - + 调试信息增强,并扩大栈空间: - ``` - COMPILER_FLAG := -xcce -O2 -std=c++17 - 修改为: - COMPILER_FLAG := -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true - -## 内存检测 sanitizer -### sanitizer_sample目录介绍 - -此目录下为sanitizer对应的样例库,包含竞争检测和内存检测相关的样例。 - -#### Racecheck目录介绍 - -Racecheck为竞争检测相关的样例。 - -raw_error_kernel.cpp文件为UB上先读后写竞争和GM上先写后读竞争问题的样例。 - - -运行阶段: - -``` -/usr/local/Ascend/ascend-toolkit/latest/tools/mssanitizer/bin/mssanitizer --tool=racecheck ./raw_error.fatbin -``` \ No newline at end of file diff --git a/sample/normal_sample/mix/Makefile b/sample/normal_sample/mix/Makefile index e19120ff6697d9ecf6a15ec26d1aaf51b41ed5ef..e7676a20ed5037030cc15ee0f67484ff0c08f369 100644 --- a/sample/normal_sample/mix/Makefile +++ b/sample/normal_sample/mix/Makefile @@ -8,7 +8,7 @@ DAV_FLAG := --cce-aicore-arch=dav-c220 ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 TILING_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include -LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++ -lprofapi -lmmpa +LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++ -lprofapi -lmmpa -lascendalog -lregister -lerror_manager LINK_STATIC_LIBS := ${ASCEND_HOME_PATH}/lib64/libascendc_runtime.a all: build diff --git a/sample/normal_sample/mix/main.cpp b/sample/normal_sample/mix/main.cpp index c26a055fb4e2232fecd9ee9f747d1d241bb914e9..0b68f4c546552b6af416b4898cd4a366959a1cf6 100644 --- a/sample/normal_sample/mix/main.cpp +++ b/sample/normal_sample/mix/main.cpp @@ -111,7 +111,7 @@ int32_t main(int32_t argc, char *argv[]) size_t biasSize = 640 * sizeof(float); size_t ySize = 1024 * 640 * sizeof(float); size_t workspaceSize = 16 * 1024 * 1024 * sizeof(float); // AscendC::GetUserWorkspace中预留空间 - size_t tilingSize = 48 * sizeof(uint32_t); + size_t tilingSize = 96 * sizeof(uint32_t); uint32_t blockDim = 1; CHECK_ACL(aclInit(nullptr)); diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a2b1ba63570058ac954a121f4b14b396f5dace81 --- /dev/null +++ b/sample/pytorch_adapter/README.md @@ -0,0 +1,53 @@ +# 自定义算子的pytorch框架适配说明 + +## 简介 +昇腾提供丰富的算子接入框架的方式,此处将介绍最简单的一种,每个目录中都是一个独立的可使用的工程 + +## 依赖 +与业内pytorch的算子介入方式相同,算子接入框架需要保障设备上有正确的pytorch版本(我们还依赖torch_npu版本) + +pytorch版本可由pip安装,torch_npu版本详见[此处](https://gitee.com/ascend/pytorch/releases),请选择与pytorch适配的torch_npu版本。 + +## 工程介绍 +整体工程目录如下: +``` +- pytorch_adapter + |- jit_compile # 实时编译的接入方式 + |- add_adapter.cpp # 使用算子动态库接口完成算子在pytorch框架的适配 + |- add_kernel.cpp # 昇腾算子实现,并提供host侧的动态库接口 + |- main.py # python的入口,实现整体集成 + |- Makefile # 用以生成昇腾算子的host侧动态库的编译脚本 + |- with_setuptools # wheel包的接入方式 + |- add_adapter.cpp + |- add_kernel.cpp + |- Makefile + |- setup.py # setuptools的入口,支持编译并打包生成wheel包 + |- 
test.py  # 测试wheel包功能的入口
+```
+
+## 工程使用
+
+### jit_compile工程
+执行如下命令,就会在运行过程中,现场生成python模块并使用:
+```
+python main.py
+```
+
+### setuptools工程
+针对with_setuptools工程,可以编译出可安装的wheel包,便于多机部署使用。
+
+1. 执行如下命令可以编译出软件包(setuptools可以支持多种方式,比如:build,install等,此处不一一展示):
+```
+python setup.py bdist_wheel  # 编译出wheel包,在dist目录下
+```
+
+2. 到```dist```目录下用pip命令安装对应软件包。
+
+3. 执行测试脚本
+```
+python test.py
+```
+
+## 其他
+1. 此处样例使用的是静态tiling,如果使用动态tiling,则可以在adapter.cpp中对Tensor的shape进行分析,选择合适tiling。(这部分是流程中必须的,只是可能在不同位置,比如aclnn中,这部分在接口实现;此处,我们本身也可以对add_custom_do进行封装,将tiling内置。)
\ No newline at end of file
diff --git a/sample/pytorch_adapter/jit_compile/Makefile b/sample/pytorch_adapter/jit_compile/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ec9115f377a578677470b89f365583dfcf246515
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/Makefile
@@ -0,0 +1,20 @@
+# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化
+ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
+
+COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec  # 参考device_config.cmake中CMAKE_C_COMPILER配置
+COMPILER_FLAG := -xcce -O2 -std=c++17
+DYNAMIC_LIB_FLAG := -fPIC -shared
+DAV_FLAG := --cce-aicore-arch=dav-c220-vec
+ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include  # 参考device_intf.cmake的配置简化
+
+all: build
+
+build: libcustom_kernels.so
+
+# 后续如果要扩展,把多个kernel的cpp都加到后面
+libcustom_kernels.so: add_kernel.cpp
+	$(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^
+
+.PHONY: clean
+clean:
+	rm *.so
\ No newline at end of file
diff --git a/sample/pytorch_adapter/jit_compile/add_adapter.cpp b/sample/pytorch_adapter/jit_compile/add_adapter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c65e60ec596fe8b5627e06f678549b5f2f05660
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/add_adapter.cpp
@@ -0,0 +1,128 @@
+#include <torch/extension.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "torch_npu/csrc/framework/OpCommand.h"
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
+
+// 为NPU设备注册前向实现
+at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other)
+{
+    // 创建输出内存
+    at::Tensor result = at::Tensor(self);
+    // 将pytorch中的结构翻译成为CANN认识的数据类型和结构
+    // 1. (重要)通过对tensor的shape分析,选择合适的tiling(该算子为了简化,固定了tiling,只有特定shape下计算才正确)
+    // 2. 对数据类型和格式转换 -- 此处无需数据格式处理,直接使用
+    auto stream = c10_npu::getCurrentNPUStream().stream(false);
+    auto x = self.storage().data();
+    auto y = other.storage().data();
+    auto z = result.storage().data();
+
+    uint32_t blockDim = 8;
+    auto callback = [stream, blockDim, x, y, z]() -> int {
+        add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z);
+        return 0;  // 此处可以通过某种方式获取算子执行结果,还未实现
+    };
+    // 下发算子
+    at_npu::native::OpCommand cmd;
+    cmd.Name("my_add").SetCustomHandler(callback).Run();
+    return result;
+}
+
+// 为NPU设备注册反向实现
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_npu(const at::Tensor &self)
+{
+    at::Tensor result = at::Tensor(self);  // 创建输出内存
+
+    return {result, result};
+}
+
+// 为Meta设备注册前向实现
+at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other)
+{
+    return empty_like(self);
+}
+
+// 为Meta设备注册反向实现
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_meta(const at::Tensor &self)
+{
+    auto result = empty_like(self);
+    return std::make_tuple(result, result);
+}
+
+// 寻找注册在该op上的不同设备的实现
+at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other)
+{
+    static auto op =
+        torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed<decltype(my_add_impl)>();
+    return op.call(self, other);
+}
+// 寻找注册在该op上的不同设备的实现
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl(const at::Tensor &self)
+{
+    static auto op = torch::Dispatcher::singleton()
+                         .findSchemaOrThrow("myaten::my_add_backward", "")
+                         .typed<decltype(my_add_backward_impl)>();
+    return op.call(self);
+}
+
+// 在myaten命名空间里注册my_add和my_add_backward两个schema
+TORCH_LIBRARY(myaten, m)
+{
+    m.def("my_add(Tensor self, Tensor other) -> Tensor");
+    m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)");
+}
+
+// 通过继承torch::autograd::Function类实现前反向绑定
+class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
+public:
+    static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other)
+    {
+        at::AutoDispatchBelowADInplaceOrView guard;
+        return my_add_impl(self, other);
+    }
+
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
+    {
+        auto grad_output = grad_outputs[0];
+        auto result = my_add_backward_impl(grad_output);
+        return {std::get<0>(result), std::get<1>(result)};
+    }
+};
+
+at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other)
+{
+    return MyAddFunction::apply(self, other);
+}
+
+// 给op绑定NPU的自动求导实现
+// 如果是pytorch 2.1以下的版本,AutogradPrivateUse1需要改成AutogradXLA
+TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_autograd);
+}
+
+// 为NPU设备注册前反向实现
+// NPU设备在pytorch 2.1及以上版本使用的设备名称是PrivateUse1,在2.1以下版本用的是XLA,如果是2.1以下版本PrivateUse1需要改成XLA
+TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_npu);
+    m.impl("my_add_backward", &my_add_backward_impl_npu);
+}
+
+// 为Meta设备注册前反向实现
+TORCH_LIBRARY_IMPL(myaten, Meta, m)
+{
+    m.impl("my_add", &my_add_impl_meta);
+    m.impl("my_add_backward", &my_add_backward_impl_meta);
+}
+
+// 通过pybind将c++接口和python接口绑定
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("my_add", &my_add_impl_autograd, "x + y");
+}
diff --git a/sample/pytorch_adapter/jit_compile/add_kernel.cpp b/sample/pytorch_adapter/jit_compile/add_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aa62e093633de1f5bddc8d9b7f80fb58831bdb9
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/add_kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ *
+ * Function : z = x + y
+ * This sample is a very basic sample that implements vector add on Ascend platform.
diff --git a/sample/pytorch_adapter/jit_compile/add_kernel.cpp b/sample/pytorch_adapter/jit_compile/add_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aa62e093633de1f5bddc8d9b7f80fb58831bdb9
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/add_kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ *
+ * Function : z = x + y
+ * This sample is a very basic sample that implements vector add on the Ascend platform.
+ * In this sample:
+ * Length of x / y / z is 8*2048.
+ * Num of vector cores used in the sample is 8.
+ * Length for each core to compute is 2048.
+ * Each core splits its data into 8 tiles of 2048/8=256 elements; with double buffering,
+ * each loop iteration processes half a tile (128 elements).
+ */
+#include "kernel_operator.h"
+using namespace AscendC;
+constexpr int32_t TOTAL_LENGTH = 8 * 2048;                            // total length of data
+constexpr int32_t USE_CORE_NUM = 8;                                   // num of cores used
+constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;         // length computed by each core
+constexpr int32_t TILE_NUM = 8;                                       // split data into 8 tiles for each core
+constexpr int32_t BUFFER_NUM = 2;                                     // tensor num for each queue
+constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // separated into 2 parts, due to double buffering
+
+class KernelAdd {
+public:
+    __aicore__ inline KernelAdd()
+    {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+    {
+        // get start index for current core, core parallel
+        xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        // pipe alloc memory to queue, the unit is bytes
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+    }
+    __aicore__ inline void Process()
+    {
+        // loop count needs to be doubled, due to double buffering
+        constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
+        // tiling strategy, pipeline parallel
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        // alloc tensors from queue memory
+        LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
+        LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
+        // copy the progress-th tile from global tensor to local tensor
+        DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH);
+        DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
+        // enqueue input tensors to VECIN queue
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress)
+    {
+        // dequeue input tensors from VECIN queue
+        LocalTensor<half> xLocal = inQueueX.DeQue<half>();
+        LocalTensor<half> yLocal = inQueueY.DeQue<half>();
+        LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
+        // call the Add instruction for the computation
+        Add(zLocal, xLocal, yLocal, TILE_LENGTH);
+        // enqueue the output tensor to VECOUT queue
+        outQueueZ.EnQue(zLocal);
+        // free input tensors for reuse
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        // dequeue output tensor from VECOUT queue
+        LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
+        // copy the progress-th tile from local tensor to global tensor
+        DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH);
+        // free output tensor for reuse
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    TPipe pipe;
+    // create queues for input; in this case depth is equal to buffer num
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
+    // create queue for output; in this case depth is equal to buffer num
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
+    GlobalTensor<half> xGm, yGm, zGm;
+};
+// implementation of the kernel function
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+{
+    KernelAdd op;
+    op.Init(x, y, z);
+    op.Process();
+}
+// Wrap the kernel function so that an ordinary compiler can resolve this symbol
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z)
+{
+    add_custom<<<blockDim, nullptr, stream>>>(x, y, z);
+}
\ No newline at end of file
diff --git a/sample/pytorch_adapter/jit_compile/main.py b/sample/pytorch_adapter/jit_compile/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..847a51f1c4787dcf353759d1115f352c1c760353
--- /dev/null
+++ b/sample/pytorch_adapter/jit_compile/main.py
@@ -0,0 +1,70 @@
+import os
+import subprocess
+import torch
+import torch_npu
+import torch.utils.cpp_extension
+from torch_npu.testing.testcase import TestCase, run_tests
+
+PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
+CUR_PATH = os.path.abspath(os.path.dirname(__file__))
+
+
+def compile_kernels():
+    # PyTorch has no extension for compiling Ascend device code, so invoke make manually here
+    subprocess.run("make")
+
+
+def compile_host():
+    extra_ldflags = []
+    extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib")
+    extra_ldflags.append("-ltorch_npu")
+    extra_ldflags.append(f"-L{CUR_PATH}/")
+    extra_ldflags.append("-lcustom_kernels")
+    extra_include_paths = []
+    extra_include_paths.append("./")
+    extra_include_paths.append(os.path.join(
+        PYTORCH_NPU_INSTALL_PATH, "include"))
+    extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join(
+        PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc"))
+
+    module = torch.utils.cpp_extension.load(
+        name="jit_extension",
+        sources=[
+            "add_adapter.cpp"
+        ],
+        extra_include_paths=extra_include_paths,
+        extra_ldflags=extra_ldflags,
+        verbose=True)
+    return module
+
+
+class TestCustomAdd(TestCase):
+    def test_add(self):
+        module = compile_host()
+        # The kernel currently uses static tiling, so the sizes here must match.
+        # Since add is elementwise and the kernel supports 8*2048 (see the kernel
+        # implementation), anything smaller than that should also be fine.
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float16)
+        y = torch.rand(length, device='cpu', dtype=torch.float16)
+
+        x_npu = x.npu()
+        y_npu = y.npu()
+        x_npu.requires_grad = True
+        y_npu.requires_grad = True
+        output = module.my_add(x_npu, y_npu)
+        # verify the backward capability
+        output.backward(output)
+
+        x.requires_grad = True
+        y.requires_grad = True
+        cpuout = torch.add(x, y)
+        cpuout.backward(cpuout)
+
+        self.assertRtolEqual(output, cpuout)
+        self.assertRtolEqual(x_npu.grad, x.grad)
+        self.assertRtolEqual(y_npu.grad, y.grad)
+
+
+if __name__ == '__main__':
+    compile_kernels()
+    run_tests()
diff --git a/sample/pytorch_adapter/with_setuptools/Makefile b/sample/pytorch_adapter/with_setuptools/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ec9115f377a578677470b89f365583dfcf246515
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/Makefile
@@ -0,0 +1,20 @@
+# Location of the CANN; mainly simplified from the contents of ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake
+ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
+
+COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # see the CMAKE_C_COMPILER setting in device_config.cmake
+COMPILER_FLAG := -xcce -O2 -std=c++17
+DYNAMIC_LIB_FLAG := -fPIC -shared
+DAV_FLAG := --cce-aicore-arch=dav-c220-vec
+ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # simplified from the settings in device_intf.cmake
+
+all: build
+
+build: libcustom_kernels.so
+
+# To extend this later, append the .cpp files of additional kernels here
+libcustom_kernels.so: add_kernel.cpp
+	$(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^
+
+.PHONY: clean
+clean:
+	rm *.so
\ No newline at end of file
diff --git a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c65e60ec596fe8b5627e06f678549b5f2f05660
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
@@ -0,0 +1,128 @@
+#include <torch/extension.h>
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "torch_npu/csrc/framework/OpCommand.h"
+
+using torch::autograd::AutogradContext;
+using torch::autograd::Function;
+using tensor_list = std::vector<at::Tensor>;
+using namespace at;
+
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
+
+// Forward implementation registered for the NPU device
+at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other)
+{
+    // Allocate the output memory
+    at::Tensor result = at::Tensor(self);
+    // Translate the PyTorch structures into data types and structures CANN understands:
+    // 1. (Important) Analyze the tensor shapes and choose a suitable tiling (to keep this operator
+    //    simple the tiling is fixed, so the computation is only correct for specific shapes)
+    // 2. Convert data types and formats -- no format handling is needed here, so use them directly
+    auto stream = c10_npu::getCurrentNPUStream().stream(false);
+    auto x = self.storage().data();
+    auto y = other.storage().data();
+    auto z = result.storage().data();
+
+    uint32_t blockDim = 8;
+    auto callback = [stream, blockDim, x, y, z]() -> int {
+        add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z);
+        return 0; // the operator's execution status could be fetched here in some way; not implemented yet
+    };
+    // Dispatch the operator
+    at_npu::native::OpCommand cmd;
+    cmd.Name("my_add").SetCustomHandler(callback).Run();
+    return result;
+}
+
+// Backward implementation registered for the NPU device
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_npu(const at::Tensor &self)
+{
+    at::Tensor result = at::Tensor(self); // allocate the output memory
+
+    return {result, result};
+}
+
+// Forward implementation registered for the Meta device
+at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other)
+{
+    return empty_like(self);
+}
+
+// Backward implementation registered for the Meta device
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_meta(const at::Tensor &self)
+{
+    auto result = empty_like(self);
+    return std::make_tuple(result, result);
+}
+
+// Look up the implementations registered on this op for the different devices
+at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other)
+{
+    static auto op =
+        torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed<decltype(my_add_impl)>();
+    return op.call(self, other);
+}
+// Look up the implementations registered on this op for the different devices
+std::tuple<at::Tensor, at::Tensor> my_add_backward_impl(const at::Tensor &self)
+{
+    static auto op = torch::Dispatcher::singleton()
+                         .findSchemaOrThrow("myaten::my_add_backward", "")
+                         .typed<decltype(my_add_backward_impl)>();
+    return op.call(self);
+}
+
+// Register the my_add and my_add_backward schemas in the myaten namespace
+TORCH_LIBRARY(myaten, m)
+{
+    m.def("my_add(Tensor self, Tensor other) -> Tensor");
+    m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)");
+}
+
+// Bind forward and backward together by subclassing torch::autograd::Function
+class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
+public:
+    static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other)
+    {
+        at::AutoDispatchBelowADInplaceOrView guard;
+        return my_add_impl(self, other);
+    }
+
+    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
+    {
+        auto grad_output = grad_outputs[0];
+        auto result = my_add_backward_impl(grad_output);
+        return {std::get<0>(result), std::get<1>(result)};
+    }
+};
+
+at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other)
+{
+    return MyAddFunction::apply(self, other);
+}
+
+// Bind the NPU autograd implementation to the op
+// For PyTorch versions below 2.1, AutogradPrivateUse1 must be changed to AutogradXLA
+TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_autograd);
+}
+
+// Register the forward and backward implementations for the NPU device
+// On PyTorch 2.1 and later the NPU device name is PrivateUse1; below 2.1 it is XLA, so on older versions change PrivateUse1 to XLA
+TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m)
+{
+    m.impl("my_add", &my_add_impl_npu);
+    m.impl("my_add_backward", &my_add_backward_impl_npu);
+}
+
+// Register the forward and backward implementations for the Meta device
+TORCH_LIBRARY_IMPL(myaten, Meta, m)
+{
+    m.impl("my_add", &my_add_impl_meta);
+    m.impl("my_add_backward", &my_add_backward_impl_meta);
+}
+
+// Bind the C++ interface to the Python interface via pybind
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("my_add", &my_add_impl_autograd, "x + y");
+}
diff --git a/sample/pytorch_adapter/with_setuptools/add_kernel.cpp b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aa62e093633de1f5bddc8d9b7f80fb58831bdb9
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ *
+ * Function : z = x + y
+ * This sample is a very basic sample that implements vector add on the Ascend platform.
+ * In this sample:
+ * Length of x / y / z is 8*2048.
+ * Num of vector cores used in the sample is 8.
+ * Length for each core to compute is 2048.
+ * Each core splits its data into 8 tiles of 2048/8=256 elements; with double buffering,
+ * each loop iteration processes half a tile (128 elements).
+ */
+#include "kernel_operator.h"
+using namespace AscendC;
+constexpr int32_t TOTAL_LENGTH = 8 * 2048;                            // total length of data
+constexpr int32_t USE_CORE_NUM = 8;                                   // num of cores used
+constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;         // length computed by each core
+constexpr int32_t TILE_NUM = 8;                                       // split data into 8 tiles for each core
+constexpr int32_t BUFFER_NUM = 2;                                     // tensor num for each queue
+constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // separated into 2 parts, due to double buffering
+
+class KernelAdd {
+public:
+    __aicore__ inline KernelAdd()
+    {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+    {
+        // get start index for current core, core parallel
+        xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH);
+        // pipe alloc memory to queue, the unit is bytes
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half));
+    }
+    __aicore__ inline void Process()
+    {
+        // loop count needs to be doubled, due to double buffering
+        constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
+        // tiling strategy, pipeline parallel
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        // alloc tensors from queue memory
+        LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
+        LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
+        // copy the progress-th tile from global tensor to local tensor
+        DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH);
+        DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
+        // enqueue input tensors to VECIN queue
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress)
+    {
+        // dequeue input tensors from VECIN queue
+        LocalTensor<half> xLocal = inQueueX.DeQue<half>();
+        LocalTensor<half> yLocal = inQueueY.DeQue<half>();
+        LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
+        // call the Add instruction for the computation
+        Add(zLocal, xLocal, yLocal, TILE_LENGTH);
+        // enqueue the output tensor to VECOUT queue
+        outQueueZ.EnQue(zLocal);
+        // free input tensors for reuse
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        // dequeue output tensor from VECOUT queue
+        LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
+        // copy the progress-th tile from local tensor to global tensor
+        DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH);
+        // free output tensor for reuse
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    TPipe pipe;
+    // create queues for input; in this case depth is equal to buffer num
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
+    // create queue for output; in this case depth is equal to buffer num
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
+    GlobalTensor<half> xGm, yGm, zGm;
+};
+// implementation of the kernel function
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z)
+{
+    KernelAdd op;
+    op.Init(x, y, z);
+    op.Process();
+}
+// Wrap the kernel function so that an ordinary compiler can resolve this symbol
+extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z)
+{
+    add_custom<<<blockDim, nullptr, stream>>>(x, y, z);
+}
\ No newline at end of file
diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ab1d3c78c7866b4bd53d9531bf0674c8b2987e
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/setup.py
@@ -0,0 +1,51 @@
+import os
+import subprocess
+import torch
+import torch_npu
+from setuptools import setup, find_packages
+from torch.utils.cpp_extension import BuildExtension
+from torch_npu.utils.cpp_extension import NpuExtension
+
+PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
+CUR_PATH = os.path.abspath(os.path.dirname(__file__))
+
+
+def compile_kernels():
+    # PyTorch has no extension for compiling Ascend device code, so invoke make manually here
+    subprocess.run("make")
+    return "libcustom_kernels.so"  # the name of the library produced by make
+
+
+def compile_adapter():
+    ext = NpuExtension(
+        name="ascend_custom_kernels_lib",  # name of the library to import
+        # if other cpp files take part in the build, add them here
+        sources=[f"{CUR_PATH}/add_adapter.cpp"],
+        extra_compile_args=[
+            '-I' + os.path.join(os.path.join(os.path.join(os.path.join(
+                PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc"),
+        ],
+        library_dirs=[f"{CUR_PATH}"],  # paths of the libraries needed at build time, like g++'s -L option
+        libraries=["custom_kernels"],  # libraries needed at build time, like the -l option
+    )
+    return [ext]
+
+
+if __name__ == "__main__":
+    # build the library containing the operators and provide it as a .so
+    kernel_so = compile_kernels()
+
+    # build the PyTorch adapter-layer library so the framework can integrate it
+    exts = compile_adapter()
+
+    # package everything into a wheel
+    setup(
+        name="ascend_custom_kernels",  # package name
+        version='1.0',
+        keywords='ascend_custom_kernels',
+        ext_modules=exts,
+        packages=find_packages(),
+        cmdclass={"build_ext": BuildExtension},
+        data_files=[(".", [kernel_so])],
+        include_package_data=True,
+    )
diff --git a/sample/pytorch_adapter/with_setuptools/test.py b/sample/pytorch_adapter/with_setuptools/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..896eef2c0fbb1a113377fb7dc770f45fd99832f4
--- /dev/null
+++ b/sample/pytorch_adapter/with_setuptools/test.py
@@ -0,0 +1,34 @@
+import torch
+import torch_npu
+import ascend_custom_kernels_lib
+from torch_npu.testing.testcase import TestCase, run_tests
+
+
+class TestCustomAdd(TestCase):
+    def test_add(self):
+        # The kernel currently uses static tiling, so the sizes here must match.
+        # Since add is elementwise and the kernel supports 8*2048 (see the kernel
+        # implementation), anything smaller than that should also be fine.
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float16)
+        y = torch.rand(length, device='cpu', dtype=torch.float16)
+
+        x_npu = x.npu()
+        y_npu = y.npu()
+        x_npu.requires_grad = True
+        y_npu.requires_grad = True
+        output = ascend_custom_kernels_lib.my_add(x_npu, y_npu)
+        # verify the backward capability
+        output.backward(output)
+
+        x.requires_grad = True
+        y.requires_grad = True
+        cpuout = torch.add(x, y)
+        cpuout.backward(cpuout)
+
+        self.assertRtolEqual(output, cpuout)
+        self.assertRtolEqual(x_npu.grad, x.grad)
+        self.assertRtolEqual(y_npu.grad, y.grad)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/sample/third_party/lib/libruntime.so.aarch64 b/sample/third_party/lib/libruntime.so.aarch64
deleted file mode 100644
index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime.so.aarch64 and /dev/null differ
diff --git a/sample/third_party/lib/libruntime.so.x86 b/sample/third_party/lib/libruntime.so.x86
deleted file mode 100644
index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime.so.x86 and /dev/null differ
diff --git a/sample/third_party/lib/libruntime_camodel.so.aarch64 b/sample/third_party/lib/libruntime_camodel.so.aarch64
deleted file mode 100644
index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime_camodel.so.aarch64 and /dev/null differ
diff --git a/sample/third_party/lib/libruntime_camodel.so.x86 b/sample/third_party/lib/libruntime_camodel.so.x86
deleted file mode 100644
index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000
Binary files a/sample/third_party/lib/libruntime_camodel.so.x86 and /dev/null differ