diff --git a/debug/accuracy_tools/atat/core/common_config.py b/debug/accuracy_tools/atat/core/common_config.py
index 740119b63cea42356187e2fa9d1ba9a58b705ee7..a6e67fd01c28416bed61b63222cd3dfbba94c4f1 100644
--- a/debug/accuracy_tools/atat/core/common_config.py
+++ b/debug/accuracy_tools/atat/core/common_config.py
@@ -11,6 +11,9 @@ class CommonConfig:
         self.level = json_config.get('level')
         self.seed = json_config.get('seed')
         self.is_deterministic = json_config.get('is_deterministic')
+        self.on_step_end = json_config.get('on_step_end')
+        self.bench_dump_path = json_config.get('bench_dump_path')
+        self.on_step_end_path = json_config.get('on_step_end_path')
         self._check_config()
 
     def _check_config(self):
diff --git a/debug/accuracy_tools/atat/pytorch/__init__.py b/debug/accuracy_tools/atat/pytorch/__init__.py
index 482e850f7baa845bd831e0d4728e841661b9345b..198cea96de8ed6b9e44667ddf9e8c4f23c5a2410 100644
--- a/debug/accuracy_tools/atat/pytorch/__init__.py
+++ b/debug/accuracy_tools/atat/pytorch/__init__.py
@@ -2,3 +2,4 @@ from .debugger.precision_debugger import PrecisionDebugger
 from .common.utils import seed_all
 from .compare.acc_compare import compare
 from .compare.distributed_compare import compare_distributed
+from .visualization.json_parse_graph import compare_graph, build_graph
diff --git a/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py b/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py
index f5764ac92b2d3c053adaa1a50169dfac4f3147bc..cff65534d951a29fc9fcea201403259c1fe164e8 100644
--- a/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py
+++ b/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py
@@ -847,7 +847,10 @@ def compare_process(file_handles, stack_mode, fuzzy_match, summary_compare=False
     for npu_data in npu_ops_queue:
         get_un_match_accuracy(result, npu_data, md5_compare, summary_compare)
 
-    header = []
+    result_to_csv(md5_compare, summary_compare, stack_mode, result, output_csv_handle)
+
+
+def result_to_csv(md5_compare, summary_compare, stack_mode, result, output_csv_handle):
     if md5_compare:
         header = CompareConst.MD5_COMPARE_RESULT_HEADER[:]
     elif summary_compare:
diff --git a/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py b/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py
index b3f0b51a66797705aae7e0033a340e186378c927..7695a52e87104819441b02ac2b3e6e906b29e6e3 100644
--- a/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py
+++ b/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py
@@ -13,13 +13,15 @@ class DebuggerConfig:
         self.is_deterministic = common_config.is_deterministic if common_config.is_deterministic else False
         self.scope = task_config.scope if task_config.scope else []
         self.list = task_config.list if task_config.list else []
-        self.data_mode = task_config.data_mode if task_config.data_mode else ["all"] 
+        self.data_mode = task_config.data_mode if task_config.data_mode else ["all"]
         self.backward_input = task_config.backward_input
         self.summary_mode = task_config.summary_mode if task_config.summary_mode else Const.STATISTICS
         self.overflow_num = task_config.overflow_num if task_config.overflow_num else 1
         self.repair_scope = None
         self.repair_api_str = None
-        self.on_step_end = None
+        self.on_step_end = common_config.on_step_end
+        self.bench_dump_path = common_config.bench_dump_path
+        self.on_step_end_path = common_config.on_step_end_path
         self.repair_type = None
         self.check()
diff --git a/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py b/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py
index 7f0d3459326f04691a0041c120bf4efc676f8bc1..86c1dcc04bc1949fd134cdea19dc86e10822330a 100644
--- a/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py
+++ b/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py
@@ -1,5 +1,7 @@
+import os
 from abc import ABC, abstractmethod
 from ..common.exceptions import StepException
+from ..visualization.json_parse_graph import compare_graph
 
 
 def run_parallel_ut(config):
@@ -17,6 +19,8 @@ def build_step_post_process(config):
         return SingleAPICheck(config)
     elif config.on_step_end == StepPostProcess.Compare:
         return AutoCompare(config)
+    elif config.on_step_end == StepPostProcess.GraphCompare:
+        return GraphCompare(config)
     else:
         raise StepException(StepException.InvalidPostProcess, f"step后处理须配置为"
                             f"'{StepPostProcess.SingleAPICheck}'或'{StepPostProcess.Compare}',"
@@ -26,6 +30,7 @@ def build_step_post_process(config):
 class StepPostProcess(ABC):
     SingleAPICheck = 'single_api_check'
     Compare = 'compare'
+    GraphCompare = 'compare_graph'
 
 
 class SingleAPICheck:
@@ -41,3 +46,33 @@ class AutoCompare:
 
     def run(self):
         compare_distrbuted(self.config.bench_dump_path, self.config.dump_path)
+
+
+class GraphCompare:
+    def __init__(self, config):
+        self.config = config
+
+    def run(self):
+        self.compare_graph()
+
+    def compare_graph(self):
+        if self.config.step and self.config.current_iter not in self.config.step:
+            return
+        if self.config.step is None:
+            step = 'step0'
+        else:
+            step = 'step' + str(self.config.current_iter)
+        n_path = os.path.join(self.config.dump_path, step)
+        b_path = os.path.join(self.config.bench_dump_path, step)
+        if self.config.rank:
+            rank = 'rank' + str(self.config.rank)
+        else:
+            rank = 'rank0'
+        n_path_rank = os.path.join(n_path, rank)
+        if not os.path.exists(n_path_rank):
+            rank = 'rank'
+            n_path_rank = os.path.join(n_path, rank)
+        b_path_rank = os.path.join(b_path, rank)
+        if not os.path.exists(b_path_rank):
+            raise StepException(StepException.InvalidPostProcess, f'路径{b_path_rank}不存在,比对失败')
+        compare_graph(n_path_rank, b_path_rank, self.config.on_step_end_path)
diff --git a/debug/accuracy_tools/atat/pytorch/service.py b/debug/accuracy_tools/atat/pytorch/service.py
index dd2d2be41c724ddf51616c180e347f774db31818..508260ff8fe209cee67c5c82f8d5021e2059bcfc 100644
--- a/debug/accuracy_tools/atat/pytorch/service.py
+++ b/debug/accuracy_tools/atat/pytorch/service.py
@@ -78,9 +78,10 @@ class Service:
         return pre_forward_hook, forward_hook, backward_hook
 
     def step(self):
+        self.config.current_iter = self.current_iter
         self.current_iter += 1
         if self.step_post_process:
-            self.step_post_process()
+            self.step_post_process.run()
 
     @staticmethod
     def check_model_valid(model):
diff --git a/debug/accuracy_tools/atat/pytorch/visualization/__init__.py b/debug/accuracy_tools/atat/pytorch/visualization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py b/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..10455b767b2bef0534962b3facedc5267f7eb69f
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py
@@ -0,0 +1,278 @@
+import os
+import json
+import stat
+from .graph_utils import ToolTip, Suggestions
+from .graph.node_op import NodeOp
+from ..compare.acc_compare import read_op, merge_tensor, get_accuracy, result_to_csv
+from ..common.utils_compare import CompareConst, Const
+
+
+class CompareTree:
+    def __init__(self, tree_n, tree_b, data_n_dict, data_b_dict, stack_json_data, csv_path,
+                 summary_compare, md5_compare, stack_mode):
+        self.tree_n = tree_n
+        self.tree_b = tree_b
+        self.data_n_dict = data_n_dict
+        self.data_b_dict = data_b_dict
+        self.csv_path = csv_path
+        # Map each node type in tree B to its nodes so they can be looked up quickly
+        self.b_nodes_by_value = {}
+        self.to_csv_result = []
+        self.md5_compare = md5_compare
+        self.summary_compare = summary_compare
+        self.real_data_compare = self.summary_compare is False and self.md5_compare is False
+        self.stack_mode = stack_mode
+        self.stack_json_data = stack_json_data
+        self.real_data_compare_nodes = []
+        self.fill_b_nodes_dict(self.tree_b)
+        self.compare_nodes(self.tree_n)
+
+    # Get the list of all ancestors of a node
+    @staticmethod
+    def get_ancestors(node):
+        ancestors = []
+        current_node = node.upnode
+        while current_node:
+            ancestors.append(current_node.type)
+            current_node = current_node.upnode
+        return list(reversed(ancestors))
+
+    def fill_b_nodes_dict(self, node):
+        if node.type not in self.b_nodes_by_value:
+            self.b_nodes_by_value[node.type] = []
+        self.b_nodes_by_value[node.type].append(node)
+        for subnode in node.subnodes:
+            self.fill_b_nodes_dict(subnode)
+
+    def result_to_csv(self):
+        with os.fdopen(os.open(self.csv_path, os.O_RDWR | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP),
+                       'w+') as file_out:
+            result_to_csv(self.md5_compare, self.summary_compare, self.stack_mode, self.to_csv_result, file_out)
+
+    # Recursively compare nodes of the NPU tree; when a node with the same type exists in the Bench tree, check their ancestors
+    def compare_nodes(self, node_n):
+        if node_n.type in self.b_nodes_by_value:
+            for node_b in self.b_nodes_by_value[node_n.type]:
+                # Check whether the two nodes have exactly the same ancestor chain
+                flag, ancestors = self.have_same_ancestors(node_n, node_b)
+                flag = flag and node_n.data_info == node_b.data_info
+                if flag:
+                    # Ancestor chains and data_info match: record node_b and its ancestors on node_n
+                    ancestors.append(node_b.type)
+                    node_n.matched_node_link = ancestors
+                    # Without a copy, the exported YAML contains garbled characters
+                    node_b.matched_node_link = ancestors.copy()
+                    # compare
+                    # Real-data comparison only yields basic information without accuracy metrics;
+                    # the multi-process comparison interface has to be called for those
+                    compare_result_list = self.compare_node(node_n, node_b)
+                    if compare_result_list:
+                        self.to_csv_result.extend(compare_result_list)
+                        self.add_compare_result_to_node(node_n, compare_result_list)
+
+        for subnode in node_n.subnodes:
+            self.compare_nodes(subnode)
+
+    # Attach the comparison results to the node's input/output data; normally both have the same number of entries
+    def add_compare_result_to_node(self, node, compare_result_list):
+        # For real-data comparison, stash the node first; the metrics are attached after the multi-process comparison
+        if self.real_data_compare:
+            self.real_data_compare_nodes.append(node)
+            return
+        compare_in_dict = {}
+        compare_out_dict = {}
+        # Separate the comparison data of inputs and outputs
+        for item in compare_result_list:
+            if 'output' in item[0]:
+                compare_out_dict[item[0]] = item
+            else:
+                compare_in_dict[item[0]] = item
+        if self.md5_compare:
+            precision_status_in = self.add_md5_compare_data(node.input_data, compare_in_dict)
+            precision_status_out = self.add_md5_compare_data(node.output_data, compare_out_dict)
+            # The node passes only if the md5 comparison passes for all inputs and outputs
+            precision_status = precision_status_in and precision_status_out
+            node.data['precision_status'] = precision_status
+            # 1 if the md5 comparison passes, otherwise 0
+            node.data['precision_index'] = 1 if precision_status else 0
+            node.data['md5 Compare Result'] = CompareConst.PASS if precision_status else CompareConst.DIFF
+        elif self.summary_compare:
+            precision_status_in, precision_index_in = self.add_summary_compare_data(node.input_data, compare_in_dict)
+            precision_status_out, precision_index_out = self.add_summary_compare_data(node.output_data,
+                                                                                      compare_out_dict)
+            precision_status = precision_status_in and precision_status_out
+            precision_index = min(precision_index_in, precision_index_out)
+            node.data['precision_status'] = precision_status
+            node.data['precision_index'] = precision_index
+            if not precision_status:
+                self.add_summary_compare_node_error_key(node.output_data)
+                self.add_real_compare_suggestions(node)
+
+    def add_summary_compare_data(self, node_data, compare_data_dict):
+        precision_status = True
+        precision_index = 1
+        for key, value in node_data.items():
+            if not isinstance(value, dict):
+                continue
+            compare_data = compare_data_dict.get(key)
+            if compare_data:
+                key_list = [CompareConst.MAX_DIFF, CompareConst.MIN_DIFF, CompareConst.MEAN_DIFF,
+                            CompareConst.NORM_DIFF]
+                # Take the NPU and bench statistics for comparison and delete them afterwards
+                del_list = [CompareConst.NPU_MAX, CompareConst.NPU_MIN, CompareConst.NPU_MEAN,
+                            CompareConst.NPU_NORM, CompareConst.BENCH_MAX, CompareConst.BENCH_MIN,
+                            CompareConst.BENCH_MEAN, CompareConst.BENCH_NORM]
+                key_list.extend(del_list)
+                id_list = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
+                self._match_data(value, compare_data, key_list, id_list)
+                # Whether the summary comparison passes
+                precision_status, precision_index = self._summary_compare_judgment(value, precision_status,
+                                                                                   precision_index)
+                self._del_item_by_list(value, del_list)
+            node_data[key] = value
+        return precision_status, precision_index
+
+    @staticmethod
+    def _summary_compare_judgment(data_dict, precision_status, precision_index):
+        item_dict = {(CompareConst.NPU_MAX, CompareConst.BENCH_MAX): (CompareConst.MAX_DIFF, 'Max Magnitude Diff'),
+                     (CompareConst.NPU_MIN, CompareConst.BENCH_MIN): (CompareConst.MIN_DIFF, 'Min Magnitude Diff'),
+                     (CompareConst.NPU_MEAN, CompareConst.BENCH_MEAN): (CompareConst.MEAN_DIFF, 'Mean Magnitude Diff'),
+                     (CompareConst.NPU_NORM, CompareConst.BENCH_NORM): (
+                         CompareConst.NORM_DIFF, 'L2norm Magnitude Diff')}
+        for key, value in item_dict.items():
+            if isinstance(data_dict.get(key[0]), (float, int)) and isinstance(data_dict.get(key[1]), (float, int)) \
+                    and isinstance(data_dict.get(value[0]), (float, int)):
+                magnitude_diff = abs(data_dict.get(value[0])) / (
+                        max(abs(data_dict.get(key[0])), abs(data_dict.get(key[1]))) + 1e-10)
+                magnitude_diff = 1 if magnitude_diff > 1 else magnitude_diff
+                data_dict[value[1]] = magnitude_diff
+                if magnitude_diff > 0.3:
+                    precision_status = False
+                    precision_index = 1 - max(precision_index, magnitude_diff)
+        return precision_status, precision_index
+
+    def add_md5_compare_data(self, node_data, compare_data_dict):
+        precision_status = True
+        for key, value in node_data.items():
+            if not isinstance(value, dict):
+                continue
+            compare_data = compare_data_dict.get(key)
+            if compare_data:
+                key_list = ['md5 Compare Result']
+                id_list = [8]
+                self._match_data(value, compare_data, key_list, id_list)
+                # Whether the md5 comparison passes
+                if value.get('md5 Compare Result') != CompareConst.PASS:
+                    precision_status = False
+            node_data[key] = value
+        return precision_status
+
+    def add_real_compare_data(self, node_data, compare_data_dict):
+        min_thousandth = float(1)
+        numbers = []
+        for key, value in node_data.items():
+            if not isinstance(value, dict):
+                continue
+            # self._del_item(value)
+            compare_data = compare_data_dict.get(key)
+            if compare_data:
+                key_list = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR,
+                            CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO]
+                id_list = [6, 7, 8, 9, 10]
+                self._match_data(value, compare_data, key_list, id_list)
+                # Get the minimum thousandth-ratio metric across all of a node's inputs or outputs
+                thousandth = value.get(CompareConst.ONE_THOUSANDTH_ERR_RATIO)
+                # It may be None, or a non-numeric str
+                try:
+                    thousandth = float(thousandth)
+                except (ValueError, TypeError):
+                    thousandth = None
+                if thousandth is not None:
+                    numbers.append(thousandth)
+            node_data[key] = value
+        # Abnormal case where all thousandth-ratio metrics are None
+        if not numbers:
+            min_thousandth = None
+        else:
+            min_thousandth = min(numbers + [min_thousandth])
+        return min_thousandth
+
+    @staticmethod
+    def add_real_compare_node_error_key(node_data):
+        for key, value in node_data.items():
+            if not isinstance(value, dict):
+                continue
+            value['error_key'] = [CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO]
+            node_data[key] = value
+
+    @staticmethod
+    def add_summary_compare_node_error_key(node_data):
+        for key, value in node_data.items():
+            if not isinstance(value, dict):
+                continue
+            value['error_key'] = ['Max Magnitude Diff', 'Min Magnitude Diff', 'Mean Magnitude Diff', 'L2norm Magnitude Diff']
+            node_data[key] = value
+
+    @staticmethod
+    def add_real_compare_suggestions(node):
+        if node.op == NodeOp.module:
+            node.suggestions['text'] = Suggestions.Module
+            node.suggestions[Suggestions.PTDBG] = Suggestions.PTDBG_URL
+        if node.op == NodeOp.function_api:
+            node.suggestions['text'] = Suggestions.API
+            node.suggestions[Suggestions.API_ACCURACY_CHECKER] = Suggestions.API_ACCURACY_CHECKER_URL
+
+    def get_tool_tip(self):
+        if self.summary_compare:
+            tips = {
+                CompareConst.MAX_DIFF: ToolTip.MAX_DIFF,
+                CompareConst.MIN_DIFF: ToolTip.MIN_DIFF,
+                CompareConst.MEAN_DIFF: ToolTip.MEAN_DIFF,
+                CompareConst.NORM_DIFF: ToolTip.NORM_DIFF}
+        elif self.md5_compare:
+            tips = {
+                Const.MD5: ToolTip.MD5}
+        else:
+            tips = {
+                CompareConst.ONE_THOUSANDTH_ERR_RATIO: ToolTip.ONE_THOUSANDTH_ERR_RATIO,
+                CompareConst.COSINE: ToolTip.COSINE,
+                CompareConst.MAX_ABS_ERR: ToolTip.MAX_ABS_ERR,
+                CompareConst.MAX_RELATIVE_ERR: ToolTip.MAX_RELATIVE_ERR}
+        return json.dumps(tips)
+
+    @staticmethod
+    def _match_data(data_dict, compare_data, key_list, id_list):
+        if len(key_list) != len(id_list):
+            return
+        for i, key in enumerate(key_list):
+            data = compare_data[id_list[i]]
+            if data is not None and 'nan' not in str(data):
+                data_dict[key] = compare_data[id_list[i]]
+
+    @staticmethod
+    def _del_item_by_list(data_dict, del_list):
+        if isinstance(data_dict, dict):
+            for item in del_list:
+                if item in data_dict:
+                    del data_dict[item]
+
+    # Check whether all ancestors of the two nodes are the same
+    def have_same_ancestors(self, node_a, node_b):
+        ancestors_a = self.get_ancestors(node_a)
+        ancestors_b = self.get_ancestors(node_b)
+        return ancestors_a == ancestors_b, ancestors_a
+
+    def parse_node(self, node, data_dict):
+        op_parsed_list = read_op(data_dict.get(node.type, {}), node.type)
+        if node.type in self.stack_json_data:
+            op_parsed_list.append(
+                {'full_op_name': node.type, 'full_info': self.stack_json_data[node.type]})
+        else:
+            op_parsed_list.append({'full_op_name': node.type, 'full_info': None})
+        return merge_tensor(op_parsed_list, self.summary_compare, self.md5_compare)
+
+    def compare_node(self, node_n, node_b):
+        result = []
+        merge_n = self.parse_node(node_n, self.data_n_dict)
+        merge_b = self.parse_node(node_b, self.data_b_dict)
+        get_accuracy(result, merge_n, merge_b, self.summary_compare, self.md5_compare)
+        return result
diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..490e964d1da859d8c4e7abad36e6cb6f4ac09e44
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class BaseNode:
+    def __init__(self, node_op, node_type, up_node=None, is_forward=True):
+        self.op = node_op
+        self.type = node_type
+        self.id = node_type
+        self.data = {}
+        self.outputs = []
+        self.inputs = []
+        self.output_data = {}
+        self.input_data = {}
+        self.upnode = up_node
+        self.subnodes = []
+        if up_node:
+            up_node.add_subnode(self)
+        self.is_forward = is_forward
+        self.pair = None
+        self.matched_node_link = []
+        self.data_info = ''
+        self.suggestions = {}
+
+    def __str__(self):
+        info = f'id:\t{self.id}'
+        return info
+
+    def get_info(self):
+        info = f'{self.id}\t{self.op}'
+        if not self.is_forward:
+            info += '(b)'
+        for key in self.data:
+            info += f'\n{key}:\t{self.data.get(key)}'
+        return info
+
+    def add_subnode(self, node):
+        if node.id == self.id:
+            return
+        self.subnodes.append(node)
+
+    def get_yaml_dict(self):
+        result = {}
+        result['id'] = self.id
+        result['node_type'] = self.op.value
+        result['type'] = self.type
+        result['data'] = self.data
+        result['output_data'] = self._del_item(self.output_data)
+        result['input_data'] = self._del_item(self.input_data)
+        result['outputs'] = [(edge_id, node.id) for edge_id, node in self.outputs]
+        result['inputs'] = [(edge_id, node.id) for edge_id, node in self.inputs]
+        result['upnode'] = self.upnode.id if self.upnode else 'None'
+        result['subnodes'] = [node.id for node in self.subnodes]
+        result['is_forward'] = self.is_forward
+        result['pair'] = self.pair.id if self.pair else 'None'
+        result['matched_node_link'] = self.matched_node_link
+        result['suggestions'] = self.suggestions
+        return result
+
+    @staticmethod
+    def _del_item(data_dict):
+        del_list = ['requires_grad', 'data_name', 'full_op_name']
+        for key, value in data_dict.items():
+            if not isinstance(value, dict):
+                continue
+            for item in del_list:
+                if item in value:
+                    del value[item]
+            BaseNode._formate_floats(value)
+
+        return data_dict
+
+    @staticmethod
+    def _formate_floats(data_dict):
+        for key, value in data_dict.items():
+            if isinstance(value, float):
+                data_dict[key] = round(value, 6)
+            data_dict[key] = str(value).replace("'", "")
diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..54cdd08db66a4a5e350587f977776c68a759f205
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Graph:
+    def __init__(self):
+        self.root = None
+        self.recent_node = None
+        self.depth = 0
+        self.node_map = {}
+        self.rawid_map = {}
+
+    def __str__(self):
+        infos = [f'{str(self.node_map.get(node_id))}' for node_id in self.node_map]
+        info = "\n".join(infos)
+        return info
diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..22bb2739f95e798420fc338e2169a1d123803ebf
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+
+from ....core.file_check_util import FileOpen
+
+
+class GraphBuilder:
+
+    @staticmethod
+    def export_to_json(filename, graph):
+        result = {}
+        result['root'] = graph.root.id if graph.root else 'None'
+        result['node'] = {}
+        GraphBuilder._export_dfs(graph.root, result['node'])
+        with FileOpen(filename, 'w') as f:
+            f.write(json.dumps(result, indent=4))
+
+    @staticmethod
+    def get_graph_result(graph):
+        result = {}
+        result['root'] = graph.root.id if graph.root else 'None'
+        result['node'] = {}
+        GraphBuilder._export_dfs(graph.root, result['node'])
+        return result
+
+    @staticmethod
+    def export_graphs_to_json(filename, graph_n, graph_b, tool_tip):
+        result = {}
+        result['NPU'] = GraphBuilder.get_graph_result(graph_n)
+        result['Bench'] = GraphBuilder.get_graph_result(graph_b)
+        result['Tooltip'] = tool_tip
+        with FileOpen(filename, 'w') as f:
+            f.write(json.dumps(result, indent=4))
+
+    @staticmethod
+    def _export_dfs(node, result):
+        info = node.get_yaml_dict()
+        result[node.id] = info
+        for subnode in node.subnodes:
+            GraphBuilder._export_dfs(subnode, result)
diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f81a52b9951ed4223a24856b5e25fa8b77f4aca7
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2023, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+
+
+class NodeOp(Enum):
+    module = 1
+    function_api = 2
+    module_api = 3
+    tensor = 4
+    output = 5
diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py b/debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b022a34da9a20392bed071af13b59f50e722fdb
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py
@@ -0,0 +1,23 @@
+class ToolTip:
+    MAX_DIFF = 'NPU与标杆API统计信息比对,最大值的差值'
+    MIN_DIFF = 'NPU与标杆API统计信息比对,最小值的差值'
+    MEAN_DIFF = 'NPU与标杆API统计信息比对,平均值的差值'
+    NORM_DIFF = 'NPU与标杆API统计信息比对,2范数(平方根)的差值'
+    MAX_MAGNITUDE_DIFF = 'NPU与标杆API统计信息比对,最大值的差值相对误差'
+    MIN_MAGNITUDE_DIFF = 'NPU与标杆API统计信息比对,最小值的差值相对误差'
+    MEAN_MAGNITUDE_DIFF = 'NPU与标杆API统计信息比对,平均值的差值相对误差'
+    NORM_MAGNITUDE_DIFF = 'NPU与标杆API统计信息比对,2范数(平方根)的差值相对误差'
+    MD5 = '数据MD5信息,用于比较两个数据信息是否完全一致'
+    ONE_THOUSANDTH_ERR_RATIO = 'Tensor中的元素逐个与对应的标杆数据对比,相对误差大于千分之一的比例占总元素个数的比例小于千分之一'
+    COSINE = '通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0'
+    MAX_ABS_ERR = '当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001'
+    MAX_RELATIVE_ERR = '当最大相对误差越接近0表示其计算的误差越小。当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象'
+
+
+class Suggestions:
+    Module = '此模块精度比对结果疑似异常,请使用ptdbg工具对模块中的api进行dump比对'
+    API = '此api精度比对结果疑似异常,请使用api accuracy checker工具对api进行精度检测'
+    PTDBG = 'ptdbg工具'
+    PTDBG_URL = 'https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/ptdbg_ascend'
+    API_ACCURACY_CHECKER = 'api accuracy checker工具'
+    API_ACCURACY_CHECKER_URL = 'https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/api_accuracy_checker'
diff --git a/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py b/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..86b3c23e08333f05616eae7bb50e7cb0934dadc2
--- /dev/null
+++ b/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py
@@ -0,0 +1,200 @@
+import os
+import json
+import re
+import time
+import pandas as pd
+from .graph.graph import Graph
+from .graph.base_node import BaseNode
+from .graph.node_op import NodeOp
+from .graph.graph_builder import GraphBuilder
+from ..compare.acc_compare import read_op, task_dumppath_get, _do_multi_process
+from ..common.utils_compare import add_time_as_suffix
+from .compare_tree import CompareTree
+
+
+def _load_json_file(file_path):
+    try:
+        with open(file_path, 'r') as file:
+            file_dict = json.load(file)
+            if not isinstance(file_dict, dict):
+                return {}
+            return file_dict
+    except json.JSONDecodeError:
+        return {}
+
+
+def _get_data_inputs_outputs(data_dict: dict):
+    input_args = data_dict.get('input_args', [])
+    input_kwargs = data_dict.get('input_kwargs', {})
+    output = data_dict.get('output', [])
+
+    input_args = input_args if isinstance(input_args, list) else []
+    input_kwargs = input_kwargs if isinstance(input_kwargs, dict) else {}
+    output = output if isinstance(output, list) else []
+    return input_args, input_kwargs, output
+
+
+# Parse the data with ptdbg compare's read_op
+def _add_node_data(node_data, node):
+    input_data = {}
+    output_data = {}
+    op_parsed_list = read_op(node_data, node.type)
+    for item in op_parsed_list:
+        full_op_name = item.get('full_op_name', '')
+        if 'output' in full_op_name:
+            output_data[full_op_name] = item
+        else:
+            input_data[full_op_name] = item
+    node.input_data = input_data
+    node.output_data = output_data
+
+
+def _get_data_info(item):
+    if isinstance(item, dict):
+        return str(item.get('type', 'na')) + '_' + str(item.get('dtype', 'na')) + '_' + str(item.get('shape', 'na'))
+    elif isinstance(item, (list, tuple)):
+        return str([_get_data_info(sub_item) for sub_item in item])
+    return ''
+
+
+def _process_node_data_info(items):
+    info_str = ''
+    for item in items:
+        info_str += _get_data_info(item)
+    return info_str
+
+
+# The type, dtype and shape of all of a node's inputs and outputs must match
+def _get_node_data_info(input_args, input_kwargs, output):
+    return _process_node_data_info(input_args) + _process_node_data_info(input_kwargs) + _process_node_data_info(output)
+
+
+def _get_node_op(node_name: str):
+    pattern = r'^(Tensor|Torch|Functional|NPU|VF|Distributed|Aten)'
+    match = re.match(pattern, node_name)
+    if match:
+        return NodeOp.function_api
+    else:
+        return NodeOp.module
+
+
+def build_tree(construct_dict, data_dict, root_name):
+    # Dictionary of already-created nodes so they can be reused
+    created_nodes = {}
+    root_node = BaseNode(NodeOp.module, root_name)
+
+    # Recursively create or fetch a node
+    def get_or_create_node(op, name, up_node=None):
+        if name not in created_nodes:
+            # add data
+            base_node = BaseNode(op, name, up_node)
+            node_data = data_dict.get(name, {})
+            input_args, input_kwargs, output = _get_data_inputs_outputs(node_data)
+            # Attach the input and output data
+            _add_node_data(node_data, base_node)
+
+            # Build an identifier from the input/output info, used to match NPU and bench nodes
+            data_info = _get_node_data_info(input_args, input_kwargs, output)
+            base_node.data_info = data_info
+            created_nodes[name] = base_node
+        elif up_node:
+            # The node already exists, but its parent only became known now
+            created_nodes[name].upnode = up_node
+            up_node.add_subnode(created_nodes[name])
+        return created_nodes[name]
+
+    # Walk the dict and create or fetch a node for every key-value pair
+    for subnode, upnode in construct_dict.items():
+        if upnode:
+            up_node = get_or_create_node(_get_node_op(upnode), upnode)
+        else:
+            up_node = root_node
+        get_or_create_node(_get_node_op(subnode), subnode, up_node)
+
+    return root_node, created_nodes
+
+
+def build_graph(construct_path, data_path, output_path):
+    construct_dict = _load_json_file(construct_path)
+    data_dict = _load_json_file(data_path).get('data', {})
+    root_node, created_nodes = build_tree(construct_dict, data_dict, 'root_node')
+    graph = Graph()
+    graph.root = root_node
+    graph.node_map = created_nodes
+    GraphBuilder.export_to_json(output_path, graph)
+
+
+def do_compare_graph(construct_n_path, data_n_path, construct_b_path, data_b_path, stack_path, output_path, csv_path):
+    dump_path_param = {
+        "npu_json_path": data_n_path,
+        "bench_json_path": data_b_path,
+        "stack_json_path": stack_path,
+        "is_print_compare_log": True
+    }
+    # Determine the comparison mode
+    summary_compare, md5_compare = task_dumppath_get(dump_path_param)
+
+    construct_n_dict = _load_json_file(construct_n_path)
+    data_n_dict = _load_json_file(data_n_path).get('data', {})
+    root_n_node, created_n_nodes = build_tree(construct_n_dict, data_n_dict, 'NPU')
+    construct_b_dict = _load_json_file(construct_b_path)
+    data_b_dict = _load_json_file(data_b_path).get('data', {})
+    root_b_node, created_b_nodes = build_tree(construct_b_dict, data_b_dict, 'NPU')
+    stack_json_data = _load_json_file(stack_path)
+
+    start_time = time.time()
+    compare_tree = CompareTree(root_n_node, root_b_node, data_n_dict, data_b_dict, stack_json_data, csv_path,
+                               summary_compare, md5_compare, True)
+    end_time = time.time()
+    print('compare_tree', end_time - start_time)
+    compare_tree.result_to_csv()
+
+    if summary_compare is False and md5_compare is False:
+        # Real-data comparison: run the multi-process comparison to get the accuracy metrics, then write them into the csv that was already created
+        _do_multi_process(dump_path_param, csv_path)
+        # Read the accuracy metrics from the csv file and attach them to the nodes
+        df = pd.read_csv(csv_path)
+        compare_data_dict = {row[0]: row.tolist() for index, row in df.iterrows()}
+        for node in compare_tree.real_data_compare_nodes:
+            min_thousandth_in = compare_tree.add_real_compare_data(node.input_data, compare_data_dict)
+            min_thousandth_out = compare_tree.add_real_compare_data(node.output_data, compare_data_dict)
+            if min_thousandth_in and min_thousandth_out:
+                change_percentage = abs(min_thousandth_in - min_thousandth_out)
+            else:
+                change_percentage = 0
+            precision_status = True
+            if change_percentage > 0.1:
+                precision_status = False
+                # Accuracy below standard: highlight the thousandth-ratio metrics
+                CompareTree.add_real_compare_node_error_key(node.output_data)
+                # Add suggestions
+                CompareTree.add_real_compare_suggestions(node)
+            node.data['precision_status'] = precision_status
+            node.data['precision_index'] = 0 if change_percentage > 1 else 1 - change_percentage
+
+    graph_n = Graph()
+    graph_n.root = root_n_node
+    graph_n.node_map = created_n_nodes
+    graph_b = Graph()
+    graph_b.root = root_b_node
+    graph_b.node_map = created_b_nodes
+    start_time = time.time()
+    GraphBuilder.export_graphs_to_json(output_path, graph_n, graph_b, compare_tree.get_tool_tip())
+    end_time = time.time()
+    print('export_graphs_to_json', end_time - start_time)
+
+
+def compare_graph(dump_path_n, dump_path_b, out_path):
+    g_dir = dump_path_n
+    g_dir_b = dump_path_b
+    g_construct_path = f'{g_dir}/construct.json'
+    g_construct_path1 = f'{g_dir_b}/construct.json'
+    g_data_path = f'{g_dir}/dump.json'
+    g_data_path1 = f'{g_dir_b}/dump.json'
+    g_stack_path = f'{g_dir}/stack.json'
+    g_output_path = f'{out_path}/export.vis'
+    g_csv_dir = f'{out_path}'
+    g_file_name = add_time_as_suffix("compare_result")
+    g_csv_path = os.path.join(os.path.realpath(g_csv_dir), g_file_name)
+    do_compare_graph(g_construct_path, g_data_path, g_construct_path1, g_data_path1, g_stack_path, g_output_path,
+                     g_csv_path)
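
Usage note: the snippet below is an illustrative sketch of how the two entry points newly exported from atat.pytorch are expected to be called; the dump and output directory paths are examples only, not paths defined by this change.

    from atat.pytorch import build_graph, compare_graph

    # Build a single graph file from one dump directory (example paths).
    # build_graph takes the construct.json, dump.json and an output file path.
    build_graph('./npu_dump/step0/rank0/construct.json',
                './npu_dump/step0/rank0/dump.json',
                './output/npu_graph.vis')

    # Compare an NPU dump directory against a benchmark dump directory.
    # Each directory is expected to contain construct.json, dump.json and stack.json;
    # export.vis and a compare_result CSV are written into the output directory.
    compare_graph('./npu_dump/step0/rank0', './bench_dump/step0/rank0', './output')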