From f649b878ef926ea47aacd32afbff0a2d8c4fc6d5 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Mon, 3 Jun 2024 20:47:58 +0800 Subject: [PATCH 1/3] =?UTF-8?q?graph=E7=94=9F=E6=88=90=E5=92=8C=E6=AF=94?= =?UTF-8?q?=E5=AF=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- debug/accuracy_tools/atat/pytorch/visualization/__init__.py | 0 debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py | 0 .../accuracy_tools/atat/pytorch/visualization/graph/base_node.py | 0 debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py | 0 .../atat/pytorch/visualization/graph/graph_builder.py | 0 debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py | 0 debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py | 0 .../accuracy_tools/atat/pytorch/visualization/json_parse_graph.py | 0 8 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 debug/accuracy_tools/atat/pytorch/visualization/__init__.py create mode 100644 debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py create mode 100644 debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py create mode 100644 debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py create mode 100644 debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py create mode 100644 debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py create mode 100644 debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py create mode 100644 debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py diff --git a/debug/accuracy_tools/atat/pytorch/visualization/__init__.py b/debug/accuracy_tools/atat/pytorch/visualization/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py b/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py b/debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py b/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 8c23c1008e562f0937ecaa1790254e905b9ae437 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Mon, 3 Jun 2024 20:54:59 +0800 Subject: [PATCH 2/3] =?UTF-8?q?graph=E7=94=9F=E6=88=90=E5=92=8C=E6=AF=94?= =?UTF-8?q?=E5=AF=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../accuracy_tools/atat/core/common_config.py | 3 + debug/accuracy_tools/atat/pytorch/__init__.py | 1 + .../atat/pytorch/compare/acc_compare.py | 5 +- .../atat/pytorch/debugger/debugger_config.py | 6 +- .../pytorch/functional/step_post_process.py | 35 +++ debug/accuracy_tools/atat/pytorch/service.py | 3 +- .../pytorch/visualization/compare_tree.py | 267 ++++++++++++++++++ .../pytorch/visualization/graph/base_node.py | 94 ++++++ .../atat/pytorch/visualization/graph/graph.py | 28 ++ 
.../visualization/graph/graph_builder.py | 54 ++++ .../pytorch/visualization/graph/node_op.py | 24 ++ .../atat/pytorch/visualization/graph_utils.py | 23 ++ .../pytorch/visualization/json_parse_graph.py | 200 +++++++++++++ 13 files changed, 739 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/atat/core/common_config.py b/debug/accuracy_tools/atat/core/common_config.py index 740119b63c..a6e67fd01c 100644 --- a/debug/accuracy_tools/atat/core/common_config.py +++ b/debug/accuracy_tools/atat/core/common_config.py @@ -11,6 +11,9 @@ class CommonConfig: self.level = json_config.get('level') self.seed = json_config.get('seed') self.is_deterministic = json_config.get('is_deterministic') + self.on_step_end = json_config.get('on_step_end') + self.bench_dump_path = json_config.get('bench_dump_path') + self.on_step_end_path = json_config.get('on_step_end_path') self._check_config() def _check_config(self): diff --git a/debug/accuracy_tools/atat/pytorch/__init__.py b/debug/accuracy_tools/atat/pytorch/__init__.py index 482e850f7b..198cea96de 100644 --- a/debug/accuracy_tools/atat/pytorch/__init__.py +++ b/debug/accuracy_tools/atat/pytorch/__init__.py @@ -2,3 +2,4 @@ from .debugger.precision_debugger import PrecisionDebugger from .common.utils import seed_all from .compare.acc_compare import compare from .compare.distributed_compare import compare_distributed +from .visualization.json_parse_graph import compare_graph, build_graph diff --git a/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py b/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py index f5764ac92b..cff65534d9 100644 --- a/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py +++ b/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py @@ -847,7 +847,10 @@ def compare_process(file_handles, stack_mode, fuzzy_match, summary_compare=False for npu_data in npu_ops_queue: get_un_match_accuracy(result, npu_data, md5_compare, summary_compare) - header = [] + result_to_csv(md5_compare, 
summary_compare, stack_mode, result, output_csv_handle) + + +def result_to_csv(md5_compare, summary_compare, stack_mode, result, output_csv_handle): if md5_compare: header = CompareConst.MD5_COMPARE_RESULT_HEADER[:] elif summary_compare: diff --git a/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py b/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py index b3f0b51a66..7695a52e87 100644 --- a/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py +++ b/debug/accuracy_tools/atat/pytorch/debugger/debugger_config.py @@ -13,13 +13,15 @@ class DebuggerConfig: self.is_deterministic = common_config.is_deterministic if common_config.is_deterministic else False self.scope = task_config.scope if task_config.scope else [] self.list = task_config.list if task_config.list else [] - self.data_mode = task_config.data_mode if task_config.data_mode else ["all"] + self.data_mode = task_config.data_mode if task_config.data_mode else ["all"] self.backward_input = task_config.backward_input self.summary_mode = task_config.summary_mode if task_config.summary_mode else Const.STATISTICS self.overflow_num = task_config.overflow_num if task_config.overflow_num else 1 self.repair_scope = None self.repair_api_str = None - self.on_step_end = None + self.on_step_end = common_config.on_step_end + self.bench_dump_path = common_config.bench_dump_path + self.on_step_end_path = common_config.on_step_end_path self.repair_type = None self.check() diff --git a/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py b/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py index 7f0d345932..86c1dcc04b 100644 --- a/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py +++ b/debug/accuracy_tools/atat/pytorch/functional/step_post_process.py @@ -1,5 +1,7 @@ +import os from abc import ABC, abstractmethod from ..common.exceptions import StepException +from ..visualization.json_parse_graph import compare_graph def run_parallel_ut(config): @@ -17,6 
+19,8 @@ def build_step_post_process(config): return SingleAPICheck(config) elif config.on_step_end == StepPostProcess.Compare: return AutoCompare(config) + elif config.on_step_end == StepPostProcess.GraphCompare: + return GraphCompare(config) else: raise StepException(StepException.InvalidPostProcess, f"step后处理须配置为" f"'{StepPostProcess.SingleAPICheck}'或'{StepPostProcess.Compare}'," @@ -26,6 +30,7 @@ def build_step_post_process(config): class StepPostProcess(ABC): SingleAPICheck = 'single_api_check' Compare = 'compare' + GraphCompare = 'compare_graph' class SingleAPICheck: @@ -41,3 +46,33 @@ class AutoCompare: def run(self): compare_distrbuted(self.config.bench_dump_path, self.config.dump_path) + + +class GraphCompare: + def __init__(self, config): + self.config = config + + def run(self): + self.compare_graph() + + def compare_graph(self): + if self.config.step and self.config.current_iter not in self.config.step: + return + if self.config.step is None: + step = 'step0' + else: + step = 'step' + str(self.config.current_iter) + n_path = os.path.join(self.config.dump_path, step) + b_path = os.path.join(self.config.bench_dump_path, step) + if self.config.rank: + rank = 'rank' + str(self.config.rank) + else: + rank = 'rank0' + n_path_rank = os.path.join(n_path, rank) + if not os.path.exists(n_path_rank): + rank = 'rank' + n_path_rank = os.path.join(n_path, rank) + b_path_rank = os.path.join(b_path, rank) + if not os.path.exists(b_path_rank): + raise StepException(StepException.InvalidPostProcess, f'路径{b_path_rank}不存在,比对失败') + compare_graph(n_path_rank, b_path_rank, self.config.on_step_end_path) diff --git a/debug/accuracy_tools/atat/pytorch/service.py b/debug/accuracy_tools/atat/pytorch/service.py index dd2d2be41c..508260ff8f 100644 --- a/debug/accuracy_tools/atat/pytorch/service.py +++ b/debug/accuracy_tools/atat/pytorch/service.py @@ -78,9 +78,10 @@ class Service: return pre_forward_hook, forward_hook, backward_hook def step(self): + self.config.current_iter = 
self.current_iter self.current_iter += 1 if self.step_post_process: - self.step_post_process() + self.step_post_process.run() @staticmethod def check_model_valid(model): diff --git a/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py b/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py index e69de29bb2..25682f48ce 100644 --- a/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py +++ b/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py @@ -0,0 +1,267 @@ +import os +import json +import stat +from .graph_utils import ToolTip, Suggestions +from .graph.node_op import NodeOp +from ..compare.acc_compare import read_op, merge_tensor, get_accuracy, result_to_csv +from ..common.utils_compare import CompareConst, Const + + +class CompareTree: + def init(self, tree_n, tree_b, data_n_dict, data_b_dict, stack_json_data, csv_path, + summary_compare, md5_compare, stack_mode): + self.tree_n = tree_n + self.tree_b = tree_b + self.data_n_dict = data_n_dict + self.data_b_dict = data_b_dict + self.csv_path = csv_path + # 使用字典来存储树B中所有节点的type��节点的映射,以便快速查找 + self.b_nodes_by_value = {} + self.to_csv_result = [] + self.md5_compare = md5_compare + self.summary_compare = summary_compare + self.real_data_compare = self.summary_compare is False and self.md5_compare is False + self.stack_mode = stack_mode + self.stack_json_data = stack_json_data + self.real_data_compare_nodes = [] + self.fill_b_nodes_dict(self.tree_b) + self.compare_nodes(self.tree_n) + + # 获取节点所有祖先的列表 + @staticmethod + def get_ancestors(node): + ancestors = [] + current_node = node.upnode + while current_node: + ancestors.append(current_node.type) + current_node = current_node.upnode + return list(reversed(ancestors)) + + def fill_b_nodes_dict(self, node): + if node.type not in self.b_nodes_by_value: + self.b_nodes_by_value[node.type] = [] + self.b_nodes_by_value[node.type].append(node) + for subnode in node.subnodes: + self.fill_b_nodes_dict(subnode) + + def 
result_to_csv(self): + with os.fdopen(os.open(self.csv_path, os.O_RDWR | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP), + 'w+') as file_out: + result_to_csv(self.md5_compare, self.summary_compare, self.stack_mode, self.to_csv_result, file_out) + + # 递归比较NPU树中的节点,如果在Bench中找到具有相同type的节点,检查它们的祖先 + def compare_nodes(self, node_n): + if node_n.type in self.b_nodes_by_value: + for node_b in self.b_nodes_by_value[node_n.type]: + # 检查两个节点是否有完全相同的祖先链 + flag, ancestors = self.have_same_ancestors(node_n, node_b) + flag = flag and node_n.data_info == node_b.data_info + if flag: + # 如果祖先链相同,data_info相同,将node_b及其祖先添加到node_n的bench_ancestors属性中 + ancestors.append(node_b.type) + node_n.matched_node_link = ancestors + # 不copy一下转成yaml会有乱码 + node_b.matched_node_link = ancestors.copy() + # compare + # 真实数据比对只会得到基本信息,并没有精度指标,需要调用多进程比对接口 + compare_result_list = self.compare_node(node_n, node_b) + if compare_result_list: + self.to_csv_result.extend(compare_result_list) + self.add_compare_result_to_node(node_n, compare_result_list) + + for subnode in node_n.subnodes: + self.compare_nodes(subnode) + + # 将比对结果添加到节点的输入输出数据中,正常来说输入输出数据和比对结果数据数量是一致的 + def add_compare_result_to_node(self, node, compare_result_list): + # 真实数据比对,先暂存节点,在多进程比对得到精度指标后,再将指标添加到节点 + if self.real_data_compare: + self.real_data_compare_nodes.append(node) + return + compare_in_dict = {} + compare_out_dict = {} + # input和output比对数据分开 + for item in compare_result_list: + if 'output' in item[0]: + compare_out_dict[item[0]] = item + else: + compare_in_dict[item[0]] = item + if self.md5_compare: + precision_status_in = self.add_md5_compare_data(node.input_data, compare_in_dict) + precision_status_out = self.add_md5_compare_data(node.output_data, compare_out_dict) + # 所有输入输出md5比对通过,这个节点才算通过 + precision_status = precision_status_in and precision_status_out + node.data['precision_status'] = precision_status + # md5比对通过为1,否则0 + node.data['precision_index'] = 1 if precision_status else 0 + node.data['md5 Compare Result'] = 
CompareConst.PASS if precision_status else CompareConst.DIFF + elif self.summary_compare: + precision_status_in, precision_index_in = self.add_summary_compare_data(node.input_data, compare_in_dict) + precision_status_out, precision_index_out = self.add_summary_compare_data(node.output_data, + compare_out_dict) + precision_status = precision_status_in and precision_status_out + precision_index = min(precision_index_in, precision_index_out) + node.data['precision_status'] = precision_status + node.data['precision_index'] = precision_index + + def add_summary_compare_data(self, node_data, compare_data_dict): + precision_status = True + precision_index = 1 + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + compare_data = compare_data_dict.get(key) + if compare_data: + key_list = [CompareConst.MAX_DIFF, CompareConst.MIN_DIFF, CompareConst.MEAN_DIFF, + CompareConst.NORM_DIFF] + # 取npu和bench数据进行比较,用完删除 + del_list = [CompareConst.NPU_MAX, CompareConst.NPU_MIN, CompareConst.NPU_MEAN, + CompareConst.NPU_NORM, CompareConst.BENCH_MAX, CompareConst.BENCH_MIN, + CompareConst.BENCH_MEAN, CompareConst.BENCH_NORM] + key_list.extend(del_list) + id_list = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] + self._match_data(value, compare_data, key_list, id_list) + # summary比对是否通过 + precision_status, precision_index = self._summary_compare_judgment(value, precision_status, + precision_index) + self._del_item_by_list(value, del_list) + node_data[key] = value + return precision_status, precision_index + + @staticmethod + def _summary_compare_judgment(data_dict, precision_status, precision_index): + item_dict = {(CompareConst.NPU_MAX, CompareConst.BENCH_MAX): (CompareConst.MAX_DIFF, 'Max Magnitude Diff'), + (CompareConst.NPU_MIN, CompareConst.BENCH_MIN): (CompareConst.MIN_DIFF, 'Min Magnitude Diff'), + (CompareConst.NPU_MEAN, CompareConst.BENCH_MEAN): (CompareConst.MEAN_DIFF, 'Mean Magnitude Diff'), + (CompareConst.NPU_NORM, CompareConst.BENCH_NORM): ( + 
CompareConst.NORM_DIFF, 'L2norm Magnitude Diff')} + for key, value in item_dict.items(): + if isinstance(data_dict.get(key[0]), (float, int)) and isinstance(data_dict.get(key[1]), (float, int)) \ + and isinstance(data_dict.get(value[0]), (float, int)): + magnitude_diff = abs(data_dict.get(value[0])) / ( + max(abs(data_dict.get(key[0])), abs(data_dict.get(key[1]))) + 1e-10) + magnitude_diff = 1 if magnitude_diff > 1 else magnitude_diff + data_dict[value[1]] = magnitude_diff + if magnitude_diff > 0.3: + precision_status = False + precision_index = 1 - max(precision_index, magnitude_diff) + return precision_status, precision_index + + def add_md5_compare_data(self, node_data, compare_data_dict): + precision_status = True + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + compare_data = compare_data_dict.get(key) + if compare_data: + key_list = ['md5 Compare Result'] + id_list = [8] + self._match_data(value, compare_data, key_list, id_list) + # md5比对是否通过 + if value.get('md5 Compare Result') != CompareConst.PASS: + precision_status = False + node_data[key] = value + return precision_status + + def add_real_compare_data(self, node_data, compare_data_dict): + min_thousandth = float(1) + numbers = [] + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + # self._del_item(value) + compare_data = compare_data_dict.get(key) + if compare_data: + key_list = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, + CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] + id_list = [6, 7, 8, 9, 10] + self._match_data(value, compare_data, key_list, id_list) + # 获取一个节点所有的输入或输出最小的双千指标 + thousandth = value.get(CompareConst.ONE_THOUSANDTH_ERR_RATIO) + # 可能是None,可能是非数字内容str + try: + thousandth = float(thousandth) + except (ValueError, TypeError): + thousandth = None + if thousandth is not None: + numbers.append(thousandth) + node_data[key] = value + # 双千指标都是None的异常情况 + if 
not numbers: + min_thousandth = None + else: + min_thousandth = min(numbers + [min_thousandth]) + return min_thousandth + + @staticmethod + def add_real_compare_node_error_key(node_data): + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + value['error_key'] = [CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] + node_data[key] = value + + @staticmethod + def add_real_compare_suggestions(node): + if node.op == NodeOp.module: + node.suggestions['text'] = Suggestions.Module + node.suggestions[Suggestions.PTDBG] = Suggestions.PTDBG_URL + if node.op == NodeOp.function_api: + node.suggestions['text'] = Suggestions.API + node.suggestions[Suggestions.API_ACCURACY_CHECKER] = Suggestions.API_ACCURACY_CHECKER_URL + + def get_tool_tip(self): + if self.summary_compare: + tips = { + CompareConst.MAX_DIFF: ToolTip.MAX_DIFF, + CompareConst.MIN_DIFF: ToolTip.MIN_DIFF, + CompareConst.MEAN_DIFF: ToolTip.MEAN_DIFF, + CompareConst.NORM_DIFF: ToolTip.NORM_DIFF} + elif self.md5_compare: + tips = { + Const.MD5: ToolTip.MD5} + else: + tips = { + CompareConst.ONE_THOUSANDTH_ERR_RATIO: ToolTip.ONE_THOUSANDTH_ERR_RATIO, + CompareConst.COSINE: ToolTip.COSINE, + CompareConst.MAX_ABS_ERR: ToolTip.MAX_ABS_ERR, + CompareConst.MAX_RELATIVE_ERR: ToolTip.MAX_RELATIVE_ERR} + return json.dumps(tips) + + @staticmethod + def _match_data(data_dict, compare_data, key_list, id_list): + if len(key_list) != len(id_list): + return + for i, key in enumerate(key_list): + data = compare_data[id_list[i]] + if data is not None and 'nan' not in str(data): + data_dict[key] = compare_data[id_list[i]] + + @staticmethod + def _del_item_by_list(data_dict, del_list): + if isinstance(data_dict, dict): + for item in del_list: + if item in data_dict: + del data_dict[item] + + # 比较两个节点的所有祖先是否相同 + def have_same_ancestors(self, node_a, node_b): + ancestors_a = self.get_ancestors(node_a) + ancestors_b = self.get_ancestors(node_b) + return ancestors_a == ancestors_b, 
ancestors_a + + def parse_node(self, node, data_dict): + op_parsed_list = read_op(data_dict.get(node.type, {}), node.type) + if node.type in self.stack_json_data: + op_parsed_list.append( + {'full_op_name': node.type, 'full_info': self.stack_json_data[node.type]}) + else: + op_parsed_list.append({'full_op_name': node.type, 'full_info': None}) + return merge_tensor(op_parsed_list, self.summary_compare, self.md5_compare) + + def compare_node(self, node_n, node_b): + result = [] + merge_n = self.parse_node(node_n, self.data_n_dict) + merge_b = self.parse_node(node_b, self.data_b_dict) + get_accuracy(result, merge_n, merge_b, self.summary_compare, self.md5_compare) + return result diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py index e69de29bb2..4ef02b9aef 100644 --- a/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py +++ b/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py @@ -0,0 +1,94 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re +import json + + +class BaseNode: + def __init__(self, node_op, node_type, up_node=None, is_forward=True): + self.op = node_op + self.type = node_type + self.id = node_type + self.data = {} + self.outputs = [] + self.inputs = [] + self.output_data = {} + self.input_data = {} + self.upnode = up_node + self.subnodes = [] + if up_node: + up_node.add_subnode(self) + self.is_forward = is_forward + self.pair = None + self.matched_node_link = [] + self.data_info = '' + self.suggestions = {} + + def __str__(self): + info = f'id:\t{self.id}' + return info + + def get_info(self): + info = f'{self.id}\t{self.op}' + if not self.is_forward: + info += '(b)' + for key in self.data: + info += f'\n{key}:\t{self.data.get(key)}' + return info + + def add_subnode(self, node): + if node.id == self.id: + return + self.subnodes.append(node) + + def get_yaml_dict(self): + result = {} + result['id'] = self.id + result['node_type'] = self.op.value + result['type'] = self.type + result['data'] = json.dumps(self.data) + result['output_data'] = self._del_item(self.output_data) + result['input_data'] = self._del_item(self.input_data) + result['outputs'] = [(edge_id, node.id) for edge_id, node in self.outputs] + result['inputs'] = [(edge_id, node.id) for edge_id, node in self.inputs] + result['upnode'] = self.upnode.id if self.upnode else 'None' + result['subnodes'] = [node.id for node in self.subnodes] + result['is_forward'] = self.is_forward + result['pair'] = self.pair.id if self.pair else 'None' + result['matched_node_link'] = self.matched_node_link + result['suggestions'] = json.dumps(self.suggestions) + return result + + @staticmethod + def _del_item(data_dict): + del_list = ['requires_grad', 'data_name', 'full_op_name'] + for key, value in data_dict.items(): + if not isinstance(value, dict): + continue + for item in del_list: + if item in value: + del value[item] + BaseNode._formate_floats(value) + # 匹配冒号后面跟着的,由负数符号、字母和空格组成的字符串,直到遇到逗号或闭合大括号为止,例如将"A": a替换成"A": "a" + 
data_dict[key] = re.sub(r'(?<=: )([-a-zA-Z\s]+)(?=[,}])', r'"\1"', json.dumps(value)) + + return data_dict + + @staticmethod + def _formate_floats(data_dict): + for key, value in data_dict.items(): + if isinstance(value, float): + data_dict[key] = round(value, 6) diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py index e69de29bb2..54cdd08db6 100644 --- a/debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py +++ b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph.py @@ -0,0 +1,28 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Graph: + def __init__(self): + self.root = None + self.recent_node = None + self.depth = 0 + self.node_map = {} + self.rawid_map = {} + + def __str__(self): + infos = [f'{str(self.node_map.get(node_id))}' for node_id in self.node_map] + info = "\n".join(infos) + return info diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py index e69de29bb2..c1f259b639 100644 --- a/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py +++ b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py @@ -0,0 +1,54 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import yaml + +from ....core.file_check_util import FileOpen + + +class GraphBuilder: + + @staticmethod + def export_to_yaml(filename, graph): + result = {} + result['root'] = graph.root.id if graph.root else 'None' + result['node'] = {} + GraphBuilder._export_dfs(graph.root, result['node']) + with FileOpen(filename, 'w') as f: + yaml.dump(data=result, stream=f, allow_unicode=True, encoding='utf-8') + + @staticmethod + def get_graph_result(graph): + result = {} + result['root'] = graph.root.id if graph.root else 'None' + result['node'] = {} + GraphBuilder._export_dfs(graph.root, result['node']) + return result + + @staticmethod + def export_graphs_to_yaml(filename, graph_n, graph_b, tool_tip): + result = {} + result['NPU'] = GraphBuilder.get_graph_result(graph_n) + result['Bench'] = GraphBuilder.get_graph_result(graph_b) + result['Tooltip'] = tool_tip + with FileOpen(filename, 'w') as f: + yaml.dump(data=result, stream=f, allow_unicode=True, encoding='utf-8') + + @staticmethod + def _export_dfs(node, result): + info = node.get_yaml_dict() + result[node.id] = info + for subnode in node.subnodes: + GraphBuilder._export_dfs(subnode, result) diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py index e69de29bb2..f81a52b995 100644 --- a/debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py +++ 
b/debug/accuracy_tools/atat/pytorch/visualization/graph/node_op.py @@ -0,0 +1,24 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum + + +class NodeOp(Enum): + module = 1 + function_api = 2 + module_api = 3 + tensor = 4 + output = 5 diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py b/debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py index e69de29bb2..7b022a34da 100644 --- a/debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py +++ b/debug/accuracy_tools/atat/pytorch/visualization/graph_utils.py @@ -0,0 +1,23 @@ +class ToolTip: + MAX_DIFF = 'NPU与标杆API统计信息比对,最大值的差值' + MIN_DIFF = 'NPU与标杆API统计信息比对,最小值的差值' + MEAN_DIFF = 'NPU与标杆API统计信息比对,平均值的差值' + NORM_DIFF = 'NPU与标杆API统计信息比对,2范数(平方根)的差值' + MAX_MAGNITUDE_DIFF = 'NPU与标杆API统计信息比对,最大值的差值相对误差' + MIN_MAGNITUDE_DIFF = 'NPU与标杆API统计信息比对,最小值的差值相对误差' + MEAN_MAGNITUDE_DIFF = 'NPU与标杆API统计信息比对,平均值的差值相对误差' + NORM_MAGNITUDE_DIFF = 'NPU与标杆API统计信息比对,2范数(平方根)的差值相对误差' + MD5 = '数据MD5信息,用于比较两个数据信息是否完全一致' + ONE_THOUSANDTH_ERR_RATIO = 'Tensor中的元素逐个与对应的标杆数据对比,相对误差大于千分之一的比例占总元素个数的比例小于千分之一' + COSINE = '通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0' + MAX_ABS_ERR = '当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001' + MAX_RELATIVE_ERR = '当最大相对误差越接近0表示其计算的误差越小。当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象' + + +class Suggestions: + Module = 
'此模块精度比对结果疑似异常,请使用ptdbg工具对模块中的api进行dump比对'
+    API = '此api精度比对结果疑似异常,请使用api accuracy checker工具对api进行精度检测'
+    PTDBG = 'ptdbg工具'
+    PTDBG_URL = 'https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/ptdbg_ascend'
+    API_ACCURACY_CHECKER = 'api accuracy checker工具'
+    API_ACCURACY_CHECKER_URL = 'https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/api_accuracy_checker'
diff --git a/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py b/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py
index e69de29bb2..34d1cb8fe6 100644
--- a/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py
+++ b/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py
@@ -0,0 +1,200 @@
+import os
+import json
+import re
+import time
+import pandas as pd
+from .graph.graph import Graph
+from .graph.base_node import BaseNode
+from .graph.node_op import NodeOp
+from .graph.graph_builder import GraphBuilder
+from ..compare.acc_compare import read_op, task_dumppath_get, _do_multi_process
+from ..common.utils_compare import add_time_as_suffix
+from .compare_tree import CompareTree
+
+
+def _load_json_file(file_path):
+    try:
+        with open(file_path, 'r') as file:
+            file_dict = json.load(file)
+            if not isinstance(file_dict, dict):
+                return {}
+            return file_dict
+    except json.JSONDecodeError:
+        return {}
+
+
+def _get_data_inputs_outputs(data_dict: dict):
+    input_args = data_dict.get('input_args', [])
+    input_kwargs = data_dict.get('input_kwargs', {})
+    output = data_dict.get('output', [])
+
+    input_args = input_args if isinstance(input_args, list) else []
+    input_kwargs = input_kwargs if isinstance(input_kwargs, dict) else {}
+    output = output if isinstance(output, list) else []
+    return input_args, input_kwargs, output
+
+
+# Parse the node data with the ptdbg compare read_op helper and split it into input and output entries
+def _add_node_data(node_data, node):
+    input_data = {}
+    output_data = {}
+    op_parsed_list = read_op(node_data, node.type)
+    for item in op_parsed_list:
+        full_op_name = item.get('full_op_name', '')
+        if 'output' in full_op_name:
+            output_data[full_op_name] = item
+        else:
+            input_data[full_op_name] = item
+    node.input_data = input_data
+    node.output_data = output_data
+
+
+def _get_data_info(item):
+    if isinstance(item, dict):
+        return str(item.get('type', 'na')) + '_' + str(item.get('dtype', 'na')) + '_' + str(item.get('shape', 'na'))
+    elif isinstance(item, (list, tuple)):
+        return str([_get_data_info(sub_item) for sub_item in item])
+    return ''
+
+
+# Node signature: type, dtype and shape of all inputs/outputs must match. NOTE(review): input_kwargs is iterated by key - confirm
+def _get_node_data_info(input_args, input_kwargs, output):
+    return _process_node_data_info(input_args) + _process_node_data_info(input_kwargs) + _process_node_data_info(output)
+
+
+def _get_node_op(node_name: str):
+    pattern = r'^(Tensor|Torch|Functional|NPU|VF|Distributed|Aten)'
+    match = re.match(pattern, node_name)
+    if match:
+        return NodeOp.function_api
+    else:
+        return NodeOp.module
+
+
+def build_tree(construct_dict, data_dict, root_name):
+    # Dictionary of the nodes created so far, keyed by name, so they can be reused
+    created_nodes = {}
+    root_node = BaseNode(NodeOp.module, root_name)
+
+    # Helper that creates the node for a name, or returns the one already created
+    def get_or_create_node(op, name, up_node=None):
+        if name not in created_nodes:
+            # add data
+            base_node = BaseNode(op, name, up_node)
+            node_data = data_dict.get(name, {})
+            input_args, input_kwargs, output = _get_data_inputs_outputs(node_data)
+            # Attach the input and output data
+            _add_node_data(node_data, base_node)
+
+            # Attach the identifier built from the input/output data info; it is used to match NPU and bench nodes
+            data_info = _get_node_data_info(input_args, input_kwargs, output)
+            base_node.data_info = data_info
+            created_nodes[name] = base_node
+        elif up_node:
+            # The node already exists, but its parent node is only known now
+            created_nodes[name].upnode = up_node
+            up_node.add_subnode(created_nodes[name])
+        return created_nodes[name]
+
+    # Walk the dictionary and create or fetch a node for every key-value pair
+    for subnode, upnode in construct_dict.items():
+        if upnode:
+            up_node = get_or_create_node(_get_node_op(upnode), upnode)
+        else:
+            up_node = root_node
+        get_or_create_node(_get_node_op(subnode), subnode, up_node)
+
+    return root_node, created_nodes
+
+
+def build_graph(construct_path, data_path, output_path):
+    construct_dict = _load_json_file(construct_path)
+    data_dict = _load_json_file(data_path).get('data', {})
+    root_node, created_nodes = build_tree(construct_dict, data_dict, 'root_node')
+    graph = Graph()
+    graph.root = root_node
+    graph.node_map = created_nodes
+    GraphBuilder.export_to_yaml(output_path, graph)
+
+
+def do_compare_graph(construct_n_path, data_n_path, construct_b_path, data_b_path, stack_path, output_path, csv_path):
+    dump_path_param = {
+        "npu_json_path": data_n_path,
+        "bench_json_path": data_b_path,
+        "stack_json_path": stack_path,
+        "is_print_compare_log": True
+    }
+    # Determine the comparison mode (both flags False means real-data comparison, see below)
+    summary_compare, md5_compare = task_dumppath_get(dump_path_param)
+
+    construct_n_dict = _load_json_file(construct_n_path)
+    data_n_dict = _load_json_file(data_n_path).get('data', {})
+    root_n_node, created_n_nodes = build_tree(construct_n_dict, data_n_dict, 'NPU')
+    construct_b_dict = _load_json_file(construct_b_path)
+    data_b_dict = _load_json_file(data_b_path).get('data', {})
+    root_b_node, created_b_nodes = build_tree(construct_b_dict, data_b_dict, 'NPU')  # NOTE(review): bench root also named 'NPU'?
+    stack_json_data = _load_json_file(stack_path)
+
+    start_time = time.time()
+    compare_tree = CompareTree(root_n_node, root_b_node, data_n_dict, data_b_dict, stack_json_data, csv_path,
+                               summary_compare, md5_compare, True)
+    end_time = time.time()
+    print('compare_tree', end_time - start_time)
+    compare_tree.result_to_csv()
+
+    if summary_compare is False and md5_compare is False:
+        # Real-data comparison: run the multi-process comparison to get the precision metrics and write them into the csv
+        _do_multi_process(dump_path_param, csv_path)
+        # Read the precision metrics back from the csv file and add them to the nodes
+        df = pd.read_csv(csv_path)
+        compare_data_dict = {row[0]: row.tolist() for index, row in df.iterrows()}
+        for node in compare_tree.real_data_compare_nodes:
+            min_thousandth_in = compare_tree.add_real_compare_data(node.input_data, compare_data_dict)
+            min_thousandth_out = compare_tree.add_real_compare_data(node.output_data, compare_data_dict)
+            if min_thousandth_in and min_thousandth_out:  # NOTE(review): a value of 0 is falsy and takes the else branch
+                change_percentage = abs(min_thousandth_in - min_thousandth_out)
+            else:
+                change_percentage = 0
+            precision_status = True
+            if change_percentage > 0.1:
+                precision_status = False
+                # Precision not met: flag the two thousandth error-ratio metrics so they are highlighted in red
+                CompareTree.add_real_compare_node_error_key(node.output_data)
+                # Add suggestions
+                CompareTree.add_real_compare_suggestions(node)
+            node.data['precision_status'] = precision_status
+            node.data['precision_index'] = 0 if change_percentage > 1 else 1 - change_percentage
+
+    graph_n = Graph()
+    graph_n.root = root_n_node
+    graph_n.node_map = created_n_nodes
+    graph_b = Graph()  # NOTE(review): graph_b.node_map is not assigned in this function - confirm
+    graph_b.root = root_b_node
+    graph_n.node_map = created_b_nodes
+    start_time = time.time()
+    GraphBuilder.export_graphs_to_yaml(output_path, graph_n, graph_b, compare_tree.get_tool_tip())
+    end_time = time.time()
+    print('export_graphs_to_yaml', end_time - start_time)
+
+
+def compare_graph(dump_path_n, dump_path_b, out_path):
+    g_dir = dump_path_n
+    g_dir_b = dump_path_b
+    g_construct_path = f'{g_dir}/construct.json'
+    g_construct_path1 = f'{g_dir_b}/construct.json'
+    g_data_path = f'{g_dir}/dump.json'
+    g_data_path1 = f'{g_dir_b}/dump.json'
+    g_stack_path = f'{g_dir}/stack.json'
+    g_output_path = f'{out_path}/export.vis'
+    g_csv_dir = f'{out_path}'
+    g_file_name = add_time_as_suffix("compare_result")
+    g_csv_path = os.path.join(os.path.realpath(g_csv_dir), g_file_name)
+    do_compare_graph(g_construct_path, g_data_path, g_construct_path1, g_data_path1, g_stack_path, g_output_path,
+                     g_csv_path)
--
Gitee

From 9bbc3b95bb8a29699fb2749f59794f1b82cb0f61 Mon Sep 17 00:00:00 2001
From: l30044004
Date: Wed, 5 Jun 2024 17:52:25 +0800
Subject: [PATCH 3/3] =?UTF-8?q?graph=E7=94=9F=E6=88=90=E5=92=8C=E6=AF=94?= =?UTF-8?q?=E5=AF=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
.../atat/pytorch/visualization/compare_tree.py | 11 +++++++++++ .../atat/pytorch/visualization/graph/base_node.py | 10 +++------- .../atat/pytorch/visualization/graph/graph_builder.py | 11 +++++------ .../atat/pytorch/visualization/json_parse_graph.py | 4 ++-- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py b/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py index 25682f48ce..10455b767b 100644 --- a/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py +++ b/debug/accuracy_tools/atat/pytorch/visualization/compare_tree.py @@ -103,6 +103,9 @@ class CompareTree: precision_index = min(precision_index_in, precision_index_out) node.data['precision_status'] = precision_status node.data['precision_index'] = precision_index + if not precision_status: + self.add_summary_compare_node_error_key(node.output_data) + self.add_real_compare_suggestions(node) def add_summary_compare_data(self, node_data, compare_data_dict): precision_status = True @@ -201,6 +204,14 @@ class CompareTree: value['error_key'] = [CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] node_data[key] = value + @staticmethod + def add_summary_compare_node_error_key(node_data): + for key, value in node_data.items(): + if not isinstance(value, dict): + continue + value['error_key'] = ['Max Magnitude Diff', 'Min Magnitude Diff', 'Mean Magnitude Diff', 'L2norm Magnitude Diff'] + node_data[key] = value + @staticmethod def add_real_compare_suggestions(node): if node.op == NodeOp.module: diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py index 4ef02b9aef..490e964d1d 100644 --- a/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py +++ b/debug/accuracy_tools/atat/pytorch/visualization/graph/base_node.py @@ -13,9 +13,6 @@ # See the License for the specific language governing 
permissions and # limitations under the License. -import re -import json - class BaseNode: def __init__(self, node_op, node_type, up_node=None, is_forward=True): @@ -59,7 +56,7 @@ class BaseNode: result['id'] = self.id result['node_type'] = self.op.value result['type'] = self.type - result['data'] = json.dumps(self.data) + result['data'] = self.data result['output_data'] = self._del_item(self.output_data) result['input_data'] = self._del_item(self.input_data) result['outputs'] = [(edge_id, node.id) for edge_id, node in self.outputs] @@ -69,7 +66,7 @@ class BaseNode: result['is_forward'] = self.is_forward result['pair'] = self.pair.id if self.pair else 'None' result['matched_node_link'] = self.matched_node_link - result['suggestions'] = json.dumps(self.suggestions) + result['suggestions'] = self.suggestions return result @staticmethod @@ -82,8 +79,6 @@ class BaseNode: if item in value: del value[item] BaseNode._formate_floats(value) - # 匹配冒号后面跟着的,由负数符号、字母和空格组成的字符串,直到遇到逗号或闭合大括号为止,例如将"A": a替换成"A": "a" - data_dict[key] = re.sub(r'(?<=: )([-a-zA-Z\s]+)(?=[,}])', r'"\1"', json.dumps(value)) return data_dict @@ -92,3 +87,4 @@ class BaseNode: for key, value in data_dict.items(): if isinstance(value, float): data_dict[key] = round(value, 6) + data_dict[key] = str(value).replace("'", "") diff --git a/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py index c1f259b639..22bb2739f9 100644 --- a/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py +++ b/debug/accuracy_tools/atat/pytorch/visualization/graph/graph_builder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -import yaml +import json from ....core.file_check_util import FileOpen @@ -21,13 +20,13 @@ from ....core.file_check_util import FileOpen class GraphBuilder: @staticmethod - def export_to_yaml(filename, graph): + def export_to_json(filename, graph): result = {} result['root'] = graph.root.id if graph.root else 'None' result['node'] = {} GraphBuilder._export_dfs(graph.root, result['node']) with FileOpen(filename, 'w') as f: - yaml.dump(data=result, stream=f, allow_unicode=True, encoding='utf-8') + f.write(json.dumps(result, indent=4)) @staticmethod def get_graph_result(graph): @@ -38,13 +37,13 @@ class GraphBuilder: return result @staticmethod - def export_graphs_to_yaml(filename, graph_n, graph_b, tool_tip): + def export_graphs_to_json(filename, graph_n, graph_b, tool_tip): result = {} result['NPU'] = GraphBuilder.get_graph_result(graph_n) result['Bench'] = GraphBuilder.get_graph_result(graph_b) result['Tooltip'] = tool_tip with FileOpen(filename, 'w') as f: - yaml.dump(data=result, stream=f, allow_unicode=True, encoding='utf-8') + f.write(json.dumps(result, indent=4)) @staticmethod def _export_dfs(node, result): diff --git a/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py b/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py index 34d1cb8fe6..86b3c23e08 100644 --- a/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py +++ b/debug/accuracy_tools/atat/pytorch/visualization/json_parse_graph.py @@ -121,7 +121,7 @@ def build_graph(construct_path, data_path, output_path): graph = Graph() graph.root = root_node graph.node_map = created_nodes - GraphBuilder.export_to_yaml(output_path, graph) + GraphBuilder.export_to_json(output_path, graph) def do_compare_graph(construct_n_path, data_n_path, construct_b_path, data_b_path, stack_path, output_path, csv_path): @@ -179,7 +179,7 @@ def do_compare_graph(construct_n_path, data_n_path, construct_b_path, data_b_pat graph_b.root = root_b_node graph_n.node_map = created_b_nodes 
start_time = time.time() - GraphBuilder.export_graphs_to_yaml(output_path, graph_n, graph_b, compare_tree.get_tool_tip()) + GraphBuilder.export_graphs_to_json(output_path, graph_n, graph_b, compare_tree.get_tool_tip()) end_time = time.time() print('export_graphs_to_yaml', end_time - start_time) -- Gitee