From 28b514acdc697375149cbcda1c07cc5f657f86ed Mon Sep 17 00:00:00 2001 From: s30048155 Date: Mon, 16 Oct 2023 16:44:01 +0800 Subject: [PATCH 01/16] update FAQ --- debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md | 38 ++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md index e571e64d15d..260ca77aedc 100644 --- a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md +++ b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md @@ -6,6 +6,36 @@ ``` __version__ = '3.4' ``` +### 2.dump指定操作 +如果需要dump融合算子或者切片操作的输入输出,需要在att/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml中进行手动添加,切片操作在tensor:下添加 +``` +- __getitem__ +``` +融合算子在torch_npu:下自行添加。目前默认支持的融合算子包括: +``` +- npu_scaled_masked_softmax +- torch_npu.npu_rotary_mul +- torch_npu.npu_roi_align +- torch_npu.npu_roi_alignbk +- npu_ptiou +``` +## 常见问题 + +### 1. 在同一个目录多次执行dump会冲突吗? + +会,同一个目录多次dump,会覆盖上一次结果,可以使用dump_tag参数修改dump目录名称。 + +### 2. 一个网络中包含多个model,register hook中传入哪一个model? + +传入任意一个model即可,工具会自动dump所有model。 + +### 3. 如何dump算子级的数据? + +需要使用acl dump模式,即在dump操作中配置mode="acl"或dump_mode='acl'。 + +### 4. 工具比对发现NPU和标杆数据的API无法完全对齐? + +torch版本和硬件差异属于正常情况 ## 异常情况 ### 1. 单机多卡场景dump目录下只生成一个rank目录或pkl文件格式损坏 @@ -103,3 +133,11 @@ compare(dump_result_param, "./output", stack_mode=True) - matmul期望的输入是二维,当输入不是二维时,会将输入通过view操作展成二维,再进行matmul运算,因此在反向求导时,backward_hook能拿到的是UnsafeViewBackward这步操作里面数据的梯度信息,取不到MmBackward这步操作里面数据的梯度信息,即权重的反向梯度数据。 - 典型的例子有,当linear的输入不是二维,且无bias时,会调用output = input.matmul(weight.t()),因此拿不到linear层的weight的反向梯度数据。 + +### 13. 使用dataloader后raise异常Exception: ptdbg: exit after iteration [x, x, x] + +- 正常现象,dataloader通过raise结束程序,堆栈信息可忽略。 + +### 14. 
工具报错:AssertionError: Please register hooks to nn.Module + +- 请在model示例化之后配置register hook。 \ No newline at end of file -- Gitee From 6e26f981c4cdcfd17124feae77f507513f4ae34e Mon Sep 17 00:00:00 2001 From: s30048155 Date: Mon, 16 Oct 2023 16:44:10 +0800 Subject: [PATCH 02/16] update yaml --- .../python/ptdbg_ascend/hook_module/support_wrap_ops.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml index 180d0cdac86..0f59e4b87a5 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml @@ -146,7 +146,6 @@ tensor: - __eq__ - __ge__ - __gt__ - - __getitem__ - __iadd__ - __iand__ - __idiv__ @@ -1053,6 +1052,11 @@ torch_npu: - npu_sign_bits_pack - npu_sign_bits_unpack - npu_flash_attention + - npu_scaled_masked_softmax + - torch_npu.npu_rotary_mul + - torch_npu.npu_roi_align + - torch_npu.npu_roi_alignbk + - npu_ptiou distributed: - send -- Gitee From 98f2321d292500e1fdc7ce08726b39c9d148acff Mon Sep 17 00:00:00 2001 From: s30048155 Date: Mon, 16 Oct 2023 17:01:11 +0800 Subject: [PATCH 03/16] update --- debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md index 260ca77aedc..bc2b4dadeed 100644 --- a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md +++ b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md @@ -11,7 +11,17 @@ __version__ = '3.4' ``` - __getitem__ ``` -融合算子在torch_npu:下自行添加。目前默认支持的融合算子包括: +如果融合算子无法dump,需要手动添加到support_wrap_ops.yaml中,比如以下算子: +``` +def npu_forward_fused_softmax(self, input_, mask): + resl = torch_npu.npu_scaled_masked_softmax(input_, mask, self.scale, 
False) + return resl +``` +需要在support_wrap_ops.yaml中的torch_npu: 中自行添加该融合算子即可: +``` +- npu_scaled_masked_softmax +``` +目前已默认支持的融合算子包括: ``` - npu_scaled_masked_softmax - torch_npu.npu_rotary_mul -- Gitee From 49ccbb0fcf8c68738a1c4ae99f7c29cf1f0f5b6d Mon Sep 17 00:00:00 2001 From: sunyiming Date: Mon, 16 Oct 2023 09:06:41 +0000 Subject: [PATCH 04/16] update debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md. Signed-off-by: sunyiming --- debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md | 25 +++++++++++--------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md index bc2b4dadeed..ec355a99f14 100644 --- a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md +++ b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md @@ -7,28 +7,31 @@ __version__ = '3.4' ``` ### 2.dump指定操作 -如果需要dump融合算子或者切片操作的输入输出,需要在att/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml中进行手动添加,切片操作在tensor:下添加 +dump指定操作当前支持dump融合算子和dump切片操作的输入输出,需要在att/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml中添加如下代码: +--切片操作 + 在tensor:下添加: ``` - __getitem__ ``` -如果融合算子无法dump,需要手动添加到support_wrap_ops.yaml中,比如以下算子: +--融合算子 + 在torch_npu:下添加融合算子名称,当前默认支持的融合算子包括: +``` +- npu_scaled_masked_softmax +- torch_npu.npu_rotary_mul +- torch_npu.npu_roi_align +- torch_npu.npu_roi_alignbk +- npu_ptiou +``` +例如: ``` def npu_forward_fused_softmax(self, input_, mask): resl = torch_npu.npu_scaled_masked_softmax(input_, mask, self.scale, False) return resl ``` -需要在support_wrap_ops.yaml中的torch_npu: 中自行添加该融合算子即可: +需要在support_wrap_ops.yaml中的torch_npu: 中添加: ``` - npu_scaled_masked_softmax ``` -目前已默认支持的融合算子包括: -``` -- npu_scaled_masked_softmax -- torch_npu.npu_rotary_mul -- torch_npu.npu_roi_align -- torch_npu.npu_roi_alignbk -- npu_ptiou -``` ## 常见问题 ### 1. 在同一个目录多次执行dump会冲突吗? 
-- Gitee From c96251e4304cd7206f0c5ca14c89f0246a55a304 Mon Sep 17 00:00:00 2001 From: sunyiming Date: Mon, 16 Oct 2023 09:08:06 +0000 Subject: [PATCH 05/16] update debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md. Signed-off-by: sunyiming --- debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md | 1 + 1 file changed, 1 insertion(+) diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md index ec355a99f14..08a4c8dd822 100644 --- a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md +++ b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md @@ -8,6 +8,7 @@ __version__ = '3.4' ``` ### 2.dump指定操作 dump指定操作当前支持dump融合算子和dump切片操作的输入输出,需要在att/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml中添加如下代码: + --切片操作 在tensor:下添加: ``` -- Gitee From 5e9f5e2b64970f9b48cb24d3f975ce713730db98 Mon Sep 17 00:00:00 2001 From: s30048155 Date: Tue, 17 Oct 2023 10:03:20 +0800 Subject: [PATCH 06/16] update --- debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md index bc2b4dadeed..0b7e3d8ee28 100644 --- a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md +++ b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md @@ -17,18 +17,11 @@ def npu_forward_fused_softmax(self, input_, mask): resl = torch_npu.npu_scaled_masked_softmax(input_, mask, self.scale, False) return resl ``` -需要在support_wrap_ops.yaml中的torch_npu: 中自行添加该融合算子即可: +调用了需要在如果需要dump其中调用的npu_scaled_masked_softmax算子的输入输出信息,需要在support_wrap_ops.yaml中的torch_npu: 中自行添加该融合算子即可: ``` - npu_scaled_masked_softmax ``` -目前已默认支持的融合算子包括: -``` -- npu_scaled_masked_softmax -- torch_npu.npu_rotary_mul -- torch_npu.npu_roi_align -- torch_npu.npu_roi_alignbk -- npu_ptiou -``` +(仅作举例用,已默认支持) ## 常见问题 ### 1. 在同一个目录多次执行dump会冲突吗? 
-- Gitee From dd1654524d74c6fe1a69034aec4edae0a2d267b5 Mon Sep 17 00:00:00 2001 From: s30048155 Date: Tue, 17 Oct 2023 10:29:30 +0800 Subject: [PATCH 07/16] update --- debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md | 18 ++++++++---------- .../hook_module/support_wrap_ops.yaml | 1 + 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md index e6eae4d91d9..8e3588c3a9c 100644 --- a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md +++ b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md @@ -6,20 +6,14 @@ ``` __version__ = '3.4' ``` -### 2.dump指定操作 -dump指定操作当前支持dump融合算子和dump切片操作的输入输出,需要在att/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml中添加如下代码: - -切片操作,在tensor:下添加: -``` -- __getitem__ -``` -如果融合算子无法dump,需要手动添加到support_wrap_ops.yaml中,比如以下算子: +### 2.dump指定融合算子 +dump指定操作当前支持dump指定融合算子的输入输出,需要在att/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml中添加,比如以下算子: ``` def npu_forward_fused_softmax(self, input_, mask): resl = torch_npu.npu_scaled_masked_softmax(input_, mask, self.scale, False) return resl ``` -调用了需要在如果需要dump其中调用的npu_scaled_masked_softmax算子的输入输出信息,需要在support_wrap_ops.yaml中的torch_npu: 中自行添加该融合算子即可: +如果需要dump其中调用的npu_scaled_masked_softmax算子的输入输出信息,需要在support_wrap_ops.yaml中的torch_npu: 中自行添加该融合算子即可: ``` - npu_scaled_masked_softmax ``` @@ -146,4 +140,8 @@ compare(dump_result_param, "./output", stack_mode=True) ### 14. 工具报错:AssertionError: Please register hooks to nn.Module -- 请在model示例化之后配置register hook。 \ No newline at end of file +- 请在model示例化之后配置register hook。 + +### 15. 
添加ptdbg_ascend工具后截取操作报错:IndexError: too many indices for tensor of dimension x 类似的报错。 + +删除ptdbg_ascend工具的hook_module目录下yaml文件中Tensor:下的`- __getitem__`即可。 \ No newline at end of file diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml index 0f59e4b87a5..db2f2475e22 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml @@ -145,6 +145,7 @@ tensor: - __div__ - __eq__ - __ge__ + - __getitem__ - __gt__ - __iadd__ - __iand__ -- Gitee From c9717d0f7c2184c489dd178b02dc41344e4aac7c Mon Sep 17 00:00:00 2001 From: sunyiming Date: Tue, 17 Oct 2023 06:47:54 +0000 Subject: [PATCH 08/16] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20lib?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/.keep diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/.keep b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/.keep new file mode 100644 index 00000000000..e69de29bb2d -- Gitee From c1e1ce0d0cd31789885aa583bcbd0dc546b217fe Mon Sep 17 00:00:00 2001 From: sunyiming Date: Tue, 17 Oct 2023 06:49:24 +0000 Subject: [PATCH 09/16] =?UTF-8?q?=E8=BF=81=E7=A7=BBlib=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E5=A4=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: sunyiming --- .../ptdbg_ascend/parse_tool/lib/__init__.py | 0 .../ptdbg_ascend/parse_tool/lib/compare.py | 152 ++++++++++++ .../ptdbg_ascend/parse_tool/lib/config.py | 46 ++++ 
.../ptdbg_ascend/parse_tool/lib/file_desc.py | 31 +++ .../parse_tool/lib/interactive_cli.py | 67 ++++++ .../parse_tool/lib/parse_exception.py | 50 ++++ .../ptdbg_ascend/parse_tool/lib/parse_tool.py | 140 +++++++++++ .../ptdbg_ascend/parse_tool/lib/utils.py | 223 ++++++++++++++++++ .../parse_tool/lib/visualization.py | 87 +++++++ 9 files changed, 796 insertions(+) create mode 100644 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/__init__.py create mode 100644 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/compare.py create mode 100644 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/config.py create mode 100644 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/file_desc.py create mode 100644 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/interactive_cli.py create mode 100644 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_exception.py create mode 100644 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/parse_tool.py create mode 100644 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/utils.py create mode 100644 debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/visualization.py diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/__init__.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/compare.py new file mode 100644 index 00000000000..b7268a347c0 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/lib/compare.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2023. 
class Compare:
    """Compare and convert dump data, and compare pairs of ``.npy`` tensors.

    Directory-level comparison and dump conversion are delegated to the
    external msaccucmp script through ``Util.execute_command``; element-wise
    comparison of two ``.npy`` files is done in-process with NumPy.
    """

    def __init__(self):
        self.util = Util()
        self.log = self.util.log
        self.vector_compare_result = {}
        # Filled in lazily by ``call_msaccucmp``.
        self.msaccucmp = None

    @property
    def call_msaccucmp(self):
        """Return the msaccucmp script reference, resolving it on first use."""
        if not self.msaccucmp:
            self.msaccucmp = self.util.check_msaccucmp(Const.MS_ACCU_CMP_PATH)
        return self.msaccucmp

    def npu_vs_npu_compare(self, my_dump_path, golden_dump_path, result_dir):
        """Compare two dump directories and write the result to ``result_dir``."""
        self.log.info("Start Compare ...............")
        self.compare_vector(my_dump_path, golden_dump_path, result_dir)
        self.log.info("Compare finished!!")

    def compare_vector(self, my_dump_path, golden_dump_path, result_dir):
        """Run ``msaccucmp compare`` and return whatever ``execute_command`` returns."""
        self.util.create_dir(result_dir)
        self.util.check_path_valid(result_dir)
        # NOTE(review): the command is assembled by plain string formatting, so
        # paths containing spaces or shell metacharacters are not quoted.
        # Whether that matters depends on how Util.execute_command runs it - confirm.
        cmd = '%s %s compare -m %s -g %s -out %s' % (
            self.util.python, self.call_msaccucmp, my_dump_path, golden_dump_path, result_dir
        )
        return self.util.execute_command(cmd)

    def convert_dump_to_npy(self, dump_file, data_format, output):
        """Convert one dump file, or every file in a directory, and list the results.

        Args:
            dump_file: Path of a dump file or of a directory of dump files.
            data_format: Optional target format forwarded to ``convert``.
            output: Output directory; ``Const.DUMP_CONVERT_DIR`` when falsy.
        """
        file_name = ""
        if os.path.isfile(dump_file):
            self.log.info("Convert file is: %s", dump_file)
            file_name = os.path.basename(dump_file)
        elif os.path.isdir(dump_file):
            self.log.info("Convert all files in path: %s", dump_file)
        output = output if output else Const.DUMP_CONVERT_DIR
        self.util.check_path_valid(output)
        convert = self.convert(dump_file, data_format, output)
        # Only a zero return value is summarised; anything else is left to the
        # command's own output.
        if convert == 0:
            convert_files = self.util.list_convert_files(output, file_name)
            summary_txt = ["SrcFile: %s" % dump_file]
            summary_txt.extend(
                " - %s" % convert_file.file_name for convert_file in convert_files.values()
            )
            self.util.print_panel("\n".join(summary_txt))

    def convert(self, dump_file, data_format, output):
        """Run ``msaccucmp convert``; ``-f`` is only passed when a format is given."""
        self.util.create_dir(output)
        self.util.check_path_valid(output)
        if data_format:
            cmd = '%s %s convert -d %s -out %s -f %s' % (
                self.util.python, self.call_msaccucmp, dump_file, output, data_format
            )
        else:
            cmd = '%s %s convert -d %s -out %s' % (
                self.util.python, self.call_msaccucmp, dump_file, output
            )
        return self.util.execute_command(cmd)

    def compare_data(self, left, right, save_txt=False, rl=0.001, al=0.001, diff_count=20):
        """Load two ``.npy`` files, compare them and print a summary panel.

        Args:
            left: Path of the first ``.npy`` file.
            right: Path of the second ``.npy`` file.
            save_txt: When true, also dump both arrays to ``<path>.txt``.
            rl: Relative tolerance.
            al: Absolute tolerance.
            diff_count: Maximum number of rows shown in each detail table.

        Raises:
            ParseException: If a path is ``None`` or a file cannot be loaded.
        """
        if left is None or right is None:
            # The first positional argument of ParseException is the error code;
            # the human-readable text belongs in ``error_info``.
            raise ParseException(ParseException.PARSE_INVALID_DATA_ERROR, "invalid input or output")
        try:
            left_data = np.load(left)
            right_data = np.load(right)
        except UnicodeError as err:
            self.log.error("%s %s", "UnicodeError", err)
            self.log.warning("Please check the npy file")
            raise ParseException(ParseException.PARSE_UNICODE_ERROR) from err
        except IOError as err:
            self.log.error("Failed to load npy %s or %s.", left, right)
            raise ParseException(ParseException.PARSE_LOAD_NPY_ERROR) from err

        # save to txt
        if save_txt:
            self.util.save_npy_to_txt(left_data, left + ".txt")
            self.util.save_npy_to_txt(right_data, right + ".txt")
        # compare data
        total_cnt, all_close, cos_sim, err_percent = self._do_compare_data(
            left_data, right_data, rl, al, diff_count)
        content = ['Left:', ' ├─ NpyFile: %s' % left]
        if save_txt:
            content.append(' ├─ TxtFile: [green]%s.txt[/green]' % left)
        content.append(' └─ NpySpec: [yellow]%s[/yellow]' % self.util.gen_npy_info_txt(left_data))
        content.append('Right:')
        content.append(' ├─ NpyFile: %s' % right)
        if save_txt:
            content.append(' ├─ TxtFile: [green]%s.txt[/green]' % right)
        content.append(' └─ NpySpec: [yellow]%s[/yellow]' % self.util.gen_npy_info_txt(right_data))
        content.append('NumCnt: %s' % total_cnt)
        content.append('AllClose: %s' % all_close)
        content.append('CosSim: %s' % cos_sim)
        content.append('ErrorPer: %s (rl= %s, al= %s)' % (err_percent, rl, al))
        self.util.print_panel("\n".join(content))

    def _do_compare_data(self, left, right, rl=0.001, al=0.001, diff_count=20):
        """Compare two arrays element-wise after flattening them to float32.

        The shorter array is zero-padded when the element counts differ.

        Returns:
            A tuple ``(total_cnt, all_close, cos_sim, err_percent)`` where
            ``err_percent`` is the fraction of elements whose absolute
            difference exceeds ``al + rl * abs(right)``. ``cos_sim`` is NaN when
            either operand has zero norm, and ``err_percent`` is ``0.0`` for
            empty input.
        """
        data_left = left.astype(np.float32)
        data_right = right.astype(np.float32)
        if data_left.shape != data_right.shape:
            self.log.warning("Data shape not equal: %s vs %s", data_left.shape, data_right.shape)
        data_left = data_left.reshape(-1)
        data_right = data_right.reshape(-1)
        if data_left.shape[0] != data_right.shape[0]:
            self.log.warning("Data size not equal: %s vs %s", data_left.shape, data_right.shape)
            if data_left.shape[0] < data_right.shape[0]:
                data_left = np.pad(data_left, (0, data_right.shape[0] - data_left.shape[0]), 'constant')
            else:
                data_right = np.pad(data_right, (0, data_left.shape[0] - data_right.shape[0]), 'constant')
        all_close = np.allclose(data_left, data_right, atol=al, rtol=rl)
        norm_product = np.sqrt(np.dot(data_left, data_left)) * np.sqrt(np.dot(data_right, data_right))
        # A zero norm (all-zero or empty operand) makes the cosine undefined;
        # report NaN directly instead of performing a 0/0 division.
        cos_sim = np.dot(data_left, data_right) / norm_product if norm_product else np.nan
        err_cnt = 0
        total_cnt = data_left.shape[0]
        diff_table_columns = ['Index', 'Left', 'Right', 'Diff']
        err_table = self.util.create_table("Error Item Table", diff_table_columns)
        top_table = self.util.create_table("Top Item Table", diff_table_columns)
        for i in range(total_cnt):
            abs_diff = abs(data_left[i] - data_right[i])
            if i < diff_count:
                top_table.add_row(str(i), str(data_left[i]), str(data_right[i]), str(abs_diff))
            if abs_diff > (al + rl * abs(data_right[i])):
                # Every violation is counted, but only the first ``diff_count``
                # are listed in the table.
                if err_cnt < diff_count:
                    err_table.add_row(str(i), str(data_left[i]), str(data_right[i]), str(abs_diff))
                err_cnt += 1
        # Empty input has no failing elements; avoid dividing by zero.
        err_percent = float(err_cnt / total_cnt) if total_cnt else 0.0
        self.util.print(self.util.create_columns([err_table, top_table]))
        return total_cnt, all_close, cos_sim, err_percent
class Const:
    """Paths, patterns and limits shared by the parse tool."""

    # Default msaccucmp script location; overwritten at runtime when the user
    # supplies an explicit path.
    MS_ACCU_CMP_PATH = '/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py'
    ROOT_DIR = ""
    LOG_LEVEL = "NOTSET"
    # With the empty ROOT_DIR these resolve relative to the working directory.
    DATA_ROOT_DIR = os.path.join(ROOT_DIR, 'parse_data')
    DUMP_CONVERT_DIR = os.path.join(DATA_ROOT_DIR, 'dump_convert')
    COMPARE_DIR = os.path.join(DATA_ROOT_DIR, 'compare_result')
    # File-name pattern of converted dump files (dot-separated fields ending in .npy).
    OFFLINE_DUMP_CONVERT_PATTERN = \
        r"^([A-Za-z0-9_-]+)\.([A-Za-z0-9_-]+)\.([0-9]+)(\.[0-9]+)?\.([0-9]{1,255})" \
        r"\.([a-z]+)\.([0-9]{1,255})(\.[x0-9]+)?\.npy$"
    NUMPY_PATTERN = r".*\.npy$"
    NPY_SUFFIX = ".npy"
    PKL_SUFFIX = ".pkl"
    DIRECTORY_LENGTH = 4096
    FILE_NAME_LENGTH = 255
    FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$'
    ONE_GB = 1 * 1024 * 1024 * 1024
    TEN_GB = 10 * 1024 * 1024 * 1024
    # NOTE(review): the banner's exact spacing was reconstructed from a
    # whitespace-collapsed copy - confirm against the original file.
    HEADER = r"""    ____
       / __ \____ ______________
      / /_/ / __ `/ ___/ ___/ _ \
     / ____/ /_/ / /  (__  )  __/
    /_/    \__,_/_/  /____/\___/

    """


class FileDesc(object):
    """Describe a file by name, directory, joined path and timestamp."""

    def __init__(self, file_name, dir_path, timestamp=-1):
        """Store the file location.

        Args:
            file_name: Base name of the file.
            dir_path: Directory containing the file.
            timestamp: Timestamp to record; ``-1`` means "read the
                modification time of the file from disk".
        """
        self.file_name = file_name
        self.dir_path = dir_path
        self.path = os.path.join(dir_path, file_name)
        self.timestamp = timestamp
        self.idx = 0
        if self.timestamp == -1:
            self.timestamp = os.path.getmtime(self.path)


class NpuDumpFileDesc(FileDesc):
    """A ``FileDesc`` carrying operator name/type, task id and stream id."""

    def __init__(self, file_name, dir_path, timestamp, op_name, op_type, task_id, stream_id=0):
        """Store the operator metadata.

        ``stream_id`` is coerced with ``int``; ``None`` is treated as ``0``.
        ``idx`` is set to the last component of ``dir_path``.
        """
        super().__init__(file_name, dir_path, timestamp)
        self.op_name = op_name
        self.op_type = op_type
        self.task_id = task_id
        self.stream_id = 0 if stream_id is None else int(stream_id)
        # Drop trailing separators first: a plain split on the separator would
        # yield an empty last component for a path such as "dump/3/".
        trimmed = dir_path.rstrip(os.sep + (os.altsep or ""))
        self.idx = os.path.basename(trimmed)


class DumpDecodeFileDesc(NpuDumpFileDesc):
    """An ``NpuDumpFileDesc`` tagged with an anchor type and anchor index."""

    def __init__(self, file_name, dir_path, timestamp, op_name, op_type, task_id, anchor_type, anchor_idx):
        """Store the anchor data; ``idx`` is overridden with ``anchor_idx``."""
        super().__init__(file_name, dir_path, timestamp, op_name, op_type, task_id)
        self.type = anchor_type
        self.idx = anchor_idx
class InteractiveCli(cmd.Cmd):
    """Interactive ``Parse >>>`` shell built on ``cmd.Cmd``.

    The short commands (``vc``, ``dc``, ``pt``, ``pk``, ``cn``) forward their
    argument string, split on whitespace, to the matching ``ParseTool``
    method. ``run`` and any unrecognised input are handed to
    ``Util.execute_command``.
    """

    def __init__(self):
        super().__init__()
        self.prompt = "Parse >>> "
        self.parse_tool = ParseTool()
        self.util = Util()
        self.util.print_panel(Const.HEADER)
        self._prepare()

    @staticmethod
    def _parse_argv(line, insert=None):
        """Split ``line`` into an argv list.

        When ``insert`` is given, the list is non-empty and does not already
        start with it, ``insert`` is prepended. A line containing ``-h`` is
        returned unchanged.
        """
        argv = line.split() if line != "" else []
        if "-h" in argv:
            return argv
        if insert is not None and argv and argv[0] != insert:
            argv.insert(0, insert)
        return argv

    def _prepare(self):
        self.parse_tool.prepare()

    @catch_exception
    def default(self, line=""):
        """Pass input that matches no ``do_*`` command to ``execute_command``."""
        self.util.execute_command(line)
        return False

    @catch_exception
    def do_run(self, line=""):
        """Pass ``line`` to ``execute_command``."""
        self.util.execute_command(line)

    def do_EOF(self, line=""):
        """Leave the command loop on end-of-input (Ctrl-D or exhausted stdin).

        ``cmd.Cmd`` reports end-of-file as the literal command ``EOF``. Without
        this handler it would fall through to ``default`` and be executed as
        a command on every read, so the loop could never terminate.
        """
        print()
        return True

    def do_vc(self, line=""):
        """Forward to ``ParseTool.do_vector_compare``."""
        self.parse_tool.do_vector_compare(self._parse_argv(line))

    def do_dc(self, line=""):
        """Forward to ``ParseTool.do_convert_dump``."""
        self.parse_tool.do_convert_dump(self._parse_argv(line))

    def do_pt(self, line=""):
        """Forward to ``ParseTool.do_print_data``."""
        self.parse_tool.do_print_data(self._parse_argv(line))

    def do_pk(self, line=""):
        """Forward to ``ParseTool.do_parse_pkl``."""
        self.parse_tool.do_parse_pkl(self._parse_argv(line))

    def do_cn(self, line=''):
        """Forward to ``ParseTool.do_compare_data``."""
        self.parse_tool.do_compare_data(self._parse_argv(line))
class ParseException(Exception):
    """Error raised by the parse tool, identified by a numeric ``code``."""

    PARSE_INVALID_PATH_ERROR = 0
    PARSE_NO_FILE_ERROR = 1
    PARSE_NO_MODULE_ERROR = 2
    PARSE_INVALID_DATA_ERROR = 3
    PARSE_INVALID_FILE_FORMAT_ERROR = 4
    PARSE_UNICODE_ERROR = 5
    PARSE_JSONDECODE_ERROR = 6
    PARSE_MSACCUCMP_ERROR = 7
    PARSE_LOAD_NPY_ERROR = 8

    def __init__(self, code, error_info=""):
        """Create the exception.

        Args:
            code: One of the ``PARSE_*`` constants.
            error_info: Optional human-readable detail.
        """
        # Hand both values to the base class so that ``args`` mirrors the
        # constructor signature: copy/pickle rebuild the exception from
        # ``args``, and ``str(exc)`` is no longer empty.
        super().__init__(code, error_info)
        self.error_info = error_info
        self.code = code


def catch_exception(func):
    """Decorate ``func`` so that common failures are logged instead of raised.

    ``OSError``, ``ParseException`` and ``SystemExit`` raised by ``func`` are
    reported through the root logger and the wrapper then returns ``None``.
    Any other exception propagates unchanged.
    """
    import functools

    # Keep the wrapped function's name and docstring on the wrapper.
    @functools.wraps(func)
    def inner(*args, **kwargs):
        log = logging.getLogger()
        # For a ``(self, line)`` call the last positional argument is the
        # command line; it is only used in the "command not found" message.
        line = args[-1] if len(args) == 2 else ""
        try:
            return func(*args, **kwargs)
        except OSError:
            log.error("%s: command not found", line)
        except ParseException:
            log.error("Command execution failed")
        except SystemExit:
            log.warning("Please enter the correct command")
        return None
    return inner
class ParseTool:
    """Argument-parsing front end for the parse commands.

    Each ``do_*`` method builds an ``argparse`` parser for one command,
    validates the supplied paths through ``Util`` and delegates the work to
    ``Compare`` or ``Visualization``. Every entry point is wrapped with
    ``catch_exception``.
    """

    def __init__(self):
        self.util = Util()
        self.compare = Compare()
        self.visual = Visualization()

    def _apply_ascend_path(self, ascend_path):
        """Switch to a user-supplied msaccucmp location, if one was given."""
        if not ascend_path:
            return
        Const.MS_ACCU_CMP_PATH = self.util.path_strip(ascend_path)
        self.util.check_path_valid(Const.MS_ACCU_CMP_PATH)
        # Compare keeps the resolved script in ``msaccucmp`` after first use;
        # clear it so the new location is picked up by the next command.
        self.compare.msaccucmp = None

    @catch_exception
    def prepare(self):
        """Create the tool's data root directory."""
        self.util.create_dir(Const.DATA_ROOT_DIR)

    @catch_exception
    def do_vector_compare(self, argv=None):
        """Compare two dump directories (``-m`` against ``-g``).

        Raises:
            ParseException: If either path is not a directory.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "-m", "--my_dump_path", dest="my_dump_path", default=None,
            help=" my dump path, the data compared with golden data",
            required=True
        )
        parser.add_argument(
            "-g", "--golden_dump_path", dest="golden_dump_path", default=None,
            help=" the golden dump data path",
            required=True
        )
        parser.add_argument(
            "-out", "--output_path", dest="output_path", default=None,
            help=" the output path",
            required=False
        )
        parser.add_argument(
            "-asc", "--ascend_path", dest="ascend_path", default=None,
            help=" the Ascend home path",
            required=False
        )
        args = parser.parse_args(argv)
        result_dir = args.output_path if args.output_path else Const.COMPARE_DIR
        my_dump_path = args.my_dump_path
        golden_dump_path = args.golden_dump_path
        self.util.check_path_valid(my_dump_path)
        self.util.check_path_valid(golden_dump_path)
        self.util.check_files_in_path(my_dump_path)
        self.util.check_files_in_path(golden_dump_path)
        if not os.path.isdir(my_dump_path) or not os.path.isdir(golden_dump_path):
            self.util.log.error("Please enter a directory not a file")
            raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR)
        self._apply_ascend_path(args.ascend_path)
        self.compare.npu_vs_npu_compare(my_dump_path, golden_dump_path, result_dir)

    @catch_exception
    def do_convert_dump(self, argv=None):
        """Convert a dump file, or a directory of dump files, via ``Compare``."""
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '-n', '--name', dest='path', default=None, required=True, help='dump file or dump file directory')
        parser.add_argument(
            '-f', '--format', dest='format', default=None, required=False, help='target format')
        parser.add_argument(
            '-out', '--output_path', dest='output_path', required=False, default=None, help='output path')
        parser.add_argument(
            "-asc", "--ascend_path", dest="ascend_path", default=None, help=" the Ascend home path",
            required=False)
        args = parser.parse_args(argv)
        self.util.check_path_valid(args.path)
        self.util.check_files_in_path(args.path)
        self._apply_ascend_path(args.ascend_path)
        self.compare.convert_dump_to_npy(args.path, args.format, args.output_path)

    @catch_exception
    def do_print_data(self, argv=None):
        """print tensor data"""
        parser = argparse.ArgumentParser()
        parser.add_argument('-n', '--name', dest='path', default=None, required=True, help='File name')
        args = parser.parse_args(argv)
        self.visual.print_npy_data(args.path)

    @catch_exception
    def do_parse_pkl(self, argv=None):
        """Show the entry for one API name from a PKL file."""
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '-f', '--file', dest='file_name', default=None, required=True, help='PKL file path')
        parser.add_argument(
            '-n', '--name', dest='api_name', default=None, required=True, help='API name')
        args = parser.parse_args(argv)
        self.visual.parse_pkl(args.file_name, args.api_name)

    @catch_exception
    def do_compare_data(self, argv):
        """compare two tensor"""
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "-m", "--my_dump_path", dest="my_dump_path", default=None,
            help=" my dump path, the data compared with golden data",
            required=True
        )
        parser.add_argument(
            "-g", "--golden_dump_path", dest="golden_dump_path", default=None,
            help=" the golden dump data path",
            required=True
        )
        parser.add_argument('-p', '--print', dest='count', default=20, type=int, help='print err data num')
        parser.add_argument('-s', '--save', dest='save', action='store_true', help='save data in txt format')
        # Help strings now describe the option they belong to (they were swapped).
        parser.add_argument('-al', '--atol', dest='atol', default=0.001, type=float, help='set atol')
        parser.add_argument('-rl', '--rtol', dest='rtol', default=0.001, type=float, help='set rtol')
        args = parser.parse_args(argv)
        self.util.check_path_valid(args.my_dump_path)
        self.util.check_path_valid(args.golden_dump_path)
        self.util.check_path_format(args.my_dump_path, Const.NPY_SUFFIX)
        self.util.check_path_format(args.golden_dump_path, Const.NPY_SUFFIX)
        # compare_data takes the relative tolerance before the absolute one.
        self.compare.compare_data(
            args.my_dump_path, args.golden_dump_path, args.save, args.rtol, args.atol, args.count)
class Util:
    """Logging, path validation and numpy helpers shared by the parse tool.

    The class relies on module-level names set up next to the imports:
    ``rich_print``, ``Panel``, ``Table`` and ``Columns`` are the ``rich``
    objects, or ``None`` when ``rich`` could not be imported.
    """

    def __init__(self):
        self.ms_accu_cmp = None
        logging.basicConfig(
            level=Const.LOG_LEVEL,
            format="%(asctime)s (%(process)d) -[%(levelname)s]%(message)s",
            datefmt="%Y-%m-%d %H:%M:%S"
        )
        self.log = logging.getLogger()
        self.python = sys.executable

    @staticmethod
    def print(content):
        """Print *content* with ``rich`` when available, else with builtin ``print``."""
        if rich_print is None:
            # rich is optional; calling None used to raise TypeError here.
            print(content)
            return
        rich_print(content)

    @staticmethod
    def path_strip(path):
        """Return *path* without surrounding single, then double, quotes."""
        return path.strip("'").strip('"')

    @staticmethod
    def _gen_npu_dump_convert_file_info(name, match, dir_path):
        """Build a ``DumpDecodeFileDesc`` from the groups of a regex *match*."""
        return DumpDecodeFileDesc(name, dir_path, int(match.groups()[-4]), op_name=match.group(2),
                                  op_type=match.group(1), task_id=int(match.group(3)),
                                  anchor_type=match.groups()[-3], anchor_idx=int(match.groups()[-2]))

    @staticmethod
    def _gen_numpy_file_info(name, math, dir_path):
        """Build a plain ``FileDesc``; the regex match (``math``) is unused."""
        return FileDesc(name, dir_path)

    def execute_command(self, cmd):
        """Run *cmd* (a whitespace separated command string) without a shell.

        Returns:
            The process return code, or ``-1`` when *cmd* is empty or blank.
        """
        if not cmd or not cmd.strip():
            self.log.error("Command is None")
            return -1
        self.log.debug("[RUN CMD]: %s", cmd)
        # split() without an argument drops the empty tokens that repeated
        # blanks would otherwise turn into empty command-line arguments.
        complete_process = subprocess.run(cmd.split(), shell=False)
        return complete_process.returncode

    def print_panel(self, content, title='', fit=True):
        """Print *content* inside a ``rich`` panel, or plainly without ``rich``."""
        if not Panel:
            print(content)
            return
        if fit:
            self.print(Panel.fit(content, title=title))
        else:
            self.print(Panel(content, title=title))

    def check_msaccucmp(self, target_file):
        """Check that ``<python> target_file --help`` exits with status 0.

        Returns:
            *target_file* unchanged when the check succeeds.

        Raises:
            ParseException: ``PARSE_MSACCUCMP_ERROR`` when the script fails.
        """
        self.log.info("Try to auto detect file with name: %s.", target_file)
        result = subprocess.run(
            [self.python, target_file, "--help"], stdout=subprocess.PIPE)
        if result.returncode != 0:
            self.log.error("Check msaccucmp failed in dir %s", target_file)
            self.log.error("Please specify a valid msaccucmp.py path or install the cann package")
            raise ParseException(ParseException.PARSE_MSACCUCMP_ERROR)
        self.log.info("Check [%s] success.", target_file)
        return target_file

    def create_dir(self, path):
        """Create *path* (quotes stripped) with mode ``0o750`` unless it exists.

        Raises:
            ParseException: ``PARSE_INVALID_PATH_ERROR`` when creation fails.
        """
        path = self.path_strip(path)
        if os.path.exists(path):
            return
        try:
            os.makedirs(path, mode=0o750)
        except OSError as e:
            self.log.error("Failed to create %s. %s", path, str(e))
            raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR) from e

    def gen_npy_info_txt(self, source_data):
        """Return a one-line shape/dtype/max/min/mean summary of an ndarray."""
        shape, dtype, max_data, min_data, mean = self.npy_info(source_data)
        return '[Shape: %s] [Dtype: %s] [Max: %s] [Min: %s] [Mean: %s]' % (
            shape, dtype, max_data, min_data, mean)

    def save_npy_to_txt(self, data, dst_file='', align=0):
        """Write *data* to *dst_file* as space separated text.

        Args:
            data: ndarray to store; it is flattened first.
            dst_file: target path; an existing file is left untouched.
            align: number of values per text row.  ``0`` means "use the
                size of the last axis".  For any other value the data is
                zero padded up to a multiple of *align*.
        """
        if os.path.exists(dst_file):
            self.log.info("Dst file %s exists, will not save new one.", dst_file)
            return
        shape = data.shape
        data = data.flatten()
        if align == 0:
            # `or 1` covers a zero-length last axis, for which
            # reshape((-1, 0)) would raise ValueError; 0-d arrays use 1 too.
            align = (shape[-1] if shape else 1) or 1
        elif data.size % align != 0:
            pad_array = np.zeros((align - data.size % align,))
            data = np.append(data, pad_array)
        np.savetxt(dst_file, data.reshape((-1, align)), delimiter=' ', fmt='%g')

    def list_convert_files(self, path, external_pattern=""):
        """Map file name -> ``DumpDecodeFileDesc`` for converted dump files under *path*."""
        return self._list_file_with_pattern(
            path, Const.OFFLINE_DUMP_CONVERT_PATTERN, external_pattern, self._gen_npu_dump_convert_file_info
        )

    def list_numpy_files(self, path, extern_pattern=''):
        """Map file name -> ``FileDesc`` for numpy files under *path*."""
        return self._list_file_with_pattern(path, Const.NUMPY_PATTERN, extern_pattern,
                                            self._gen_numpy_file_info)

    def create_columns(self, content):
        """Wrap *content* in ``rich.columns.Columns``.

        Raises:
            ParseException: ``PARSE_NO_MODULE_ERROR`` when ``rich`` is missing.
        """
        if not Columns:
            self.log.error("No Module named rich, please install it")
            raise ParseException(ParseException.PARSE_NO_MODULE_ERROR)
        return Columns(content)

    def create_table(self, title, columns):
        """Create a ``rich`` table with one folding column per name in *columns*.

        Raises:
            ParseException: ``PARSE_NO_MODULE_ERROR`` when ``rich`` is missing.
        """
        if not Table:
            self.log.error("No Module named rich, please install it and restart parse tool")
            raise ParseException(ParseException.PARSE_NO_MODULE_ERROR)
        table = Table(title=title)
        for column_name in columns:
            table.add_column(column_name, overflow='fold')
        return table

    def check_path_valid(self, path):
        """Validate an input path.

        The path must exist, must not be a symbolic link, must respect the
        length limits and the character whitelist from ``Const``, and pkl /
        npy files must not exceed 1GB / 10GB respectively.

        Raises:
            ParseException: ``PARSE_INVALID_PATH_ERROR`` on any violation.
        """
        path = self.path_strip(path)
        if not path or not os.path.exists(path):
            self.log.error("The path %s does not exist.", path)
            raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR)
        if os.path.islink(path):
            self.log.error('The file path %s is a soft link.', path)
            raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR)
        real_path = os.path.realpath(path)
        if len(real_path) > Const.DIRECTORY_LENGTH or len(os.path.basename(path)) > Const.FILE_NAME_LENGTH:
            self.log.error('The file path length exceeds limit.')
            raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR)
        if not re.match(Const.FILE_PATTERN, real_path):
            self.log.error('The file path %s contains special characters.', path)
            raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR)
        if os.path.isfile(path):
            file_size = os.path.getsize(path)
            if path.endswith(Const.PKL_SUFFIX) and file_size > Const.ONE_GB:
                self.log.error('The file %s size is greater than 1GB.', path)
                raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR)
            if path.endswith(Const.NPY_SUFFIX) and file_size > Const.TEN_GB:
                self.log.error('The file %s size is greater than 10GB.', path)
                raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR)

    def check_files_in_path(self, path):
        """Raise ``ParseException`` when *path* is an empty directory."""
        if os.path.isdir(path) and len(os.listdir(path)) == 0:
            self.log.error("No files in %s.", path)
            raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR)

    def npy_info(self, source_data):
        """Return ``(shape, dtype, max, min, mean)`` of an ndarray.

        Raises:
            ParseException: ``PARSE_INVALID_DATA_ERROR`` when *source_data*
                is not an ndarray, has object dtype, or is empty.
        """
        if not isinstance(source_data, np.ndarray):
            self.log.error("Invalid data, data is not ndarray")
            raise ParseException(ParseException.PARSE_INVALID_DATA_ERROR)
        data = source_data
        if data.dtype == 'object':
            self.log.error("Invalid data, data is object.")
            raise ParseException(ParseException.PARSE_INVALID_DATA_ERROR)
        if np.size(data) == 0:
            self.log.error("Invalid data, data is empty")
            raise ParseException(ParseException.PARSE_INVALID_DATA_ERROR)
        return data.shape, data.dtype, data.max(), data.min(), data.mean()

    def _list_file_with_pattern(self, path, pattern, extern_pattern, gen_info_func):
        """Walk *path* and describe every file whose name matches *pattern*.

        Args:
            path: root directory; validated with ``check_path_valid`` first.
            pattern: regex a file name must match (``re.match`` semantics).
            extern_pattern: optional extra regex; ``''`` disables it.
            gen_info_func: ``f(name, match, dir_path)`` building the entry.

        Returns:
            Dict keyed by bare file name, so equal names found in different
            sub-directories overwrite each other.
        """
        self.check_path_valid(path)
        file_list = {}
        re_pattern = re.compile(pattern)
        # Compile the optional filter once instead of once per file.
        extern_re = re.compile(extern_pattern) if extern_pattern != '' else None
        # NOTE(review): followlinks=True descends into linked directories even
        # though check_path_valid rejects a symlinked root -- confirm intended.
        for dir_path, _, file_names in os.walk(path, followlinks=True):
            for name in file_names:
                match = re_pattern.match(name)
                if not match:
                    continue
                if extern_re is not None and not extern_re.match(name):
                    continue
                file_list[name] = gen_info_func(name, match, dir_path)
        return file_list

    def check_path_format(self, path, suffix):
        """Require *path* to be an existing regular file ending with *suffix*.

        Raises:
            ParseException: ``PARSE_INVALID_FILE_FORMAT_ERROR`` for a wrong
                suffix, ``PARSE_INVALID_PATH_ERROR`` for a directory or any
                other non-file path.
        """
        if os.path.isfile(path):
            if not path.endswith(suffix):
                self.log.error("%s is not a %s file.", path, suffix)
                raise ParseException(ParseException.PARSE_INVALID_FILE_FORMAT_ERROR)
        elif os.path.isdir(path):
            self.log.error("Please specify a single file path")
            raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR)
        else:
            self.log.error("The file path %s is invalid", path)
            raise ParseException(ParseException.PARSE_INVALID_PATH_ERROR)
class Visualization:
    """Console output helpers for ``.npy`` tensors and pkl dump records."""

    def __init__(self):
        self.util = Util()

    def print_npy_summary(self, target_file):
        """Print a preview of an ``.npy`` file and save it as ``<file>.txt``.

        At most the first 16 rows of 8 flattened values are shown, next to
        the summary line produced by ``Util.gen_npy_info_txt``.

        Raises:
            ParseException: ``PARSE_UNICODE_ERROR`` when loading the file
                raises ``UnicodeError``.
        """
        try:
            # NOTE(review): allow_pickle=True can execute arbitrary code when
            # the file comes from an untrusted source; consider disabling it.
            np_data = np.load(target_file, allow_pickle=True)
        except UnicodeError as e:
            self.util.log.error("%s %s" % ("UnicodeError", str(e)))
            self.util.log.warning("Please check the npy file")
            raise ParseException(ParseException.PARSE_UNICODE_ERROR) from e
        table = self.util.create_table('', ['Index', 'Data'])
        flatten_data = np_data.flatten()
        row_count = min(16, int(np.ceil(flatten_data.size / 8)))
        for row in range(row_count):
            first_idx = row * 8
            last_idx = min(flatten_data.size, first_idx + 8)
            table.add_row(str(first_idx), ' '.join(flatten_data[first_idx: last_idx].astype('str').tolist()))
        # The text file name is computed once so that the path shown to the
        # user and the path written to cannot diverge (the save call used to
        # drop the dot before "txt").
        text_file = "%s.txt" % target_file
        summary = ['[yellow]%s[/yellow]' % self.util.gen_npy_info_txt(np_data), 'Path: %s' % target_file,
                   "TextFile: %s" % text_file]
        self.util.print_panel(self.util.create_columns([table, "\n".join(summary)]), target_file)
        self.util.save_npy_to_txt(np_data, text_file)

    def print_npy_data(self, file_name):
        """Validate *file_name* as an ``.npy`` path and print its summary."""
        file_name = self.util.path_strip(file_name)
        self.util.check_path_valid(file_name)
        self.util.check_path_format(file_name, Const.NPY_SUFFIX)
        return self.print_npy_summary(file_name)

    def parse_pkl(self, path, api_name):
        """Print the stack and statistic records of one API from a pkl file.

        The file is read as text with one JSON document per line.  Records
        whose first element starts with *api_name* are printed: two-element
        ``stack_info`` records as a traceback, records with more than five
        elements as a statistics line.

        Raises:
            ParseException: ``PARSE_JSONDECODE_ERROR`` when a line is not
                valid JSON.
        """
        path = self.util.path_strip(path)
        self.util.check_path_valid(path)
        self.util.check_path_format(path, Const.PKL_SUFFIX)
        with open(path, "r") as pkl_handle:
            title_printed = False
            for pkl_line in pkl_handle:
                if pkl_line == '\n':
                    continue
                try:
                    msg = json.loads(pkl_line)
                except json.JSONDecodeError as e:
                    self.util.log.error("%s %s in line %s" % ("JSONDecodeError", str(e), pkl_line))
                    self.util.log.warning("Please check the pkl file")
                    raise ParseException(ParseException.PARSE_JSONDECODE_ERROR) from e
                # Valid JSON that is not a non-empty list led by a string
                # would crash on msg[0] / startswith below; skip it instead.
                if not isinstance(msg, list) or not msg or not isinstance(msg[0], str):
                    self.util.log.warning("Skip unexpected pkl record: %s", pkl_line.strip())
                    continue
                info_prefix = msg[0]
                if not info_prefix.startswith(api_name):
                    continue
                if info_prefix.find("stack_info") != -1 and len(msg) == 2:
                    print("\nTrace back({}):".format(msg[0]))
                    if msg[1] and len(msg[1]) > 4:
                        for item in reversed(msg[1]):
                            print("  File \"{}\", line {}, in {}".format(item[0], item[1], item[2]))
                            print("    {}".format(item[3]))
                    continue
                if len(msg) > 5:
                    summary_info = "  [{}][dtype: {}][shape: {}][max: {}][min: {}][mean: {}]" \
                        .format(msg[0], msg[3], msg[4], msg[5][0], msg[5][1], msg[5][2])
                    if not title_printed:
                        print("\nStatistic Info:")
                        title_printed = True
                    print(summary_info)
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py index 0657c7b7cc3..8bfad1195e6 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -25,7 +25,7 @@ class DumpUtil(object): dump_init_enable = False dump_api_list = [] dump_filter_switch = None - dump_mode = ['all'] + dump_mode = ['forward', 'backward', 'input', 'output'] backward_input = {} dump_dir_tag = 'ptdbg_dump' dump_config = None -- Gitee From dfc3a4ff722d695010d287a087a1ca9c958960e6 Mon Sep 17 00:00:00 2001 From: s30048155 Date: Tue, 17 Oct 2023 16:07:43 +0800 Subject: [PATCH 12/16] update --- .../src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml index db2f2475e22..725433d4899 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml @@ -145,8 +145,8 @@ tensor: - __div__ - __eq__ - __ge__ - - __getitem__ - __gt__ + - __getitem__ - __iadd__ - __iand__ - __idiv__ -- Gitee From 5abbe1b5500370d61676165c14361d1dccf3dc2c Mon Sep 17 00:00:00 2001 From: s30048155 Date: Tue, 17 Oct 2023 17:30:32 +0800 Subject: [PATCH 13/16] update yaml --- .../python/ptdbg_ascend/hook_module/support_wrap_ops.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml index 725433d4899..b0a4bd3cec3 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml 
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml @@ -1054,9 +1054,9 @@ torch_npu: - npu_sign_bits_unpack - npu_flash_attention - npu_scaled_masked_softmax - - torch_npu.npu_rotary_mul - - torch_npu.npu_roi_align - - torch_npu.npu_roi_alignbk + - npu_rotary_mul + - npu_roi_align + - npu_roi_alignbk - npu_ptiou distributed: -- Gitee From 5f88c3e8aa54062e475f60224db300b6623eb5e3 Mon Sep 17 00:00:00 2001 From: s30048155 Date: Tue, 17 Oct 2023 17:44:44 +0800 Subject: [PATCH 14/16] update FAQ --- debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md index 87db2b79c54..28ef50963e7 100644 --- a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md +++ b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md @@ -134,18 +134,18 @@ compare(dump_result_param, "./output", stack_mode=True) - matmul期望的输入是二维,当输入不是二维时,会将输入通过view操作展成二维,再进行matmul运算,因此在反向求导时,backward_hook能拿到的是UnsafeViewBackward这步操作里面数据的梯度信息,取不到MmBackward这步操作里面数据的梯度信息,即权重的反向梯度数据。 - 典型的例子有,当linear的输入不是二维,且无bias时,会调用output = input.matmul(weight.t()),因此拿不到linear层的weight的反向梯度数据。 -### 13. 使用dataloader后raise异常Exception: ptdbg: exit after iteration [x, x, x] +### 13. pkl文件中的某些api的dtype类型为float16,但是读取此api的npy文件显示的dtype类型为float32 + +- ptdbg工具在dump数据时需要将原始数据从npu to cpu上再转换为numpy类型,npu to cpu的逻辑和gpu to cpu是保持一致的,都存在dtype可能从float16变为float32类型的情况,如果出现dtype不一致的问题,最终dump数据的dtype以pkl文件为准。 + +### 14. 使用dataloader后raise异常Exception: ptdbg: exit after iteration [x, x, x] - 正常现象,dataloader通过raise结束程序,堆栈信息可忽略。 -### 14. 工具报错:AssertionError: Please register hooks to nn.Module +### 15. 工具报错:AssertionError: Please register hooks to nn.Module - 请在model示例化之后配置register hook。 -### 15. 添加ptdbg_ascend工具后截取操作报错:IndexError: too many indices for tensor of dimension x 类似的报错。 +### 16. 
添加ptdbg_ascend工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。 删除ptdbg_ascend工具的hook_module目录下yaml文件中Tensor:下的`- __getitem__`即可。 - -### 16. pkl文件中的某些api的dtype类型为float16,但是读取此api的npy文件显示的dtype类型为float32 - -- ptdbg工具在dump数据时需要将原始数据从npu to cpu上再转换为numpy类型,npu to cpu的逻辑和gpu to cpu是保持一致的,都存在dtype可能从float16变为float32类型的情况,如果出现dtype不一致的问题,最终dump数据的dtype以pkl文件为准。 -- Gitee From 8dac98259233b9e8b60cc4292552205f1e20317d Mon Sep 17 00:00:00 2001 From: s30048155 Date: Tue, 17 Oct 2023 17:46:44 +0800 Subject: [PATCH 15/16] update FAQ --- debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md index 28ef50963e7..7115a576a69 100644 --- a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md +++ b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md @@ -7,7 +7,7 @@ __version__ = '3.4' ``` ### 2.dump指定融合算子 -dump指定操作当前支持dump指定融合算子的输入输出,需要在att/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml中添加,比如以下算子: +dump指定操作当前支持dump指定融合算子的输入输出,需要在att/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml中添加,比如以下代码段调用的softmax融合算子 ``` def npu_forward_fused_softmax(self, input_, mask): resl = torch_npu.npu_scaled_masked_softmax(input_, mask, self.scale, False) @@ -17,7 +17,7 @@ def npu_forward_fused_softmax(self, input_, mask): ``` - npu_scaled_masked_softmax ``` -(仅作举例用,已默认支持) +(npu_scaled_masked_softmax融合算子工具已支持dump,本例仅供参考) ## 常见问题 -- Gitee From aa23776d90e6610a56baa1caf41a15fe4fe9f238 Mon Sep 17 00:00:00 2001 From: s30048155 Date: Wed, 18 Oct 2023 11:55:21 +0800 Subject: [PATCH 16/16] remove parse_tool --- .../test/ut/parse_tool/test_compare.py | 37 --------------- .../test/ut/parse_tool/test_parse_tool.py | 12 ----- .../test/ut/parse_tool/test_util.py | 47 ------------------- .../test/ut/parse_tool/test_visualization.py | 36 
-------------- 4 files changed, 132 deletions(-) delete mode 100644 debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_compare.py delete mode 100644 debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_parse_tool.py delete mode 100644 debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_util.py delete mode 100644 debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_visualization.py diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_compare.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_compare.py deleted file mode 100644 index 43986d5233e..00000000000 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_compare.py +++ /dev/null @@ -1,37 +0,0 @@ -import unittest -import numpy as np -from ptdbg_ascend.parse_tool.lib.compare import Compare - -class TestCompare(unittest.TestCase): - def setUp(self): - self.compare = Compare() - - def test_call_msaccucmp(self): - result = self.compare.call_msaccucmp - self.assertIsNotNone(result) - - def test_npu_vs_npu_compare(self): - my_dump_path = 'path_to_my_dump' - golden_dump_path = 'path_to_golden_dump' - result_dir = 'path_to_result_dir' - self.compare.npu_vs_npu_compare(my_dump_path, golden_dump_path, result_dir) - - def test_compare_vector(self): - my_dump_path = 'path_to_my_dump' - golden_dump_path = 'path_to_golden_dump' - result_dir = 'path_to_result_dir' - result = self.compare.compare_vector(my_dump_path, golden_dump_path, result_dir) - self.assertIsNotNone(result) - - def test_convert_dump_to_npy(self): - dump_file = 'path_to_dump_file' - data_format = 'data_format' - output = './' - self.compare.convert_dump_to_npy(dump_file, data_format, output) - - def test_convert(self): - dump_file = 'path_to_dump_file' - data_format = 'data_format' - output = 'path_to_output' - result = self.compare.convert(dump_file, data_format, output) - self.assertIsNotNone(result) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_parse_tool.py 
b/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_parse_tool.py deleted file mode 100644 index cb57705c5bb..00000000000 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_parse_tool.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest -from unittest.mock import patch, MagicMock -from ptdbg_ascend.parse_tool.lib.parse_tool import ParseTool - -class TestParseTool(unittest.TestCase): - def setUp(self): - self.parse_tool = ParseTool() - - @patch('ptdbg_ascend.parse_tool.lib.parse_tool.Util.create_dir') - def test_prepare(self, mock_create_dir): - self.parse_tool.prepare() - mock_create_dir.assert_called_once() diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_util.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_util.py deleted file mode 100644 index c497ec60669..00000000000 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_util.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest -from unittest.mock import patch, MagicMock -import numpy as np -from ptdbg_ascend.parse_tool.lib import utils - -class TestUtils(unittest.TestCase): - def setUp(self): - self.util = utils.Util() - - def test_execute_command(self): - with patch('subprocess.run') as mocked_run: - mocked_run.return_value.returncode = 0 - result = self.util.execute_command('echo hello') - self.assertEqual(result, 0) - - def test_check_msaccucmp(self): - with patch('subprocess.run') as mocked_run: - mocked_run.return_value.returncode = 0 - result = self.util.check_msaccucmp('msaccucmp.py') - self.assertEqual(result, 'msaccucmp.py') - - def test_gen_npy_info_txt(self): - data = np.array([1, 2, 3]) - result = self.util.gen_npy_info_txt(data) - self.assertEqual(result, '[Shape: (3,)] [Dtype: int64] [Max: 3] [Min: 1] [Mean: 2.0]') - - def test_save_npy_to_txt(self): - data = np.array([1, 2, 3]) - with patch('numpy.savetxt') as mocked_savetxt: - self.util.save_npy_to_txt(data, 'test.txt') - mocked_savetxt.assert_called_once() - - def 
test_check_path_valid(self): - with patch('os.path.exists') as mocked_exists: - mocked_exists.return_value = True - self.util.check_path_valid('valid_path') - - def test_npy_info(self): - data = np.array([1, 2, 3]) - result = self.util.npy_info(data) - self.assertEqual(result, ((3,), np.dtype('int64'), 3, 1, 2.0)) - - def test_check_path_format(self): - with patch('os.path.isfile') as mocked_isfile: - mocked_isfile.return_value = True - self.util.check_path_format('file.txt', '.txt') - diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_visualization.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_visualization.py deleted file mode 100644 index 308f0e02164..00000000000 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/parse_tool/test_visualization.py +++ /dev/null @@ -1,36 +0,0 @@ -import unittest -import os -import numpy as np -from ptdbg_ascend.parse_tool.lib.visualization import Visualization - -class TestVisualization(unittest.TestCase): - def setUp(self): - self.visualization = Visualization() - - def test_print_npy_summary(self): - np.save('test.npy', np.array([1, 2, 3, 4, 5])) - try: - self.visualization.print_npy_summary('test.npy') - except Exception as e: - self.fail(f"print_npy_summary raised exception {e}") - - def test_print_npy_data(self): - np.save('test.npy', np.array([1, 2, 3, 4, 5])) - try: - self.visualization.print_npy_data('test.npy') - except Exception as e: - self.fail(f"print_npy_data raised exception {e}") - - def test_parse_pkl(self): - with open('test.pkl', 'w') as f: - f.write('["api_name", [], "", "", "", ["", "", ""]]') - try: - self.visualization.parse_pkl('test.pkl', 'api_name') - except Exception as e: - self.fail(f"parse_pkl raised exception {e}") - - def tearDown(self): - if os.path.exists('test.npy'): - os.remove('test.npy') - if os.path.exists('test.pkl'): - os.remove('test.pkl') \ No newline at end of file -- Gitee