diff --git a/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt b/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4b3ccd3a53136e229050b09ee57c1fe1c218f6c --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.5) +project(PtdbgAscend) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_SKIP_RPATH TRUE) + +if (NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/tools/PYTHON_BIN_PATH) + message(FATAL_ERROR "No validate configuration found. Did you forget to configure first?") +endif () + +file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/tools/PYTHON_BIN_PATH" PYTHON_BIN_PATH) + +add_custom_target(ptdbg_ascend ALL + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_LIST_DIR}/src/python ${CMAKE_BINARY_DIR}/ptdbg_ascend + COMMAND cd ${CMAKE_BINARY_DIR}/ptdbg_ascend && ${PYTHON_BIN_PATH} setup.py bdist_wheel + VERBATIM + ) + +install(CODE "execute_process(COMMAND ${PYTHON_BIN_PATH} -m pip install ${CMAKE_BINARY_DIR}/ptdbg_ascend/dist/ptdbg_ascend-3.2-py3-none-any.whl --upgrade)") diff --git a/debug/accuracy_tools/ptdbg_ascend/README.md b/debug/accuracy_tools/ptdbg_ascend/README.md new file mode 100644 index 0000000000000000000000000000000000000000..729c54f8ef5a48686860ccf2d375103ace1ba4d7 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/README.md @@ -0,0 +1,244 @@ +# **PyTorch精度工具** + +## 快速安装 + +进行PyTorch精度比对需要将ptdbg_ascend精度工具分别安装在CPU或GPU环境以及NPU环境下。 + +1. 
whl包获取。 + + 请通过下表链接下载ptdbg_ascend精度工具whl包,推荐下载最新版本。 + + | ptdbg_ascend版本 | 发布日期 | 支持PyTorch版本 | 下载链接 | 参考指南 | 校验码 | + | ---------------- | --------- | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | + | 3.2 | 2023-8-17 | 1.8.1/1.11.0/2.0/2.1 | [ptdbg_ascend-3.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/3.0/ptdbg_ascend-3.2-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v3.2](doc/ptdbg_ascend精度工具功能说明_v3.2.md) | 0116f66c7c893fc171bfa86e12ecfbf9cd062aedd176a0e67befb880b995f472 | + | 3.1 | 2023-8-02 | 1.8.1/1.11.0/2.0 | [ptdbg_ascend-3.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/3.0/ptdbg_ascend-3.1-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v3.1](doc/ptdbg_ascend精度工具功能说明_v3.1.md) | ef0dd5f96faf3576466545f082383eece409f25642a9bc4d0efc944969c1445a | + | 2.0 | 2023-7-07 | 1.8.1/1.11.0/2.0 | [ptdbg_ascend-2.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/2.0/ptdbg_ascend-2.0-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v2.0](doc/ptdbg_ascend精度工具功能说明_v2.0.md) | 85e046f133f0f40ed660337ce8207249b1dac47ac668910625bea49809f31d66 | + | 1.0 | 2023-3-30 | 1.8.1/1.11.0 | [ptdbg_ascend-1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/1.0/ptdbg_ascend-1.0-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v1.0](https://gitee.com/ascend/tools/blob/master/ptdbg_ascend/doc/ptdbg_ascend%E7%B2%BE%E5%BA%A6%E5%B7%A5%E5%85%B7%E5%8A%9F%E8%83%BD%E8%AF%B4%E6%98%8E_v1.0.md) | 0559e12ba7accf80d182f227698163ee0de88bf86b1e9cd9f33b16fdead14759 | + +2. whl包校验。 + + 1. 根据以上下载链接下载whl包到Linux安装环境。 + + 2. 
进入whl包所在目录,执行如下命令。 + + ``` + sha256sum {name}.whl + ``` + + {name}为whl包名称。 + + 若回显呈现对应版本whl包一致的**校验码**,则表示下载了正确的ptdbg_ascend精度工具whl安装包。示例如下: + + ``` + sha256sum ptdbg_ascend-3.1-py3-none-any.whl + ef0dd5f96faf3576466545f082383eece409f25642a9bc4d0efc944969c1445a ptdbg_ascend-3.1-py3-none-any.whl + ``` + +3. whl包安装。 + + 执行如下命令进行安装。 + + ```bash + pip3 install ./ptdbg_ascend-{version}-py3-none-any.whl + ``` + + 若为覆盖安装,请在命令行末尾增加“--force-reinstall”参数强制安装,例如: + + ```bash + pip3 install ./ptdbg_ascend-{version}-py3-none-any.whl --force-reinstall + ``` + + 提示如下信息则表示安装成功。 + + ```bash + Successfully installed ptdbg_ascend-{version} + ``` + +## **PyTorch精度工具简介** + +### 概述 + +在PyTorch训练网络,对同一模型或API调试过程中,遇到API相关的计算精度问题,定位时费时费力。 + +ptdbg_ascend为PyTorch精度工具,用来进行PyTorch整网API粒度的数据dump、精度比对和溢出检测,从而定位PyTorch训练场景下的精度问题。 + +**使用场景** + +主要的使用场景包括: + +- 同一模型,从CPU或GPU移植到NPU中存在精度下降问题,对比NPU芯片中的API计算数值与CPU或GPU芯片中的API计算数值,进行问题定位。 +- 同一模型,进行迭代(模型、框架版本升级或设备硬件升级)时存在的精度下降问题,对比相同模型在迭代前后版本的API计算数值,进行问题定位。 + +### 原理介绍 + +精度对比工具,通过在PyTorch模型中注册hook,跟踪计算图中API的前向传播与反向传播时的输入与输出,排查存在计算精度误差,进行问题的精准定位。 + +**精度比对流程** + +1. 当模型在CPU或GPU上进行正向和反向传播时,分别dump每一层的数值输入与输出。 + +2. 当模型在NPU中进行计算时,采用相同的方式dump下相应的数据。 + +3. 
通过对比dump出的数值,计算余弦相似度和最大绝对误差的方式,定位和排查NPU API存在的计算精度问题。如图1所示。 + + 图1:精度比对逻辑图 + + ![op_compare](figures/module_compare.png) + +**API匹配条件** + +进行精度比对时,需要判断CPU或GPU的API与NPU的API是否相同可比对,须满足以下匹配条件: + +- 两个API的名称相同,API命名规则:`{api_type}_{api_name}_{api调用次数}_{正反向}_{输入输出}.index`,如:Functional_conv2d_1_backward_input.0。 +- 两个API的输入输出Tensor数量和各个Tensor的Shape相同。 + +通常满足以上两个条件,ptdbg_ascend就认为是同一个API,成功进行API的匹配,后续进行相应的计算精度比对。 + +## **PyTorch精度工具安装** + +### 环境准备 + +- 通过pip安装环境依赖wheel、numpy、pandas(1.3.5及以上版本)和pyyaml。 +- ptdbg_ascend与PyTorch有严格的版本配套关系,使用工具前,您需要确保已经正确安装了PyTorch v1.8.1、PyTorch v1.11.0或PyTorch v2.0.0版本: + - CPU或GPU环境:请至[PyTorch官网](https://www.pytorch.org)下载并安装。 + - NPU环境:请参见《[CANN软件安装指南](https://www.hiascend.com/document/detail/zh/canncommercial/63RC1/envdeployment/instg/instg_000002.html)》“安装开发环境 > 在昇腾设备上安装 > 安装深度学习框架 > 安装PyTorch”章节进行安装。 + +### 安装 + +进行PyTorch精度比对需要将ptdbg_ascend精度工具分别安装在CPU或GPU环境以及NPU环境下。 + +ptdbg_ascend精度工具的安装方式包括:**下载whl包安装**和**源代码编译安装**。 + +#### 下载whl包安装 + +1. whl包获取。 + + 请通过下表链接下载ptdbg_ascend精度工具whl包,推荐下载最新版本。 + + | ptdbg_ascend版本 | 发布日期 | 支持PyTorch版本 | 下载链接 | 校验码 | 参考指南 | + | ---------------- | --------- | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | + | 3.2 | 2023-8-17 | 1.8.1/1.11.0/2.0/2.1 | [ptdbg_ascend-3.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/3.0/ptdbg_ascend-3.2-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v3.2](doc/ptdbg_ascend精度工具功能说明_v3.2.md) | 0116f66c7c893fc171bfa86e12ecfbf9cd062aedd176a0e67befb880b995f472 | + | 3.1 | 2023-8-02 | 1.8.1/1.11.0/2.0 | [ptdbg_ascend-3.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/3.0/ptdbg_ascend-3.1-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v3.1](doc/ptdbg_ascend精度工具功能说明_v3.1.md) | ef0dd5f96faf3576466545f082383eece409f25642a9bc4d0efc944969c1445a | + | 2.0 | 2023-7-07 | 
1.8.1/1.11.0/2.0 | [ptdbg_ascend-2.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/2.0/ptdbg_ascend-2.0-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v2.0](doc/ptdbg_ascend精度工具功能说明_v2.0.md) | 85e046f133f0f40ed660337ce8207249b1dac47ac668910625bea49809f31d66 | + | 1.0 | 2023-3-30 | 1.8.1/1.11.0 | [ptdbg_ascend-1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/1.0/ptdbg_ascend-1.0-py3-none-any.whl) | [ptdbg_ascend精度工具功能说明_v1.0](https://gitee.com/ascend/tools/blob/master/ptdbg_ascend/doc/ptdbg_ascend精度工具功能说明_v1.0.md) | 0559e12ba7accf80d182f227698163ee0de88bf86b1e9cd9f33b16fdead14759 | + +2. whl包校验。 + + 1. 根据以上下载链接下载whl包到Linux安装环境。 + + 2. 进入whl包所在目录,执行如下命令。 + + ``` + sha256sum {name}.whl + ``` + + {name}为whl包名称。 + + 若回显呈现对应版本whl包一致的**校验码**,则表示下载了正确的ptdbg_ascend精度工具whl安装包。示例如下: + + ``` + sha256sum ptdbg_ascend-3.1-py3-none-any.whl + ef0dd5f96faf3576466545f082383eece409f25642a9bc4d0efc944969c1445a ptdbg_ascend-3.1-py3-none-any.whl + ``` + +3. whl包安装。 + + 执行如下命令进行安装。 + + ```bash + pip3 install ./ptdbg_ascend-{version}-py3-none-any.whl + ``` + + 若为覆盖安装,请在命令行末尾增加“--force-reinstall”参数强制安装,例如: + + ```bash + pip3 install ./ptdbg_ascend-{version}-py3-none-any.whl --force-reinstall + ``` + + 提示如下信息则表示安装成功。 + + ```bash + Successfully installed ptdbg_ascend-{version} + ``` + +#### 源代码编译安装 + +1. 安装依赖。 + + 编译前需要安装wheel。 + + ```bash + pip3 install wheel + ``` + +2. 下载源码。 + + ```bash + git clone https://gitee.com/ascend/tools.git + ``` + +3. 配置安装环境。 + + ```bash + cd tools/ptdbg_ascend + bash ./configure + ``` + + 默认情况下,执行上述命会弹出如下交互式会话窗口。 + + 您的会话可能有所不同,请以实际情况为准。 + + ```bash + Please specify the location of python with available pytorch v1.8.1/v1.11.0 site-packages installed. 
[Default is /usr/bin/python3] + (You can make this quiet by set env [ADAPTER_TARGET_PYTHON_PATH]): + ``` + + 此时要求输入安装了PyTorch v1.8.1或者v1.11.0 版本的Python解释器路径,若默认路径正确,回车,否则请输入正确的Python解释器路径。 + + > 也可以通过设置ADAPTER_TARGET_PYTHON_PATH的环境变量,来抑制交互式窗口弹出,但是要确保路径是有效的,否则仍然弹出。 + + 配置完成后提示如下信息则表示Python解释器验证成功。 + + ```bash + Configuration finished + ``` + +4. 配置cmake。 + + ```bash + mkdir build + cd build + cmake .. + ``` + + 可能需要几分钟时间下载ptdbg_ascend的依赖项目以完成配置。 + +5. 执行编译。 + + ```bash + make + ``` + + 编译结束后生成如下whl包。 + + ```bash + ./ptdbg_ascend/dist/ptdbg_ascend-{version}-py3-none-any.whl + ``` + +6. 安装。 + + 执行如下命令进行ptdbg_ascend安装。 + + ```bash + pip3 install ./ptdbg_ascend/dist/ptdbg_ascend-{version}-py3-none-any.whl --upgrade --force-reinstall + ``` + +完成ptdbg_ascend安装后,可以进行PyTorch精度数据的dump和、比对和溢出检测等操作,详细介绍请参见《[PyTorch精度工具使用指南](https://gitee.com/ascend/tools/tree/master/ptdbg_ascend/doc)》。 + +## 贡献 + +push代码前,请务必保证已经完成了基础功能测试和网络测试。 + +## Release Notes + +Release Notes请参见[RELEASE](RELEASE.md). diff --git a/debug/accuracy_tools/ptdbg_ascend/RELEASE.md b/debug/accuracy_tools/ptdbg_ascend/RELEASE.md new file mode 100644 index 0000000000000000000000000000000000000000..f37c0731e82732251f248d8a8e2e113cb60b2018 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/RELEASE.md @@ -0,0 +1,4 @@ +# Release 3.2 + +This is the initial release of Pytorch precision compare tools which was designed by the researchers + and engineers in Huawei Technologies Co.,Ltd. 
\ No newline at end of file diff --git a/debug/accuracy_tools/ptdbg_ascend/configure b/debug/accuracy_tools/ptdbg_ascend/configure new file mode 100644 index 0000000000000000000000000000000000000000..a953879ec96b3860b016b111c38c2b1ad419ef84 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/configure @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e +set -o pipefail + +if [ -z "$PYTHON_BIN_PATH" ]; then + PYTHON_BIN_PATH=$(which python3 || which python || true) +fi + +# Set all env variables +CONFIGURE_DIR=$(dirname "$0") +"$PYTHON_BIN_PATH" "${CONFIGURE_DIR}/configure.py" "$@" + +echo "Configuration finished" diff --git a/debug/accuracy_tools/ptdbg_ascend/configure.py b/debug/accuracy_tools/ptdbg_ascend/configure.py new file mode 100644 index 0000000000000000000000000000000000000000..b914b32e9f9ffbe0028dbd05184970c38d388b9d --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/configure.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +# coding=utf-8 +""" +Function: +This class mainly involves tf common function. +Copyright Information: +HuaWei Technologies Co.,Ltd. 
All Rights Reserved © 2022
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import subprocess
+import sys
+
+_PYTORCH_VERSION_1_8 = "1.8"
+_PYTORCH_VERSION_1_11 = "1.11"
+_PYTORCH_VERSION_2_0 = "2.0"
+_PYTORCH_VERSION_2_1 = "2.1"
+_PYTHON_BIN_PATH_ENV = "ADAPTER_TARGET_PYTHON_PATH"
+_ASCEND_INSTALLED_PATH_ENV = "ASCEND_INSTALLED_PATH"
+
+
+def run_command(cmd):
+    """Run cmd (a list of args) and return its stdout decoded and stripped."""
+    output = subprocess.check_output(cmd)
+    return output.decode('UTF-8').strip()
+
+
+def get_input(question):
+    """Prompt the user; uses raw_input on Python 2, returns '' on EOF."""
+    try:
+        try:
+            answer = raw_input(question)
+        except NameError:
+            answer = input(question)
+    except EOFError:
+        answer = ''
+    return answer
+
+
+def config_path(file_name):
+    """Return the path of a generated config file under the tools/ directory."""
+    return os.path.join("tools", file_name)
+
+
+def setup_python(env_path):
+    """Locate a python interpreter with a supported torch; record it for CMake."""
+    default_python_bin_path = sys.executable
+    ask_python_bin_path = ('Please specify the location of python with valid '
+                           'pytorch 1.8/1.11/2.0/2.1 site-packages installed. [Default '
+                           'is %s]\n(You can make this quiet by set env '
+                           '[ADAPTER_TARGET_PYTHON_PATH]): ') % default_python_bin_path
+    custom_python_bin_path = env_path
+    while True:
+        if not custom_python_bin_path:
+            python_bin_path = get_input(ask_python_bin_path)
+        else:
+            python_bin_path = custom_python_bin_path
+            custom_python_bin_path = None
+        if not python_bin_path:
+            python_bin_path = default_python_bin_path
+        # Check if the path is a valid executable file
+        if os.path.isfile(python_bin_path) and os.access(python_bin_path, os.X_OK):
+            pass
+        elif not os.path.exists(python_bin_path):
+            print('Invalid python path: %s cannot be found.' % python_bin_path)
+            continue
+        else:
+            print('%s is not executable. Is it the python binary?' % python_bin_path)
+            continue
+
+        try:
+            compile_args = run_command([
+                python_bin_path, '-c',
+                'import distutils.sysconfig; import torch; print(torch.__version__ + "|" +'
+                ' "|".join(torch.__path__) + "|" + distutils.sysconfig.get_python_inc())']).split("|")
+            if (not compile_args[0].startswith(_PYTORCH_VERSION_1_8)) and \
+                    (not compile_args[0].startswith(_PYTORCH_VERSION_1_11)) and \
+                    (not compile_args[0].startswith(_PYTORCH_VERSION_2_0)) and \
+                    (not compile_args[0].startswith(_PYTORCH_VERSION_2_1)):
+                print('Currently supported Pytorch version is %s/%s/%s/%s, we got %s.'
+                      % (_PYTORCH_VERSION_1_8, _PYTORCH_VERSION_1_11, _PYTORCH_VERSION_2_0, _PYTORCH_VERSION_2_1, compile_args[0]))
+                continue
+        except subprocess.CalledProcessError:
+            print('Pytorch is not installed or does not work properly.')
+            continue
+        # Record the chosen interpreter and the torch install path for CMake to read
+        with open(config_path('PYTHON_BIN_PATH'), 'w') as f:
+            f.write(python_bin_path)
+        with open(config_path('PYTORCH_INSTALLED_PATH'), 'w') as f:
+            f.write(compile_args[1])
+        break
+
+
+def main():
+    """Entry point: configure python from the env override or an interactive prompt."""
+    env_snapshot = dict(os.environ)
+    setup_python(env_snapshot.get(_PYTHON_BIN_PATH_ENV))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md
new file mode 100644
index 0000000000000000000000000000000000000000..4ec3b2d30b7cb3727ac935064bd4223058e08469
--- /dev/null
+++ b/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md
@@ -0,0 +1,33 @@
+## FAQ
+
+### 1. 单机多卡场景dump目录下只生成一个rank目录或pkl文件格式损坏
+
+**故障现象**
+
+dump目录下只生成一个rank目录或dump目录下的pkl文件格式损坏、内容不完整。
+
+**故障原因**
+
+通常是因为register_hook没有正确配置,导致工具没有获取正确的`rank_id`(从rank参数读取或从模型参数的device_id读取)。
+
+**故障处理**
+
+register_hook需要在set_dump_path之后调用,也需要在每个进程上被调用,建议在搬运模型数据到卡之后调用。识别方法如下:
+
+- 找到训练代码中遍历epoch的for循环或遍历数据集的for循环,把register_hook放到循环开始前即可。
+- 找到训练代码中调用DDP或者DistributedDataParallel的代码行,把register_hook放到该代码行所在的代码块之后。
+- 若代码中均无以上两种情况,那么尽可能把这行代码往后放,并配置register_hook的rank参数。
+
+### 2. 
HCCL 报错: error code: EI0006 + +**故障现象** + +使用ptdbg_ascend工具时,报错: error code: EI0006。 + +**故障原因** + +CANN软件版本较低导致不兼容。 + +**故障处理** + +升级新版CANN软件版本。 diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/Pytorch \350\277\220\347\256\227\351\207\215\350\275\275API\345\222\214Acl\347\256\227\345\255\220\345\257\271\345\272\224\345\205\263\347\263\273.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/Pytorch \350\277\220\347\256\227\351\207\215\350\275\275API\345\222\214Acl\347\256\227\345\255\220\345\257\271\345\272\224\345\205\263\347\263\273.md" new file mode 100644 index 0000000000000000000000000000000000000000..860b4555887cdf45eb04dc336a76f05eb8260e49 --- /dev/null +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/Pytorch \350\277\220\347\256\227\351\207\215\350\275\275API\345\222\214Acl\347\256\227\345\255\220\345\257\271\345\272\224\345\205\263\347\263\273.md" @@ -0,0 +1,39 @@ +工具Dump文件命名规则,`{api_type}_{api_name}_{api调用次数}_{正反向}_{输入输出}.index`, 如 Functional_conv2d_1_backward_input.0 +Tensor___bool___0_forward_input.0 +Torch_conv2d_1_backward_input.0 + +Pytorch运算符重载后API(主要涉及api_type为Tensor),工具当前支持Dump的运算符和ACL对应关系整理如下 + +| API | ACL | 运算符 | +| ------------ | ------------ | ------------ | +| __ add __ | Add | + | +| __ and __ | BitwiseAnd | & | +| __ bool __ | NonZero | if x | +| __ div __ | RealDiv | / | +| __ ge __ | GreaterEqual | >= | +| __ gt__ | Greater | > | +| __ iadd__ | Add | += | +| __ iand__ | BitwiseAnd | &= | +| __ idiv__ | RealDiv | /= | +| __ ifloordiv__ | FloorDiv | //= | +| __ ilshift__ | LeftShift | <<= | +| __ imod__ | FloorMod | %= | +| __ imul__ | Mul | *= | +| __ ior__ | BitwiseOr | \|= | +| __ irshift__ | RightShift | >= | +| __ isub__ | Sub | -= | +| __ ixor__ | BitwiseXor | ^= | +| __ lshift__ |LeftShift | << | +| __ matmul__ | Dot | @ | +| __ mod__ | FloorMod | % | +| __ mul__ | Mul | * | +| __ nonzero__ | NonZero | 暂无 | +| __ or__ | BitwiseOr | \| | +| __ radd__ | Add | + | +| __ rmul__ | Mul | * | +| __ rshift__ | RightShift | >> | +| __ sub__ | Sub | 
- | +| __ truediv__ | RealDiv | / | +| __ xor__ | BitwiseXor | ^ | +| floor_divide | FloorDiv | // | +| __ getitem __ | Strideslice | [] | diff --git a/debug/accuracy_tools/ptdbg_ascend/doc/img/auto_analyze_log.png b/debug/accuracy_tools/ptdbg_ascend/doc/img/auto_analyze_log.png new file mode 100644 index 0000000000000000000000000000000000000000..999b47f97ef5661316c7e61dbdc93c87996259f3 Binary files /dev/null and b/debug/accuracy_tools/ptdbg_ascend/doc/img/auto_analyze_log.png differ diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v1.0.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v1.0.md" new file mode 100644 index 0000000000000000000000000000000000000000..bce52373d82df533ea1e5e5a13eace614c485877 --- /dev/null +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v1.0.md" @@ -0,0 +1,539 @@ +# **PyTorch精度工具使用指南** + +## 简介 +本文介绍ptdbg_ascend精度工具,用来进行整网API粒度的数据dump,精度比对和溢出检测,从而定位pytoch训练场景下的精度问题。 + +## 工具安装 + +### 环境和依赖 + +#### 环境 +- 可执行pytorch训练任务的训练环境(安装了Pytorch 1.8 或者 Pytorch 1.11版本) + +#### 工具依赖 +- 通过pip安装环境依赖numpy、pandas、pyyaml + +### 工具安装方式 + +ptdbg_ascend精度工具的安装方式包括:下载whl包安装和源代码编译安装。本文主要介绍whl包安装,源码编译安装详见:[ptdbg_ascend](https://gitee.com/ascend/tools/tree/master/ptdbg_ascend)。 + + +#### 下载whl包安装 + +1. 下载ptdbg_ascend精度工具的whl包。 + + - [ptdbg_ascend-1.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/package/ptdbg_ascend/1.0/ptdbg_ascend-1.0-py3-none-any.whl) + +2. 
执行如下命令,进行安装。 + + ```bash + pip3 install ./ptdbg_ascend-{version}-py3-none-any.whl + ``` + + {version}表示软件版本号。 + + 说明:若为覆盖安装,请增加“--force-reinstall”参数强制安装,例如: + + ```bash + pip3 install ./ptdbg_ascend-{version}-py3-none-any.whl --force-reinstall + ``` + + 分别提示如下信息则表示安装成功: + + ```bash + # ptdbg_ascend精度工具 + Successfully installed ptdbg_ascend-{version} + ``` + +## 功能介绍 + +### 接口说明 + +工具提供如下接口函数用于dump过程的配置,描述如下: + +| 函数 | 描述 | +| ------------------- |---------------------------------------------------------------------------------------------------| +| set_dump_path | 用于设置dump文件的路径(包含文件名),参数示例:“/var/log/dump/npu_dump.pkl” | +| set_dump_switch | 设置dump范围,不设置则默认处于关闭状态。第一个参数为:“ON” 或者 "OFF",若需要控制dump的算子范围,则需要第二、三个参数,默认不配置 | +| seed_all | 固定随机数,参数为随机数种子,默认种子为:1234. | + | set_backward_input | 设置反向ACL级别dump时需要的反向输入的路径,参数示例:"acl_dump_xxx/Functional_conv2d_1_backward_input.0.npy" +| register_hook | 用于注册dump回调函数,例如:注册精度比对hook:register_hook(model, acc_cmp_dump). | +| compare | 比对接口,将GPU/CPU/NPU的dump文件进行比对,第三个参数为存放比对结果的目录;
文件名称基于时间戳自动生成,格式为:compare_result_timestamp.csv. | +| parse | (若pkl文件中有)打印特定api接口的堆栈信息、统计数据信息,第一个参数为pkl文件名,第二个参数为要抽取的api接口前缀,例如"Torch_norm_1_forward". | +| compare_distributed | 单机多卡场景下的比对,自动检索和匹配对应卡和进程所dump的数据文件,再调用compare做比对。也支持单机单卡使用。 | + +### 数据dump +#### 使用说明 +1) seed_all和set_dump_path在训练主函数main一开始就调用,避免随机数固定不全; +2) register_hook须在set_dump_path之后调用,避免dump数据路径设置错误 +3) set_dump_switch提供多种dump模式,可以根据不同场景选择dump方式 +4) 进行CPU数据dump时,请安装torch包而非torch_npu包,避免工具无法识别使用场景,导致失败 +5) TASK_QUEUE_ENABLE环境变量会导致算子下发和执行异步进行,因此在ACL dump前需要将TASK_QUEUE_ENABLE关闭,需要在执行运行命令前先export TASK_QUEUE_ENABLE=0 +``` +# 多种dump模式介绍 + +# 示例1: dump指定api/api列表. +set_dump_switch("ON", mode="list", scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward", "Torch_relu_3_backward"]) + +# 示例2: dump指定范围. 会dump Tensor_abs_1_forward 到 Tensor_transpose_3_forward之间的所有api +set_dump_switch("ON", mode="range", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]) + +# 示例3: STACK模式,只dump堆栈信息, 示例中dump "Tensor_abs_1_forward" 到 "Tensor_transpose_3_forward" 之间所有api的STACK信息 +set_dump_switch("ON", mode="stack", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]) + +# 示例4: dump指定api/api列表的ACL级别的输入输出数据 +set_dump_switch("ON", mode="acl", scope=["Tensor_abs_1_forward"]) + +# 示例5: dump指定某一类api的api级别输入输出数据 +set_dump_switch("ON", mode="api_list", api_list=["relu"]) + +# 示例6: dump全部api级别输入输出数据以及相应堆栈信息 +set_dump_switch("ON", mode="api_stack") + +``` +4) dump数据存盘说明:
+ +- 精度比对dump场景
+ 假设配置的dump文件名为npu_dump.pkl,此时dump的结果为两部分: + +* 文件npu_dump.pkl 中包含dump数据的api名称、dtype、 shape、统计信息:max, min, mean.
+* 文件夹npu_dump_timestamp,文件夹下为numpy格式的dump数据.
+ numpy文件保存的前缀和Pytorch对应关系如下 + +| 前缀 | Torch模块 | +| ------------------- |---------------------------------------------------------------------------------------------------| +| Tensor | torch.Tensor | +| Torch | torch | +| Functional | torch.nn.functional | +| NPU | NPU亲和算子 | +| VF | torch._VF | + +当dump模式配置为 "api_stack"时 假设配置的dump文件名为npu_dump.pkl,文件名会被添加api_stack前缀,此时dump的结果为两部分: +* 文件api_stack_npu_dump.pkl 中包含dump数据的api名称、dtype、 shape、统计信息:max, min, mean,以及堆栈信息。
+* 文件夹api_stack_npu_dump_timestamp,文件夹下为numpy格式的dump数据.
+ +**【新改动】** 单机多卡比对功能已上线,dump数据文件夹组织统一改为如下格式 + ``` + ├── dump_path + │   └── ptdbg_dump_v1.0 + │   ├── rank0 + │   │   ├── myDump + | | | ├── Tensor_permute_1_forward.npy + | | | ... + | | | └── Fcuntion_linear_5_backward_output.npy + │   │ └── myDump.pkl + │   ├── rank1 + | | ├── myDump + | | | └── ... + | | └── myDump.pkl + │   ├── rank2 + | | ├── myDump + | | | └── ... + | | └── myDump.pkl + │   ├── ... + │   | + | └── rank7 + ``` +引入这个格式是为了区分各卡所dump数据,有多少张卡就有多少个rank文件夹。同时为了避免单卡和多卡使用方式割裂,单机单卡使用工具也会形成上述文件夹格式,仅在卡数量上有区别。 +具体生成方式和单机多卡的精度工具使用教程见下文场景4。 + + +5) 整网dump和指定范围dump结果的区别: +* 指定范围dump时,npu_dump.pkl 中还包含stack信息
+ +6) 溢出检测dump场景
+测试不需要配置dump文件名,会在当前目录自动生成`ptdbg_dump_v1.0`文件夹,并且按卡数量创建rank文件夹,每张卡dump数据会在对应rank文件夹中: + +* 溢出检测的pkl文件名格式为`Overflow_info_{timestamp}.pkl`,每次溢出时时间戳不同
+ pkl文件中包含dump数据的api名称、dtype、 shape(不包含统计信息max, min, mean)。 +* 对应的dump数据存放目录为`Overflow_info_{timestamp}`,dump数据为完整Tensor数据,存放格式为numpy。 + + +## 场景化示例 +### 场景1:训练场景的精度问题分析 +第一步,整网Dump比对,初步定位异常范围
+数据dump。NPU和GPU/CPU数据,下面以NPU为例(GPU/CPU dump基本相同):
+``` +from ptdbg_ascend import * + +# 在main函数开始前固定随机数 +seed_all() + +# 设置dump路径(含文件名)和dump_tag。dump_tag会体现在数据文件夹的文件名上 +# 多卡使用时最好也在main函数开始前设置 +set_dump_path("./npu_dump.pkl", dump_tag="dump_conv2d") + +... + +# 注册精度比对dump的hook函数 +# 第一个参数是model对象, 第二个参数为精度比对dump的钩子函数,配置为:acc_cmp_dump,该函数从ptdbg_ascend中import + +# 示例 +register_hook(model, acc_cmp_dump) + +... + +# dump默认处于关闭状态,设置dump开关为打开 +# 如果只在特定的step dump,则在期望dump的迭代开始前打开dump开关,step结束后关掉。 +set_dump_switch("ON") + +... + +# 在期望dump的step结束后关闭dump开关 +set_dump_switch("OFF") + +... + +``` + +比对dump数据
+``` +from ptdbg_ascend import * + +... + +# 数据dump完成后,比对dump的NPU vs GPU/CPU数据, compare第二个参数中的目录必须是已存在的目录 +比对示例: +dump_result_param={ +"npu_pkl_path": "./npu_dump.pkl", +"bench_pkl_path": "./gpu_dump.pkl", +"npu_dump_data_dir": "./npu_dump_20230104_13434", +"bench_dump_data_dir": "./gpu_dump_20230104_132544", +"is_print_compare_log": True +} +compare(dump_result_param, "./output", True) +``` +Dump数据时使用"api_stack" 模式时进行比对dump数据
+``` +from ptdbg_ascend import * + +... + +# 数据dump完成后,比对dump的NPU vs GPU/CPU数据, compare第二个参数中的目录必须是已存在的目录, stack_mode参数需要配置为True, 默认为False +# 请注意:stack_mode为True时,需配置使用"api_stack"模式下的dump数据,其他模式均不需要设置stack_mode +# api_stack为"api_stack"模式下自动生成的前缀(参考4.dump数据存盘数据说明) +比对示例: +dump_result_param={ +"npu_pkl_path": "./api_stack_npu_dump.pkl", +"bench_pkl_path": "./api_stack_gpu_dump.pkl", +"npu_dump_data_dir": "./api_stack_npu_dump_20230104_13434", +"bench_dump_data_dir": "./api_stack_gpu_dump_20230104_132544", +"is_print_compare_log": True +} +compare(dump_result_param, "./output", True, stack_mode=True) +# 比对结果中将展示堆栈信息 +``` + + +第二步:缩小范围分析
+ 指定api范围做完整数据的dump,此时也可以做精度比对。
+ 指定范围dump时,还会dump出stack信息,便于找到api调用点。
+ 示例代码中只包含第一步基础之上,需要调整的设置。 +``` +# 设置dump路径(含文件名),dump路径若不重新设置,会导致整网dump的数据被覆盖 +set_dump_path("./npu_dump_scope.pkl") + +... + +# 注册精度比对dump的hook函数 +register_hook(model, acc_cmp_dump) + +... + +# 通过set_dump_switch控制dump的范围 +# 示例1: dump指定api/api列表. +set_dump_switch("ON", mode="list", scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward", "Torch_relu_3_forward"]) +# 示例2: dump指定范围. 会dump Tensor_abs_1_forward 到 Tensor_transpose_2_forward之间的所有api +set_dump_switch("ON", mode="range", scope=["Tensor_abs_1_forward", "Tensor_transpose_2_forward"]) +# 示例3: dump指定前向api的ACL级别数据. +register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='dump.json') +set_dump_switch("ON", mode="acl", scope=["Tensor_permute_1_forward"]) +# 示例4: dump指定反向api的ACL级别数据. +register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='dump.json') +set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) +set_backward_input(["xxx/Functional_conv2d_1_backward_input.0.npy"]) +... +``` +按范围dump后的分析
+可以基于dump的完整数据做比对,可以结合堆栈信息分析代码,也可以做单API模型的问题复现; + +### 场景2:提取指定API的堆栈信息/dump数据的统计信息 +指定范围dump的信息可能包含多个api,且pkl文件显示不直观,这里通过parse接口可以清晰的显示特定api的堆栈信息和dump数据统计信息 +``` +from ptdbg_ascend import * + +# 提取dump信息中第21次调用的API:Torch_batch_normal的堆栈信息及数据统计信息 +parse("./npu_dump.pkl", "Torch_batch_normal_1_forward") +``` + +### 场景3:溢出检测分析(NPU场景识别aicore浮点溢出,GPU和CPU不支持) +#### 1. api溢出检测,溢出api,api级数据dump +``` +from ptdbg_ascend import * + +# 在main函数起始位置固定随机数 +seed_all() + +... + +#注册溢出检测的hook: +# 第一个参数是model对象, 第二个参数为精度比对dump的钩子函数名,必须配置为:overflow_check,该函数从ptdbg_ascend中import +# 第三个参数为溢出检测的次数,例如配置为3,表示检测到第三次溢出时停止训练; + +# 示例,检测到2次溢出后退出 +register_hook(model, overflow_check, overflow_nums=2) + +... +``` +注:单机多卡使用时各卡单独计算溢出次数。 + +#### 2. api溢出检测,溢出api,acl级数据dump + +``` +from ptdbg_ascend import * + +# 在main函数起始位置固定随机数 +seed_all() + +... + +#注册溢出检测的hook: +# 第一个参数是model对象, 第二个参数为精度比对dump的钩子函数名,必须配置为:overflow_check,该函数从ptdbg_ascend中import +# 第三个参数为overflow_nums表示第几次溢出时,停止训练,例如配置为3,表示检测到第三次溢出时停止训练,过程中检测到溢出API对应ACL数据均dump;默认不配置即检测到一次溢出,训练停止 +# 第四个参数为dump_mode,控制针对溢出api的dump模式,默认api,如需进一步定位acl数据,可配置为dump_mode="acl" +# 第五个参数为dump_config,acl dump的配置文件,dump_mode="acl"时,此配置项为必须的。例如:dump_config='/home/xxx/dump.json' + +# 针对正向溢出场景,可以直接通过上述配置,将溢出api进行acl粒度的数据dump +# 示例,检测到1次溢出后退出,并针对溢出api,进行对应acl粒度的数据dump +register_hook(model, overflow_check, dump_mode='acl', dump_config='/home/xxx/dump.json') + +... + +# 默认全量进行溢出检测 +# 如果只在特定的step 溢出检测,则在期望溢出检测的迭代开始前打开溢出检测开关,step结束后关掉。 +set_overflow_check_switch("ON") + +... + +# 在期望溢出检测的step结束后关闭溢出检测开关 +set_overflow_check_switch("OFF") + +... 
+ +# 对于反向溢出场景获取反向acl级别数据 +# 使用acl模式,配置上梯度输入文件,再进行一次dump +register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='dump.json') +set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) +set_backward_input(["xxx/Functional_conv2d_1_backward_input.0.npy"]) # 该输入文件为首次运行得到的反向输入 +``` +#### dump.json配置示例 +``` +{ + "dump": + { + "dump_list":[], + "dump_path":"/home/HwHiAiUser/dump/output", + "dump_mode":"all", + "dump_op_switch":"on" + } +} +``` +#### dump.json参数说明 +| 字段名 | 说明 | +|-----------------|---------------------------------------------------------------------------------------------------| +| dump_list | 待dump数据的算子模型。为空,无需配置。 | +| dump_path | dump数据文件存储到运行环境的目录,支持配置绝对路径或相对路径:
* 绝对路径配置以“/”开头,例如:/home/HwHiAiUser/output。
* 相对路径配置直接以目录名开始,例如:output。
例如:dump_path配置为/home/HwHiAiUser/output,则dump数据文件存储到运行环境的/home/HwHiAiUser/output目录下。 | +| dump_mode | dump数据模式,配置如下:
* output:dump算子的输出数据,默认取值output。
* input:dump算子的输入数据。
* all:dump算子的输入、输出数据。| +| dump_op_switch | 单算子模型dump数据开关,配置如下:
* off:关闭单算子模型dump,默认取值off。
* on:开启单算子模型dump。| + +##### dump路径说明 +采集的dump数据会在{dump_path}/{time}/{deviceid}/{model_id}目录下生成,例如“/home/HwHiAiUser/output/20200808163566/0/0” +``` +├── 20230131172437 +│   └── 1 +│   ├── 0 +│   │   ├── Add.Add.45.0.1675157077183551 +│   │   ├── Cast.trans_Cast_0.31.0.1675157077159449 +│   │   ├── Cast.trans_Cast_5.43.0.1675157077180129 +│   │   ├── MatMul.MatMul.39.0.1675157077172961 +│   │   ├── Mul.Mul.29.0.1675157077155731 +│   │   ├── NPUAllocFloatStatus.NPUAllocFloatStatus.24.0.1675157077145262 +│   │   ├── TransData.trans_TransData_1.33.0.1675157077162791 +│   │   └── TransData.trans_TransData_4.41.0.1675157077176648 +│   ├── 1701737061 +│   │   └── Cast.trans_Cast_2.35.0.1675157077166214 +│   ├── 25 +│   │   └── NPUClearFloatStatus.NPUClearFloatStatus.26.0.1675157077150342 +│   └── 68 +│   └── TransData.trans_TransData_3.37.0.1675157077169473 +``` +#### 注意事项 +此功能原理是,针对溢出阶段,开启acl dump模式,重新对溢出阶段执行,落盘数据。 +* dump_mode="acl"场景下,会增加npu的内存消耗,请用户谨慎开启。 + +* 针对前向溢出api,可以通过以上原理,重新精准执行到溢出前向api,因此可以得到前向溢出api的全部acl数据。 + +* 部分api存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm, 该场景会影响acl init初始化多次,导致功能异常。针对此场景,后续会针对性做适配,当前版本可能存在此问题 + +* 针对前向溢出api,可以通过overflow_nums,配置允许的溢出次数,并将每次溢出api的全部acl数据dump下来,到达指定溢出次数后停止,停止后会看到堆栈打印包含如下字段。 + ValueError: [overflow xxx times]: dump file is saved in 'xxxxx.pkl'. 
+ 其中xxx times为用户设置的次数,xxxxx.pkl为文件生成路径 + +* 对于反向溢出场景获取acl级别数据,第一轮获取反向算子的输入数据,准备好后配置dump.json,并配置好输入数据路径,相关配置如下: + + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='dump.json') + set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) + set_backward_input(["xxx/Functional_conv2d_1_backward_input.0.npy"]) + +### 场景四 单机多卡场景使用精度比对工具 +精度工具单机多卡功能继承了单机单卡时工具的所有功能,如果你想了解工具的基本功能,请参阅上面的场景一到场景三。 +如果你已经熟悉单机单卡使用本工具,想了解如何单机多卡使用,那么请参考[迅速上手:单机多卡使用注意事项](./NotesForMultiCardTraining.md) + +**文件夹格式改动** + +为了支持单机多卡场景,我们模仿ACL溢出检测dump的文件夹,区分了不同rank所dump的数据文件。 +假设dump路径设置为`set_dump_path('./dump_path/myDump.pkl', dump_tag='dump_conv2d')`, +则数据(pkl和包含npy文件的文件夹)会dump在:`./dump_path/{dump_tag}_{version}/rank{rankid}/`路径下。比如: + + ``` + ├── dump_path + │   └── dump_conv2d_v1.0 + │   ├── rank0 + │   │   ├── myDump + | | | ├── Tensor_permute_1_forward.npy + | | | ... + | | | └── Fcuntion_linear_5_backward_output.npy + │   │ └── myDump.pkl + │   ├── rank1 + | | ├── myDump + | | | └── ... + | | └── myDump.pkl + │   ├── rank2 + | | ├── myDump + | | | └── ... + | | └── myDump.pkl + │   ├── ... + │   | + | └── rank7 + ``` + +具体地说,dump_path下首先产生一个`{dump_tag}_{version}`文件夹,`dump_tag`是set_dump_path传入参数设置的,可以用来提高文件夹辨识度。 +`version`是工具版本,用于区分不同版本工具所dump的数据这个文件夹中会根据实际使用卡的数量产生若干`rank`文件夹。 +每张卡上dump结果产生pkl和npy数据文件夹会存在对应的rank文件夹下。 +需要注意的是,如果以相同的dump_path和dump_tag运行两次,则**第二次的数据文件会覆盖第一次的**。 + +**单机多卡使用说明** +1. set_dump_path 设置dump目标路径 + +由于上述文件夹结构改动,你可能已经注意到了最终dump的pkl路径和原本set_dump_path传入的路径不同。 +另外,我们给set_dump_path新增了一个参数`dump_tag`,用来标识本次dump的用途,优化文件夹结构。 +比如,你正在用工具调试ResNet50,首先做了一次全量dump,可以 + +``` +set_dump_path('./dump_resnet50/dump.pkl', dump_tag='all') +``` +经过全量dump你发现其中某个conv2d算子计算误差较大,想要定位到代码行,那么可以 +``` +set_dump_path('./dump_resnet50/dump.pkl', dump_tag='conv2d_stack') +``` +并在`set_dump_switch`时启用stack模式。这样在`dump_resnet50`文件夹下就会分别有`all_{version}`和`conv2d_stack_{version}`两个文件夹,方便查看。 + +2. 
register_hook 注册工具的dump或溢出检测钩子 + +为了方便区分不同卡上的dump数据,调用register_hook时可以通过`rank`参数传入各自进程所对应的`rank_id`,比如 + +``` +register_hook(model, acc_cmp_dump, rank=rank_id) +``` + +`rank`将决定该进程所dump数据被存入哪个`rank`文件夹(如上面文件夹格式所描述)。如果不清楚当前rank id或者不显式传入, +工具将隐式从传入模型的参数读取`device.index`信息作为`rank`。因此隐式读取时用户须保证在模型已经上卡之后再调用`register_hook` +需要注意的是,由于该函数会创建各卡dump数据时的目标`rank`文件夹,因此在调用register_hook前必须先set_dump_path,否则set_dump_path会失效。 + +3. compare_distributed 分布式比对 + +dump数据之后的比对建议使用`compare_distributed`接口。调用该接口需要传入`npu_dump_dir`, `bench_dump_dir`, `output_path`三个参数, +前两者代表需要比对的两次运行数据所在的总文件夹路径,即上文所说的`{dump_path}/{dump_tag}_{version}` 。函数会自动检测文件夹下的`rank`文件夹并按顺序一一对应, +并调用compare逐个做比对,最终对每对`rank`文件夹生成一个csv比对结果。 + +在上面的例子中,我们可以传入 `dump_path/dump_conv2d_v1.0` 作为`npu_dump_dir`参数。 + + 假设我们要比对的标杆数据在`dump_gpu/dump_conv2d_v1.0`文件夹(文件夹下应有对应数量的rank文件夹),要比对以上两次运行所产生的数据差异, + 就可以把这个路径作为`bench_dump_dir`传入。如: +```python +compare_distributed('dump_path/dump_conv2d_v1.0', 'dump_gpu/dump_conv2d_v1.0', './output') +``` +另外,原本`compare`比对函数支持的参数如`shape_flag`、`stack_mode`等,`compare_distributed`函数也支持。 + +**注意:两次运行须用相同数量的卡,传入`compare_distributed`的两个文件夹下须有相同个数的rank文件夹,且不包含其他无关文件,否则将无法比对。** + +### **NPU自定义算子dump** +对于NPU vs NPU场景,本工具还支持对NPU自定义算子的数据dump,目前支持列表如下 + +| NPU自定义算子 | +| ------ | +| torch_npu.one_ | +| torch_npu.npu_sort_v2 | +| torch_npu.npu_transpose | +| torch_npu.npu_broadcast | +| torch_npu.npu_dtype_cast | +| torch_npu.empty_with_format | +| torch_npu.npu_one_hot | +| torch_npu.npu_stride_add | +| torch_npu.npu_ps_roi_pooling | +| torch_npu.npu_roi_align | +| torch_npu.npu_nms_v4 | +| torch_npu.npu_iou | +| torch_npu.npu_nms_with_mask | +| torch_npu.npu_pad | +| torch_npu.npu_bounding_box_encode | +| torch_npu.npu_bounding_box_decode | +| torch_npu.npu_batch_nms | +| torch_npu.npu_slice | +| torch_npu._npu_dropout | +| torch_npu.npu_indexing +| torch_npu.npu_ifmr | +| torch_npu.npu_max | +| torch_npu.npu_scatter | +| torch_npu.npu_layer_norm_eval | +| torch_npu.npu_alloc_float_status | +| 
torch_npu.npu_get_float_status | +| torch_npu.npu_clear_float_status | +| torch_npu.npu_confusion_transpose | +| torch_npu.npu_bmmV2 | +| torch_npu.fast_gelu | +| torch_npu.npu_sub_sample | +| torch_npu.npu_deformable_conv2d | +| torch_npu.npu_mish | +| torch_npu.npu_anchor_response_flags | +| torch_npu.npu_yolo_boxes_encode | +| torch_npu.npu_grid_assign_positive | +| torch_npu.npu_normalize_batch | +| torch_npu.npu_masked_fill_range | +| torch_npu.npu_linear | +| torch_npu.npu_bert_apply_adam | +| torch_npu.npu_giou | +| torch_npu.npu_ciou | +| torch_npu.npu_ciou_backward | +| torch_npu.npu_diou | +| torch_npu.npu_diou_backward | +| torch_npu.npu_sign_bits_pack | +| torch_npu.npu_sign_bits_unpack | + +### **计算精度评价指标** + +在进行计算精度匹配时,基本共识为默认CPU或GPU的算子计算结果是准确的,最终比对生成的csv文件中主要包括以下的几个属性: + +| NPU Name | Bench Name | Npu Tensor Dtype | Bench Tensor Dtype | Npu Tensor Shape | Bench Tensor Shape | Cosine | MaxAbsError | ... | +|:--------:|:----------:|:----------------:|:-------------------:|:-----------------:|:-------------------:|:-------:|:-------------:|:-----:| + +其中主要使用算子Name、Dtype、Shape用于描述算子的基本特征,Cosine(余弦相似)、MaxAbsError(最大绝对误差)作为评价计算精度的主要评估指标: + +1. 余弦相似度(通过计算两个向量的余弦值来判断其相似度): + + +当余弦夹角数值越接近于1说明计算出的两个张量越相似,在计算中可能会存在nan,主要由于可能会出现其中一个向量为0 + +2. 
MaxAbsError(最大绝对误差): + +当最大绝对误差越接近0表示其计算的误差越小 diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v2.0.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v2.0.md" new file mode 100644 index 0000000000000000000000000000000000000000..eba61ce1a6b2c0e8eb3b3092563e5340319386dc --- /dev/null +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v2.0.md" @@ -0,0 +1,1001 @@ +# **PyTorch精度工具使用指南** + +本文主要介绍PyTorch精度工具精度工具ptdbg_ascend的使用以及精度比对场景示例。 + +ptdbg_ascend工具的原理及安装请参见《[PyTorch精度工具](https://gitee.com/ascend/tools/blob/master/ptdbg_ascend/README.md)》。 + +## PyTorch精度比对总体流程 + +1. 准备CPU或GPU训练工程。 + +2. 在环境下安装ptdbg_ascend工具。 + +3. 在训练脚本内插入ptdbg_ascend工具dump接口。 + +4. 执行训练dump数据。 + +5. 将CPU或GPU训练工程迁移为NPU训练工程。 + + 请参见《[PyTorch模型迁移和训练指南](https://www.hiascend.com/document/detail/zh/canncommercial/63RC1/modeldevpt/ptmigr/ptmigr_0001.html)》。 + +6. 在NPU环境下安装ptdbg_ascend工具。 + +7. 在NPU训练脚本内插入ptdbg_ascend工具dump接口。 + +8. NPU环境下执行训练dump数据。 + +9. 创建并配置精度比对脚本,例如compare.py。 + +10. 执行CPU或GPU dump与NPU dump数据的精度比对。 + +11. 比对结果分析。 + +## 场景化示例 + +本章节主要介绍通过ptdbg_ascend工具进行精度比对和分析,主要使用“**CPU或GPU及NPU精度数据dump**”和“**CPU或GPU与NPU精度数据比对**”章节中介绍的ptdbg_ascend工具接口。 + +### 单卡场景精度比对 + +**精度分析建议** + +PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: + +1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 +2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 +3. 范围比对:对不符合精度标准的API重新dump。 +4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 +5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 +6. 重复1~5步,直到不存在精度问题为止。 + +**精度分析示例** + +1. 
dump整网数据。 + + 分别dump CPU或GPU以及NPU数据,在PyTorch训练脚本插入dump接口,示例代码如下(下面以NPU为例,CPU或GPU dump基本相同): + + ```python + from ptdbg_ascend import * + + # 在main函数开始前固定随机数 + seed_all() + + # 配置dump数据目录路径和名称 + set_dump_path("./npu_dump", dump_tag='all') + + # 注册dump回调函数 + register_hook(model, acc_cmp_dump) + + ... + + # 在第一个迭代开始的位置开启dump和堆栈模式,同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="api_stack", filter_switch="OFF") + + ... + + # 在第一个迭代结束的位置关闭dump + set_dump_switch("OFF") + ``` + +2. 比对整网数据。 + + 第1步中的NPU dump数据文件为npu_dump.pkl,假设NPU dump npy数据目录为npu_dump,GPU dump数据文件为gpu_dump.pkl,GPU dump npy数据目录为gpu_dump。 + + 创建并配置精度比对脚本,以创建compare.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + dump_result_param={ + "npu_pkl_path": "./npu_dump/all_v2.0/rank0/api_stack_dump.pkl", + "bench_pkl_path": "./gpu_dump/all_v2.0/rank0/api_stack_dump.pkl", + "npu_dump_data_dir": "./npu_dump/all_v2.0/rank0/api_stack_dump", + "bench_dump_data_dir": "./gpu_dump/all_v2.0/rank0/api_stack_dump", + "is_print_compare_log": True + } + compare(dump_result_param, "./output") + ``` + + 执行比对: + + ```bash + python3 compare.py + ``` + + 在output目录下生成结果文件,包括:`compare_result_{timestamp}.csv`和`advisor_{timestamp}.txt` + +3. 找出存在问题的API。 + + 1. 根据`advisor_{timestamp}.txt`或打屏信息的提示,可找到存在精度问题的算子(Suspect Nodes)和专家建议(Expert Advice) + + ![auto_analyze_log](img/auto_analyze_log.png) + + 2. 根据第2步结果文件`compare_result_{timestamp}.csv`中的Accuracy Reached or No字段显示为NO的API,针对该API执行后续比对操作,分析该API存在的精度问题。 + +4. (可选)提取指定API的堆栈信息和dump数据统计信息。 + + 通过parse接口可以清晰的显示特定API的堆栈信息和dump数据统计信息,结合堆栈信息分析代码中可能存在的精度问题。 + + 创建并配置提取脚本,以创建parse.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + + # 提取dump信息中第1次调用的API:Torch_batch_normal的堆栈信息及数据统计信息 + parse("./npu_dump/all_v2.0/rank0/api_stack_dump.pkl", "Torch_batch_normal_1_forward") + ``` + + 执行提取: + + ```bash + python3 parse.py + ``` + + + +5. 
(可选)指定API dump数据。 + + - dump指定前向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + + # 固定随机数,开启确定性计算 + seed_all(mode=True) + set_dump_path("./dump_path", dump_tag='forward') + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + + # dump指定前向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="acl", scope=["Tensor_permute_1_forward"], filter_switch="OFF") + + ... + + set_dump_switch("OFF") + ``` + + - dump指定反向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + + # 固定随机数,开启确定性计算 + seed_all(mode=True) + set_dump_path("./dump_path", dump_tag='backward') + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + + # dump指定反向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"], filter_switch="OFF") + set_backward_input(["./npu_dump/all_v2.0/rank0/api_stack_dump/Functional_conv2d_1_backward_input.0.npy"]) + + ... + + set_dump_switch("OFF") + ``` + +6. (可选)重新比对。 + + 根据第4或5步的dump数据重新配置compare.py并执行比对,可以对单API模型进行问题复现。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 + +### 多卡场景精度比对 + +精度工具支持多卡场景的精度比对,多卡场景的dump步骤与单卡场景完全一致,请参见“**单卡场景精度比对**”章节,不同的是多卡数据精度比对时需要使用“compare_distributed”函数进行比对。如下示例: + +说明:多机多卡场景需要每个设备单独执行比对操作。 + +假设NPU dump npy数据目录为npu_dump/dump_conv2d_v1.0,GPU dump npy数据目录为gpu_dump/dump_conv2d_v1.0。 + +1. 创建比对脚本,例如compare_distributed.py,拷贝如下代码。 + + ```python + from ptdbg_ascend import * + compare_distributed('./npu_dump/ptdbg_dump_v2.0', './gpu_dump/ptdbg_dump_v2.0', './output') + ``` + +2. 
执行比对: + + ```bash + python3 compare_distributed.py + ``` + +两次运行须用相同数量的卡,传入`compare_distributed`的两个文件夹下须有相同个数的rank文件夹,且不包含其他无关文件,否则将无法比对。 + +**多卡set_dump_path注意事项** + +多卡一般为多进程,须保证每个进程都正确调用set_dump_path,或把set_dump_path插入到import语句后,如: + +```python +from ptdbg_ascend import * +seed_all() +set_dump_path('./dump_resnet') +``` + +如此可保证set_dump_path在每个进程都被调用。 + +**多卡register_hook注意事项** + +register_hook需要在set_dump_path之后调用,也需要在每个进程上被调用,建议在搬运模型数据到卡之后调用。识别方法如下: + +- 找到训练代码中遍历epoch的for循环或遍历数据集的for循环,把register_hook放到循环开始前即可。 +- 找到训练代码中调用DDP或者DistributedDataParallel的代码行,把register_hook放到该代码行所在的代码块之后。 +- 若代码中均无以上两种情况,需要保证register_hook在模型定义之后插入,并配置rank参数。rank参数获取rank_id请参见“**[rank_id获取方法](https://gitee.com/ascend/tools/tree/master/ptdbg_ascend/doc/rank_id获取方法.md)**”。 + +### NPU vs NPU精度比对 + +对于NPU vs NPU场景,是针对同一模型,进行迭代(模型、API版本升级或设备硬件升级)时存在的精度下降问题,对比相同模型在迭代前后版本的API计算数值,进行问题定位。 + +一般情况下迭代涉及NPU自定义算子,因此,可以仅dump NPU自定义算子进行比对。比对精度问题分析请参见“**单卡场景精度比对**”章节。 + +工具当前支持dump NPU自定义算子如下: + +| 序号 | NPU自定义算子 | +| :--- | ----------------------------------- | +| 1 | torch_npu.one_ | +| 2 | torch_npu.npu_sort_v2 | +| 3 | torch_npu.npu_transpose | +| 4 | torch_npu.npu_broadcast | +| 5 | torch_npu.npu_dtype_cast | +| 6 | torch_npu.empty_with_format | +| 7 | torch_npu.npu_one_hot | +| 8 | torch_npu.npu_stride_add | +| 9 | torch_npu.npu_ps_roi_pooling | +| 10 | torch_npu.npu_roi_align | +| 11 | torch_npu.npu_nms_v4 | +| 12 | torch_npu.npu_iou | +| 13 | torch_npu.npu_nms_with_mask | +| 14 | torch_npu.npu_pad | +| 15 | torch_npu.npu_bounding_box_encode | +| 16 | torch_npu.npu_bounding_box_decode | +| 17 | torch_npu.npu_batch_nms | +| 18 | torch_npu.npu_slice | +| 19 | torch_npu._npu_dropout | +| 20 | torch_npu.npu_indexing | +| 21 | torch_npu.npu_ifmr | +| 22 | torch_npu.npu_max | +| 23 | torch_npu.npu_scatter | +| 24 | torch_npu.npu_layer_norm_eval | +| 25 | torch_npu.npu_alloc_float_status | +| 26 | torch_npu.npu_get_float_status | +| 27 | torch_npu.npu_clear_float_status | +| 28 | 
torch_npu.npu_confusion_transpose | +| 29 | torch_npu.npu_bmmV2 | +| 30 | torch_npu.fast_gelu | +| 31 | torch_npu.npu_sub_sample | +| 32 | torch_npu.npu_deformable_conv2d | +| 33 | torch_npu.npu_mish | +| 34 | torch_npu.npu_anchor_response_flags | +| 35 | torch_npu.npu_yolo_boxes_encode | +| 36 | torch_npu.npu_grid_assign_positive | +| 37 | torch_npu.npu_normalize_batch | +| 38 | torch_npu.npu_masked_fill_range | +| 39 | torch_npu.npu_linear | +| 40 | torch_npu.npu_bert_apply_adam | +| 41 | torch_npu.npu_giou | +| 42 | torch_npu.npu_ciou | +| 43 | torch_npu.npu_ciou_backward | +| 44 | torch_npu.npu_diou | +| 45 | torch_npu.npu_diou_backward | +| 46 | torch_npu.npu_sign_bits_pack | +| 47 | torch_npu.npu_sign_bits_unpack | + +### 溢出检测场景 + +溢出检测是针对NPU的PyTorch API,检测是否存在溢出的情况。当前仅支持识别aicore浮点溢出。 + +溢出检测原理:针对溢出阶段,开启acl dump模式,重新对溢出阶段执行,落盘数据。 + +建议按照如下步骤操作: + +1. 在NPU环境下安装ptdbg_ascend工具。 + +2. 在NPU训练脚本内插入ptdbg_ascend工具溢出检测接口。 + + - 示例1:全量溢出检测 + + ```python + from ptdbg_ascend import * + seed_all() + ... + # 设置检测到3次溢出后退出训练 + register_hook(model, overflow_check, overflow_nums=3) + + ... + ``` + + 多卡使用时各卡单独计算溢出次数。 + + - 示例2:dump指定API的ACL级别溢出数据 + + ```python + from ptdbg_ascend import * + seed_all() + ... + # dump指定API的ACL级别溢出数据 + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + + # 在期望溢出检测的step位置开始前打开溢出检测开关 + set_overflow_check_switch("ON") + + ... + + # 在step结束的位置关闭溢出检测开关 + set_overflow_check_switch("OFF") + + ... + ``` + + - 示例3:dump指定反向API的ACL级别的溢出数据 + + 1. 进行全量溢出检测 + + ```python + from ptdbg_ascend import * + seed_all() + ... + # 设置检测到3次溢出后退出训练 + register_hook(model, overflow_check) + + ... + ``` + + 2. dump指定反向API的ACL级别的溢出数据 + + ```python + from ptdbg_ascend import * + seed_all() + ... 
+ # dump指定反向API的ACL级别溢出数据 + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) + set_backward_input(["./npu_dump/ptdbg_dump_v2.0/rank0/dump/Functional_conv2d_1_backward_input.0.npy"]) + ``` + + 针对前向溢出API,可以通过overflow_nums,配置允许的溢出次数,并将每次溢出API的全部ACL数据dump下来,到达指定溢出次数后停止,停止后会看到堆栈打印包含如下字段。 + + ```bash + ValueError: [overflow xxx times]: dump file is saved in 'xxxxx.pkl'. + ``` + + 其中xxx times为用户设置的次数,xxxxx.pkl为文件生成路径。 + +3. NPU环境下执行训练dump溢出数据。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 + +## CPU或GPU及NPU精度数据dump + +### 总体说明 + +- 本节主要介绍CPU或GPU及NPU精度数据dump所需要的函数以及示例。 + +- ptdbg_ascend工具默认情况下仅dump PyTorch模型的API输入输出数据进行精度比对,若在比对结果中发现某个API下可能存在ACL的精度问题,那么可以选择dump该API的ACL级别数据进行精度分析。 + +- 某些torch api的输出不是Tensor类型的数据。对于此类API的反向过程进行ACL dump,工具会在运行日志中给出对应的Warning(is not of tensor type and cannot be automatically derived)提示。如若想要进行该类API反向ACL dump,可以通过手动构建单API用例的方式进行ACL dump,具体用例可参见“**[反向ACL dump用例说明](https://gitee.com/ascend/tools/blob/master/ptdbg_ascend/doc/%E5%8F%8D%E5%90%91ACL%20dump%E7%94%A8%E4%BE%8B%E8%AF%B4%E6%98%8E.md)**”。 + +- 工具性能:dump数据量较小时(小于5G),参考dump速度0.1GB/s;dump数据量较大时,参考dump速度0.2GB/s。 + 推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。 + + 用户环境性能弱于标准约束或非独占使用的比对速度酌情向下浮动。Dump速度的计算方式:Dump数据量/(单个step添加Dump耗时-原始单个step耗时)。 + +### 约束 + +- 进行CPU或GPU数据dump时,请安装torch包而非torch_npu包,避免工具无法识别使用场景,导致失败。 + +- TASK_QUEUE_ENABLE环境变量会导致API下发和执行异步进行,因此在ACL dump前需要将TASK_QUEUE_ENABLE关闭,即export TASK_QUEUE_ENABLE=0。 + +- 不建议在PyTorch训练脚本中同时添加dump接口和性能数据采集(如Ascend PyThon Profiler)接口,二者可能相互影响导致数据不准确。 + +### seed_all + +**功能说明** + +固定随机数。通过固定随机数保证模型的输入或输出一致。在训练主函数开始前调用,避免随机数固定不全。 + +dump操作必选。 + +**函数原型** + +```python +seed_all(seed=1234, mode=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------ | ------------------------------------------------------------ | -------- | +| 
seed | 随机数种子。参数示例:seed=1000。默认值为:1234。 | 否 | +| mode | 确定性计算模式。可配置True或False。参数示例:mode=True。默认为False。
即使在相同的硬件和输入下,API多次执行的结果也可能不同,开启确定性计算是为了保证在相同的硬件和输入下,API多次执行的结果相同。
确定性计算会导致API执行性能降低,建议在发现模型多次执行结果不同的情况下开启。
rnn类算子、ReduceSum、ReduceMean等算子可能与确定性计算存在冲突,若开启确定性计算后多次执行的结果不相同,则考虑存在这些算子。 | 否 | + +**函数示例** + +seed_all函数的随机数种子,取默认值即可,无须配置;第二个参数默认关闭,不开启确定性计算时也无须配置。 + +- 示例1:仅固定随机数,不开启确定性计算 + + ```python + seed_all() + ``` + +- 示例2:固定随机数,开启确定性计算 + + ```python + seed_all(mode=True) + ``` + +**固定随机数范围** + +seed_all函数可固定随机数的范围如下表。 + +| API | 固定随机数 | +| ---------------------------------------- | --------------------------- | +| os.environ['PYTHONHASHSEED'] = str(seed) | 禁止Python中的hash随机化 | +| random.seed(seed) | 设置random随机生成器的种子 | +| np.random.seed(seed) | 设置numpy中随机生成器的种子 | +| torch.manual_seed(seed) | 设置当前CPU的随机种子 | +| torch.cuda.manual_seed(seed) | 设置当前GPU的随机种子 | +| torch.cuda.manual_seed_all(seed) | 设置所有GPU的随机种子 | +| torch_npu.npu.manual_seed(seed) | 设置当前NPU的随机种子 | +| torch_npu.npu.manual_seed_all(seed) | 设置所有NPU的随机种子 | +| torch.backends.cudnn.enable=False | 关闭cuDNN | +| torch.backends.cudnn.benchmark=False | cuDNN确定性地选择算法 | +| torch.backends.cudnn.deterministic=True | cuDNN仅使用确定性的卷积算法 | + +需要保证CPU或GPU以及NPU的模型输入完全一致,dump数据的比对才有意义,seed_all并不能保证模型输入完全一致,如下表所示场景需要用户自行保证输入的一致性。 + +| 场景 | 固定方法 | +| --------------- | ------------- | +| 数据集的shuffle | 关闭shuffle。 | +| dropout | 关闭dropout。 | + +关闭shuffle示例: + +```python +train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size = batch_size, + shuffle = False, + num_workers = num_workers +) +``` + +关闭dropout示例: + +```python +torch.nn.functional.dropout(input, p = 0) +``` + +将所有包含dropout的代码设置p = 0,或者可以将所有包含dropout的代码注释。 + +### set_dump_path + +**功能说明** + +设置dump数据目录。建议在seed_all函数之后调用且需要保证训练进程能够调用该函数;多卡时须保证每个进程都能调用该函数。 + +dump操作必选。 + +**函数原型** + +```python +set_dump_path(fpath=None, dump_tag='ptdbg_dump') +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------- | ------------------------------------------------------------ | -------- | +| fpath | 设置dump数据目录路径。参数示例:'./dump_path'。dump_path须为已存在目录。
默认在指定的dump_path路径下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当set_dump_switch函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。 | 是 | +| dump_tag | 设置dump数据目录名称。参数示例:dump_tag='dump_conv2d'。默认dump数据目录命名为ptdbg_dump_{version}。
{version}为当前安装ptdbg_ascend工具版本。目录结构参见“**dump数据存盘说明**”。
配置该参数会将生成的`ptdbg_dump_{version}`目录名称变更为dump_tag配置的值,如`dump_conv2d_{version}`。 | 否 | + +**函数示例** + +- 示例1:设置dump数据目录路径 + + ```python + set_dump_path('./dump_path') + ``` + +- 示例2:设置dump数据目录名称 + + ```python + set_dump_path('./dump_path', dump_tag='dump_conv2d') + ``` + + +若以相同的dump数据目录多次dump,则会因同名导致覆盖;多次dump建议配置不同的dump_tag。 + +### register_hook + +**功能说明** + +注册工具钩子函数。在set_dump_path之后调用。 + +dump操作必选。 + +**函数原型** + +```python +register_hook(model, hook, overflow_nums=overflow_nums, dump_mode=dump_mode, dump_config=dump_config_file, rank=0) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------- | ------------------------------------------------------------ | -------- | +| model | model对象。 | 是 | +| hook | 注册工具的dump和溢出检测钩子。可取值overflow_check和acc_cmp_dump,二选一。 | 是 | +| overflow_nums | 控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止。 | 否 | +| dump_mode | 控制针对溢出API的dump模式。可取值"api"或"acl",配置acl时表示dump ACL级别的溢出数据,此时set_dump_path参数不生效,dump数据目录由dump_config的.json文件配置,参数示例:dump_mode="acl"。默认不配置,即dump API级别的溢出数据。 | 否 | +| dump_config | acl dump的配置文件。dump_mode="acl"时,该参数必选;dump_mode="api"时,该参数不选。参数示例:dump_config='./dump.json'。 | 否 | +| rank | 控制dump数据保存的rank目录名称。参数示例:rank=1。默认不配置,即自动读取dump数据所属的卡并保存在该卡对应的rank目录下。目录结构参见“**dump数据存盘说明**”。
多卡情况下,可能出现工具识别rank出错,导致dump数据保存到错误的rank目录下,此时需要根据“**[rank_id获取方法](https://gitee.com/ascend/tools/tree/master/ptdbg_ascend/doc/rank_id获取方法.md)**”配置该参数,以获取正确的rank_id;工具可正确识别rank_id时无须配置该参数。 | 否 | + +**函数示例** + +- 示例1:注册工具钩子函数 + + ```python + register_hook(model, acc_cmp_dump) + ``` + +- 示例2:dump指定API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + ``` + + 需要配置set_dump_switch的mode="acl"以及scope指定为前向或反向API,请参见“**set_dump_switch”**的示例。 + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置dump数据目录。 + +- 示例3:溢出检测dump + + ```python + register_hook(model, overflow_check, overflow_nums=3) + ``` + + dump执行时会在set_dump_path的fpath参数指定的目录下生成ptdbg_dump_{version}目录,保存溢出数据。 + + 多卡场景时,需要检测到至少有一张卡溢出次数达到overflow_nums时,训练结束。 + + 仅支持NPU环境。 + +- 示例4:dump指定API的ACL级别溢出数据 + + ```python + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + ``` + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置溢出数据目录。 + + 仅支持NPU环境。 + +### set_dump_switch + +**功能说明** + +设置dump范围。建议在register_hook函数之后的脚本内任意位置插入,但进行精度问题排查建议参照“场景化示例 > 单卡场景精度比对”章节的顺序,先从第一个迭代开始的位置调用并dump整网数据。 + +dump操作必选。 + +**函数原型** + +```python +set_dump_switch(switch, mode='all', scope=[], api_list=[], filter_switch='ON', dump_mode='all') +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| --------------- | ------------------------------------------------------------ | -------- | +| switch | dump开关。可取值"ON"或"OFF"。须在选定dump开始的位置配置set_dump_switch("ON");dump结束的位置设置set_dump_switch("OFF"),不设置OFF则表示dump从set_dump_switch("ON")开始的所有数据。 | 是 | +| mode | dump模式。可取值"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为空。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。 | 否 | +| scope或api_list | dump范围。根据model配置的模式选择dump的API范围。参数示例:scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward"])、api_list=["relu"]。默认为空。 | 否 | +| filter_switch | 开启dump 
bool和整型的tensor以及浮点、bool和整型的标量。可取值"ON"或"OFF"。参数示例:filter_switch="OFF"。默认不配置,即filter_switch="ON",表示不dump上述数据。 | 否 | +| dump_mode | dump数据过滤。可取值“all”、“forward”和“backward”,表示仅保存dump的数据中文件名包含“forward”或“backward”的前向或反向.npy文件。参数示例dump_mode='backward'。默认为all,即保存所有dump的数据。 | 否 | + +**推荐配置** + +```python +set_dump_switch("ON", mode="api_stack", filter_switch="OFF") +``` + +开启dump数据和堆栈模式,同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量。 + +**函数示例** + +set_dump_switch可配置多中dump模式,示例如下: + +说明:以下均以dump部分API数据为例,API名可以从首次dump整网数据的结果csv文件中的NPU Name或Bench Name列获取。 + +- 示例1:dump指定API列表 + + ```python + set_dump_switch("ON", mode="list", scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward", "Torch_relu_3_backward"]) + ``` + +- 示例2:dump指定范围 + + ```python + set_dump_switch("ON", mode="range", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]) + ``` + +- 示例3:STACK模式,只dump堆栈信息 + + ```python + set_dump_switch("ON", mode="stack", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]) + ``` + +- 示例4:dump指定前向API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Tensor_permute_1_forward"]) + ``` + + 需要配置register_hook的dump_mode='acl'和dump_config配置文件。 + +- 示例4:dump指定反向API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) + set_backward_input(["./npu_dump/dump_conv2d_v2.0/rank0/dump/Functional_conv2d_1_backward_input.0.npy"]) + ``` + + 需要配置register_hook的dump_mode='acl'和dump_config配置文件,并通过set_backward_input设置反向API输入的.npy文件。 + +- 示例5:dump指定某一类API的API级别输入输出数据 + + ```python + set_dump_switch("ON", mode="api_list", api_list=["relu"]) + ``` + + mode="api_list"时不配置scope。 + +- 示例6:dump全部API级别输入输出数据以及相应堆栈信息 + + ```python + set_dump_switch("ON", mode="api_stack") + ``` + + mode="api_stack"时不配置scope。 + +- 示例7: 
dump全部API级别输入输出数据并包含bool和整型的tensor以及浮点、bool和整型的标量,默认不配置为ON,会过滤bool和整型数据 + + ```python + set_dump_switch("ON", filter_switch="OFF") + ``` + + 配置filter_switch="OFF"同时也可以配置mode、scope和api_list,除dump ACL级别数据。 + +- 示例8:仅保存dump的数据文件名包含“backward”的反向.npy文件 + + ```python + set_dump_switch("ON", dump_mode="backward") + ``` + + +以上示例均不set_dump_switch("OFF"),表示从set_dump_switch("ON")插入的位置开始到整体训练结束均进行示例中配置的范围dump;若在脚本中插入set_dump_switch("OFF"),则dump操作在此结束。 + +### set_overflow_check_switch + +**功能说明** + +置溢出检测范围。默认不配置该函数,全量进行溢出检测。 + +仅支持NPU环境。 + +**函数原型** + +```python +set_overflow_check_switch(switch, filter_switch='ON') +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------- | ------------------------------------------------------------ | -------- | +| switch, | 检测开关。可取值"ON"或"OFF"。如果只在特定的step溢出检测,则在期望溢出检测的step位置开始前插入set_overflow_check_switch("ON"),在step结束的位置插入set_overflow_check_switch("OFF")。 | 是 | +| filter_switch | 开启dump bool和整型的tensor以及浮点、bool和整型的标量。可取值"ON"或"OFF"。参数示例:filter_switch="OFF"。默认不配置,即filter_switch="ON",表示不dump上述数据。 | 否 | + +**函数示例** + +- 示例1:指定范围溢出检测 + + ```python + register_hook(model, overflow_check) + set_overflow_check_switch("ON") + + ... + + set_overflow_check_switch("OFF") + ``` + + 该场景set_dump_path不生效,dump执行时会在当前目录自动生成ptdbg_dump_{version}目录,保存溢出数据。 + +- 示例2:前向API的ACL级别范围溢出检测 + + ```python + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + set_overflow_check_switch("ON") + + ... 
+ + set_overflow_check_switch("OFF") + ``` + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置溢出数据目录。 + +### set_backward_input + +**功能说明** + +设置反向ACL级别dump时需要的反向输入的.npy文件。 + +**函数原型** + +```python +set_backward_input(backward_input) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional_conv2d_1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional_conv2d_1、backward和input字段的.npy文件。 | 是 | + +**函数示例** + +```python +register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') +set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) +set_backward_input(["./npu_dump/dump_conv2d_v2.0/rank0/dump/Functional_conv2d_1_backward_input.0.npy"]) +``` + +### dump.json配置文件说明 + +**dump.json配置示例** + +```python +{ + "dump": + { + "dump_list":[], + "dump_path":"./dump/output", + "dump_mode":"all", + "dump_op_switch":"on" + } +} +``` + +**dump.json参数说明** + +| 字段名 | 说明 | +| -------------- | ------------------------------------------------------------ | +| dump_list | 待dump数据的API模型。为空,无需配置。 | +| dump_path | dump数据文件存储到运行环境的目录,主要用于指定ACL dump数据路径。支持配置绝对路径或相对路径。dump_path须为已存在目录。 | +| dump_mode | dump数据模式,配置如下:
- output:dump API的输出数据。默认值。
- input:dump API的输入数据。
- all:dump API的输入、输出数据。 | +| dump_op_switch | 单API模型dump数据开关,配置如下: * off:关闭单API模型dump,默认值。 * on:开启单API模型dump。 | + +**dump目录说明** + +配置register_hook的dump_config后,采集的dump数据会在{dump_path}/{time}/{deviceid}/{model_id}目录下生成,例如“/home/HwHiAiUser/output/20200808163566/0/0” + +```bash +├── 20230131172437 +│   └── 1 +│   ├── 0 +│   │   ├── Add.Add.45.0.1675157077183551 +│   │   ├── Cast.trans_Cast_0.31.0.1675157077159449 +│   │   ├── Cast.trans_Cast_5.43.0.1675157077180129 +│   │   ├── MatMul.MatMul.39.0.1675157077172961 +│   │   ├── Mul.Mul.29.0.1675157077155731 +│   │   ├── NPUAllocFloatStatus.NPUAllocFloatStatus.24.0.1675157077145262 +│   │   ├── TransData.trans_TransData_1.33.0.1675157077162791 +│   │   └── TransData.trans_TransData_4.41.0.1675157077176648 +│   ├── 1701737061 +│   │   └── Cast.trans_Cast_2.35.0.1675157077166214 +│   ├── 25 +│   │   └── NPUClearFloatStatus.NPUClearFloatStatus.26.0.1675157077150342 +│   └── 68 +│   └── TransData.trans_TransData_3.37.0.1675157077169473 +``` + +### dump数据存盘说明 + +dump结果目录结构示例如下: + +```bash +├── dump_path +│ └── ptdbg_dump_{version} +│ ├── rank0 +│ │ ├── dump +| | | ├── Tensor_permute_1_forward.npy +| | | ... +| | | └── Fcuntion_linear_5_backward_output.npy +│ │ └── dump.pkl +│ ├── rank1 +| | ├── dump +| | | └── ... +| | └── dump.pkl +│ ├── ... 
+│ | +| └── rank7 +``` + +其中ptdbg_dump_{version}为未设置set_dump_path的dump_tag参数时的默认命名;rank为设备上各卡的ID,每张卡上dump的数据会生成对应dump目录,可由register_hook函数的rank参数控制rank目录名称。 + +**精度比对dump场景** + +精度比对dump场景的结果如下: + +* dump.pkl文件:包含dump数据的API名称、dtype、 shape以及各数据的max、min、mean统计信息。 + +* dump目录:目录下为npy格式的dump数据。 + + npy文件保存的前缀和PyTorch对应关系如下 + + | 前缀 | Torch模块 | + | ---------- | ------------------- | + | Tensor | torch.Tensor | + | Torch | torch | + | Functional | torch.nn.functional | + | NPU | NPU亲和算子 | + | VF | torch._VF | + +当set_dump_switch配置mode参数(例如:mode="api_stack" )时,dump结果的文件名会添加api_stack前缀,dump结果如下: + +* api_stack_dump.pkl +* api_stack_dump目录 + +**溢出检测dump场景** + +register_hook设置了overflow_check时,检测API溢出,dump结果的文件名固定为Overflow_info_{timestamp},dump结果如下: + +* Overflow_info_{timestamp}.pkl +* Overflow_info_{timestamp}目录 + +## CPU或GPU与NPU精度数据比对 + +### 总体说明 + +- 本节主要介绍CPU或GPU与NPU精度数据比对的函数以及示例。 + +- 比对函数均通过单独创建精度比对脚本执行,可支持单卡和多卡场景的精度数据比对。 + +- 工具性能:比对数据量较小时(参考值单份文件小于10GB),参考比对速度0.1GB/s;比对数据量较大时,参考比对速度0.3GB/s。 + 推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。 + + 用户环境性能弱于标准约束或非独占使用的比对速度酌情向下浮动。比对速度的计算方式:两份比对文件大小/比对耗时。 + +### 约束 + +- NPU自研API,在CPU或GPU若没有对应的API,该API的dump数据不比对。 + +- NPU与CPU或GPU的计算结果误差可能会随着模型的执行不断累积,最终会出现同一个API因为输入的数据差异较大而无法比对的情况。 + +- CPU或GPU与NPU中两个相同的API会因为调用次数不同导致无法比对或比对到错误的API,不影响整体运行,该API忽略。 + +### compare_distributed + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,支持单卡和多卡,可同时比对多卡的dump数据。多机场景需要每个设备单独执行比对操作。可自动检索和匹配对应卡和进程所dump的数据文件,再调用compare进行比对。单机单卡时与compare函数二选一。 + +**函数原型** + +```python +compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| npu_dump_dir | 配置NPU环境下的dump目录,即set_dump_path函数的dump_tag参数对应的目录名称。参数示例:'./npu_dump/dump_conv2d_v2.0'。 | 是 | +| bench_dump_dir | 配置CPU、GPU或NPU环境下的dump目录,即set_dump_path函数的dump_tag参数对应的目录名称。参数示例:'./gpu_dump/dump_conv2d_v2.0'。 | 是 | +| output_path | 
配置比对结果csv文件存盘目录。需要预先创建output_path目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_rank{npu_ID}-rank{cpu/gpu/npu_ID}_{timestamp}.csv`。 | 是 | +| **kwargs | 支持compare的所有可选参数。 | 否 | + +**函数示例** + +创建比对脚本,例如compare_distributed.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +compare_distributed('./npu_dump/ptdbg_dump_v2.0', './gpu_dump/ptdbg_dump_v2.0', './output') +``` + +### compare + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,仅支持单机单卡。 + +**函数原型** + +```python +compare(input_param, output_path, stack_mode=False, auto_analyze=True, suffix='', fuzzy_match=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------ | ------------------------------------------------------------ | -------- | +| input_param | 配置dump数据文件及目录。配置参数包括:
- "npu_pkl_path":指定NPU dump目录下的.pkl文件。参数示例:"npu_pkl_path": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl"。必选。
- "bench_pkl_path":指定CPU、GPU或NPU dump目录下的.pkl文件。参数示例:"bench_pkl_path": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl"。必选。
- "npu_dump_data_dir":"指定NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump"。必选。
- "bench_dump_data_dir":"指定CPU、GPU或NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump"。必选。
- "is_print_compare_log":配置是否开启日志打屏。可取值True或False。可选。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.csv`。 | 是 | +| stack_mode | 配置stack_mode的开关。仅当dump数据时配置set_dump_switch的mode="api_stack"时需要开启。参数示例:stack_mode=True,默认为False。 | 否 | +| auto_analyze | 自动精度分析,开启后工具自动针对比对结果进行分析,识别到第一个精度不达标节点(在比对结果文件中的“Accuracy Reached or Not”列显示为No),并给出问题可能产生的原因(打屏展示并生成advisor_{timestamp}.txt文件)。可取值True或False,参数示例:auto_analyze=False,默认为True。 | 否 | +| suffix | 标识比对结果的文件名。配置的suffix值在比对结果文件名的compare_result和{timestamp}中间插入,例如:`compare_result_{suffix}_{timestamp}`。默认为空。 | 否 | +| fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。可取值True或False,参数示例:fuzzy_match=True,默认为False。 | 否 | + +**函数示例** + +单机单卡场景下创建比对脚本,例如compare.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +dump_result_param={ +"npu_pkl_path": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl", +"bench_pkl_path": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl", +"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump", +"bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump", +"is_print_compare_log": True +} +compare(dump_result_param, "./output", stack_mode=True) +``` + +### parse + +parse 。取值为:
* 第一个参数指定dump数据文件中的pkl文件名。参数示例:"./npu_dump/ptdbg_dump_v2.0/rank0/dump.pkl"。必选。
* 第二个参数指定待提取的API接口前缀。参数示例:"Torch_norm_1_forward"。必选。
仅NPU环境支持。 + +**功能说明** + +提取dump信息中的堆栈信息及数据统计信息 + +**函数原型** + +```python +parse(pkl_file, moudule_name_prefix) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------------- | ------------------------------------------------------------ | -------- | +| pkl_file | 指定dump数据文件中的pkl文件名。参数示例:"./npu_dump/ptdbg_dump_v2.0/rank0/dump.pkl"。 | 是 | +| moudule_name_prefix | 指定待提取的API接口前缀。参数示例:"Torch_norm_1_forward"。 | 是 | + +**函数示例** + +创建堆栈信息及数据统计信息提取脚本,例如parse.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +parse("./npu_dump/ptdbg_dump_v2.0/rank0/dump.pkl", "Torch_batch_normal_1_forward") +``` + +### 计算精度评价指标 + +PyTorch精度比对是以CPU或GPU的计算结果为标杆,计算Cosine(余弦相似度)和MaxAbsErr(最大绝对误差),根据这两个结果判断API在运行时是否存在精度问题。 + +计算精度评价指标: + +1. Cosine:通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0。 +2. MaxAbsError:当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001。 + +精度比对结果csv文件中只需要通过Accuracy Reached or Not来判断计算精度是否达标,判断标准如下: + +1. Cosine < 0.99 且 MaxAbsError > 0.001时,精度不达标,标记为“No”。 +2. Cosine < 0.9,精度不达标,标记为“No”。 +3. MaxAbsError > 1,精度不达标,标记为“No”。 +4. 
其余情况下记为精度达标,标记为“Yes”。 + +## FAQ + +[FAQ](https://gitee.com/ascend/tools/tree/master/ptdbg_ascend/doc/FAQ.md) diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v3.1.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v3.1.md" new file mode 100644 index 0000000000000000000000000000000000000000..6cb11037160555f0bec3a362bf161ddc04155edc --- /dev/null +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v3.1.md" @@ -0,0 +1,999 @@ +# **PyTorch精度工具使用指南** + +本文主要介绍PyTorch精度工具精度工具ptdbg_ascend的使用以及精度比对场景示例。 + +ptdbg_ascend工具的原理及安装请参见《[PyTorch精度工具](https://gitee.com/ascend/tools/blob/master/ptdbg_ascend/README.md)》。 + +## PyTorch精度比对总体流程 + +1. 准备CPU或GPU训练工程。 + +2. 在环境下安装ptdbg_ascend工具。 + +3. 在训练脚本内插入ptdbg_ascend工具dump接口。 + +4. 执行训练dump数据。 + +5. 将CPU或GPU训练工程迁移为NPU训练工程。 + + 请参见《[PyTorch模型迁移和训练指南](https://www.hiascend.com/document/detail/zh/canncommercial/63RC1/modeldevpt/ptmigr/ptmigr_0001.html)》。 + +6. 在NPU环境下安装ptdbg_ascend工具。 + +7. 在NPU训练脚本内插入ptdbg_ascend工具dump接口。 + +8. NPU环境下执行训练dump数据。 + +9. 创建并配置精度比对脚本,例如compare.py。 + +10. 执行CPU或GPU dump与NPU dump数据的精度比对。 + +11. 比对结果分析。 + +## 场景化示例 + +本章节主要介绍通过ptdbg_ascend工具进行精度比对和分析,主要使用“**CPU或GPU及NPU精度数据dump**”和“**CPU或GPU与NPU精度数据比对**”章节中介绍的ptdbg_ascend工具接口。 + +### 单卡场景精度比对 + +**精度分析建议** + +PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: + +1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 +2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 +3. 范围比对:对不符合精度标准的API重新dump。 +4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 +5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 +6. 重复1~5步,直到不存在精度问题为止。 + +**精度分析示例** + +1. 
dump整网数据。 + + 分别dump CPU或GPU以及NPU数据,在PyTorch训练脚本插入dump接口,示例代码如下(下面以NPU为例,CPU或GPU dump基本相同): + + ```python + from ptdbg_ascend import * + + # 在main函数开始前固定随机数 + seed_all() + + # 配置dump数据目录路径和名称 + set_dump_path("./npu_dump", dump_tag='all') + + # 注册dump回调函数 + register_hook(model, acc_cmp_dump) + + ... + + # 在第一个迭代开始的位置开启dump和堆栈模式,同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="api_stack", filter_switch="OFF") + + ... + + # 在第一个迭代结束的位置关闭dump + set_dump_switch("OFF") + ``` + +2. 比对整网数据。 + + 第1步中的NPU dump数据文件为npu_dump.pkl,假设NPU dump npy数据目录为npu_dump,GPU dump数据文件为gpu_dump.pkl,GPU dump npy数据目录为gpu_dump。 + + 创建并配置精度比对脚本,以创建compare.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + dump_result_param={ + "npu_pkl_path": "./npu_dump/all_v2.0/rank0/api_stack_dump.pkl", + "bench_pkl_path": "./gpu_dump/all_v2.0/rank0/api_stack_dump.pkl", + "npu_dump_data_dir": "./npu_dump/all_v2.0/rank0/api_stack_dump", + "bench_dump_data_dir": "./gpu_dump/all_v2.0/rank0/api_stack_dump", + "is_print_compare_log": True + } + compare(dump_result_param, "./output") + ``` + + 执行比对: + + ```bash + python3 compare.py + ``` + + 在output目录下生成结果文件,包括:`compare_result_{timestamp}.csv`和`advisor_{timestamp}.txt` + +3. 找出存在问题的API。 + + 1. 根据`advisor_{timestamp}.txt`或打屏信息的提示,可找到存在精度问题的算子(Suspect Nodes)和专家建议(Expert Advice) + + ![auto_analyze_log](img/auto_analyze_log.png) + + 2. 根据第2步结果文件`compare_result_{timestamp}.csv`中的Accuracy Reached or No字段显示为NO的API,针对该API执行后续比对操作,分析该API存在的精度问题。 + +4. (可选)提取指定API的堆栈信息和dump数据统计信息。 + + 通过parse接口可以清晰的显示特定API的堆栈信息和dump数据统计信息,结合堆栈信息分析代码中可能存在的精度问题。 + + 创建并配置提取脚本,以创建parse.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + + # 提取dump信息中第1次调用的API:Torch_batch_normal的堆栈信息及数据统计信息 + parse("./npu_dump/all_v2.0/rank0/api_stack_dump.pkl", "Torch_batch_normal_1_forward") + ``` + + 执行提取: + + ```bash + python3 parse.py + ``` + + + +5. 
(可选)指定API dump数据。 + + - dump指定前向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + + # 固定随机数,开启确定性计算 + seed_all(mode=True) + set_dump_path("./dump_path", dump_tag='forward') + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + + # dump指定前向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="acl", scope=["Tensor_permute_1_forward"], filter_switch="OFF") + + ... + + set_dump_switch("OFF") + ``` + + - dump指定反向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + + # 固定随机数,开启确定性计算 + seed_all(mode=True) + set_dump_path("./dump_path", dump_tag='backward') + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + + # dump指定反向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"], filter_switch="OFF") + set_backward_input(["./npu_dump/all_v2.0/rank0/api_stack_dump/Functional_conv2d_1_backward_input.0.npy"]) + + ... + + set_dump_switch("OFF") + ``` + +6. (可选)重新比对。 + + 根据第4或5步的dump数据重新配置compare.py并执行比对,可以对单API模型进行问题复现。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 + +### 多卡场景精度比对 + +精度工具支持多卡场景的精度比对,多卡场景的dump步骤与单卡场景完全一致,请参见“**单卡场景精度比对**”章节,不同的是多卡数据精度比对时需要使用“compare_distributed”函数进行比对。如下示例: + +说明:多机多卡场景需要每个设备单独执行比对操作。 + +假设NPU dump npy数据目录为npu_dump/dump_conv2d_v1.0,GPU dump npy数据目录为gpu_dump/dump_conv2d_v1.0。 + +1. 创建比对脚本,例如compare_distributed.py,拷贝如下代码。 + + ```python + from ptdbg_ascend import * + compare_distributed('./npu_dump/ptdbg_dump_v2.0', './gpu_dump/ptdbg_dump_v2.0', './output') + ``` + +2. 
执行比对: + + ```bash + python3 compare_distributed.py + ``` + +两次运行须用相同数量的卡,传入`compare_distributed`的两个文件夹下须有相同个数的rank文件夹,且不包含其他无关文件,否则将无法比对。 + +**多卡set_dump_path注意事项** + +多卡一般为多进程,须保证每个进程都正确调用set_dump_path,或把set_dump_path插入到import语句后,如: + +```python +from ptdbg_ascend import * +seed_all() +set_dump_path('./dump_resnet') +``` + +如此可保证set_dump_path在每个进程都被调用。 + +**多卡register_hook注意事项** + +register_hook需要在set_dump_path之后调用,也需要在每个进程上被调用,建议在搬运模型数据到卡之后调用。识别方法如下: + +- 找到训练代码中遍历epoch的for循环或遍历数据集的for循环,把register_hook放到循环开始前即可。 +- 找到训练代码中调用DDP或者DistributedDataParallel的代码行,把register_hook放到该代码行所在的代码块之后。 +- 若代码中均无以上两种情况,需要保证register_hook在模型定义之后插入,并配置rank参数。rank参数获取rank_id请参见“**[rank_id获取方法](https://gitee.com/ascend/tools/tree/master/ptdbg_ascend/doc/rank_id获取方法.md)**”。 + +### NPU vs NPU精度比对 + +对于NPU vs NPU场景,是针对同一模型,进行迭代(模型、API版本升级或设备硬件升级)时存在的精度下降问题,对比相同模型在迭代前后版本的API计算数值,进行问题定位。 + +一般情况下迭代涉及NPU自定义算子,因此,可以仅dump NPU自定义算子进行比对。比对精度问题分析请参见“**单卡场景精度比对**”章节。 + +工具当前支持dump NPU自定义算子如下: + +| 序号 | NPU自定义算子 | +| :--- | ----------------------------------- | +| 1 | torch_npu.one_ | +| 2 | torch_npu.npu_sort_v2 | +| 3 | torch_npu.npu_transpose | +| 4 | torch_npu.npu_broadcast | +| 5 | torch_npu.npu_dtype_cast | +| 6 | torch_npu.empty_with_format | +| 7 | torch_npu.npu_one_hot | +| 8 | torch_npu.npu_stride_add | +| 9 | torch_npu.npu_ps_roi_pooling | +| 10 | torch_npu.npu_roi_align | +| 11 | torch_npu.npu_nms_v4 | +| 12 | torch_npu.npu_iou | +| 13 | torch_npu.npu_nms_with_mask | +| 14 | torch_npu.npu_pad | +| 15 | torch_npu.npu_bounding_box_encode | +| 16 | torch_npu.npu_bounding_box_decode | +| 17 | torch_npu.npu_batch_nms | +| 18 | torch_npu.npu_slice | +| 19 | torch_npu._npu_dropout | +| 20 | torch_npu.npu_indexing | +| 21 | torch_npu.npu_ifmr | +| 22 | torch_npu.npu_max | +| 23 | torch_npu.npu_scatter | +| 24 | torch_npu.npu_layer_norm_eval | +| 25 | torch_npu.npu_alloc_float_status | +| 26 | torch_npu.npu_get_float_status | +| 27 | torch_npu.npu_clear_float_status | +| 28 | 
torch_npu.npu_confusion_transpose | +| 29 | torch_npu.npu_bmmV2 | +| 30 | torch_npu.fast_gelu | +| 31 | torch_npu.npu_sub_sample | +| 32 | torch_npu.npu_deformable_conv2d | +| 33 | torch_npu.npu_mish | +| 34 | torch_npu.npu_anchor_response_flags | +| 35 | torch_npu.npu_yolo_boxes_encode | +| 36 | torch_npu.npu_grid_assign_positive | +| 37 | torch_npu.npu_normalize_batch | +| 38 | torch_npu.npu_masked_fill_range | +| 39 | torch_npu.npu_linear | +| 40 | torch_npu.npu_bert_apply_adam | +| 41 | torch_npu.npu_giou | +| 42 | torch_npu.npu_ciou | +| 43 | torch_npu.npu_ciou_backward | +| 44 | torch_npu.npu_diou | +| 45 | torch_npu.npu_diou_backward | +| 46 | torch_npu.npu_sign_bits_pack | +| 47 | torch_npu.npu_sign_bits_unpack | + +### 溢出检测场景 + +溢出检测是针对NPU的PyTorch API,检测是否存在溢出的情况。当前仅支持识别aicore浮点溢出。 + +溢出检测原理:针对溢出阶段,开启acl dump模式,重新对溢出阶段执行,落盘数据。 + +建议按照如下步骤操作: + +1. 在NPU环境下安装ptdbg_ascend工具。 + +2. 在NPU训练脚本内插入ptdbg_ascend工具溢出检测接口。 + + - 示例1:全量溢出检测 + + ```python + from ptdbg_ascend import * + seed_all() + ... + # 设置检测到3次溢出后退出训练 + register_hook(model, overflow_check, overflow_nums=3) + + ... + ``` + + 多卡使用时各卡单独计算溢出次数。 + + - 示例2:dump指定API的ACL级别溢出数据 + + ```python + from ptdbg_ascend import * + seed_all() + ... + # dump指定API的ACL级别溢出数据 + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + + # 在期望溢出检测的step位置开始前打开溢出检测开关 + set_overflow_check_switch("ON") + + ... + + # 在step结束的位置关闭溢出检测开关 + set_overflow_check_switch("OFF") + + ... + ``` + + - 示例3:dump指定反向API的ACL级别的溢出数据 + + 1. 进行全量溢出检测 + + ```python + from ptdbg_ascend import * + seed_all() + ... + # 设置检测到3次溢出后退出训练 + register_hook(model, overflow_check) + + ... + ``` + + 2. dump指定反向API的ACL级别的溢出数据 + + ```python + from ptdbg_ascend import * + seed_all() + ... 
+ # dump指定反向API的ACL级别溢出数据 + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) + set_backward_input(["./npu_dump/ptdbg_dump_v2.0/rank0/dump/Functional_conv2d_1_backward_input.0.npy"]) + ``` + + 针对前向溢出API,可以通过overflow_nums,配置允许的溢出次数,并将每次溢出API的全部ACL数据dump下来,到达指定溢出次数后停止,停止后会看到堆栈打印包含如下字段。 + + ```bash + ValueError: [overflow xxx times]: dump file is saved in 'xxxxx.pkl'. + ``` + + 其中xxx times为用户设置的次数,xxxxx.pkl为文件生成路径。 + +3. NPU环境下执行训练dump溢出数据。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 + +## CPU或GPU及NPU精度数据dump + +### 总体说明 + +- 本节主要介绍CPU或GPU及NPU精度数据dump所需要的函数以及示例。 + +- ptdbg_ascend工具默认情况下仅dump PyTorch模型的API输入输出数据进行精度比对,若在比对结果中发现某个API下可能存在ACL的精度问题,那么可以选择dump该API的ACL级别数据进行精度分析。 + +- 某些torch api的输出不是Tensor类型的数据。对于此类API的反向过程进行ACL dump,工具会在运行日志中给出对应的Warning(is not of tensor type and cannot be automatically derived)提示。如若想要进行该类API反向ACL dump,可以通过手动构建单API用例的方式进行ACL dump,具体用例可参见“**[反向ACL dump用例说明](https://gitee.com/ascend/tools/blob/master/ptdbg_ascend/doc/%E5%8F%8D%E5%90%91ACL%20dump%E7%94%A8%E4%BE%8B%E8%AF%B4%E6%98%8E.md)**”。 + +- 工具性能:dump数据量较小时(小于5G),参考dump速度0.1GB/s;dump数据量较大时,参考dump速度0.2GB/s。 + 推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。 + + 用户环境性能弱于标准约束或非独占使用的比对速度酌情向下浮动。Dump速度的计算方式:Dump数据量/(单个step添加Dump耗时-原始单个step耗时)。 + +### 约束 +- 进行CPU或GPU数据dump时,请安装torch包而非torch_npu包,避免工具无法识别使用场景,导致失败。 + +- TASK_QUEUE_ENABLE环境变量会导致API下发和执行异步进行,因此在ACL dump前需要将TASK_QUEUE_ENABLE关闭,即export TASK_QUEUE_ENABLE=0。 + +- 不建议在PyTorch训练脚本中同时添加dump接口和性能数据采集(如Ascend PyThon Profiler)接口,二者可能相互影响导致数据不准确。 + +### seed_all + +**功能说明** + +固定随机数。通过固定随机数保证模型的输入或输出一致。在训练主函数开始前调用,避免随机数固定不全。 + +dump操作必选。 + +**函数原型** + +```python +seed_all(seed=1234, mode=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------ | ------------------------------------------------------------ | -------- | +| 
seed | 随机数种子。参数示例:seed=1000。默认值为:1234。 | 否 | +| mode | 确定性计算模式。可配置True或False。参数示例:mode=True。默认为False。
即使在相同的硬件和输入下,API多次执行的结果也可能不同,开启确定性计算是为了保证在相同的硬件和输入下,API多次执行的结果相同。
确定性计算会导致API执行性能降低,建议在发现模型多次执行结果不同的情况下开启。
rnn类算子、ReduceSum、ReduceMean等算子可能与确定性计算存在冲突,若开启确定性计算后多次执行的结果不相同,则考虑存在这些算子。 | 否 | + +**函数示例** + +seed_all函数的随机数种子,取默认值即可,无须配置;第二个参数默认关闭,不开启确定性计算时也无须配置。 + +- 示例1:仅固定随机数,不开启确定性计算 + + ```python + seed_all() + ``` + +- 示例2:固定随机数,开启确定性计算 + + ```python + seed_all(mode=True) + ``` + +**固定随机数范围** + +seed_all函数可固定随机数的范围如下表。 + +| API | 固定随机数 | +| ---------------------------------------- | --------------------------- | +| os.environ['PYTHONHASHSEED'] = str(seed) | 禁止Python中的hash随机化 | +| random.seed(seed) | 设置random随机生成器的种子 | +| np.random.seed(seed) | 设置numpy中随机生成器的种子 | +| torch.manual_seed(seed) | 设置当前CPU的随机种子 | +| torch.cuda.manual_seed(seed) | 设置当前GPU的随机种子 | +| torch.cuda.manual_seed_all(seed) | 设置所有GPU的随机种子 | +| torch_npu.npu.manual_seed(seed) | 设置当前NPU的随机种子 | +| torch_npu.npu.manual_seed_all(seed) | 设置所有NPU的随机种子 | +| torch.backends.cudnn.enable=False | 关闭cuDNN | +| torch.backends.cudnn.benchmark=False | cuDNN确定性地选择算法 | +| torch.backends.cudnn.deterministic=True | cuDNN仅使用确定性的卷积算法 | + +需要保证CPU或GPU以及NPU的模型输入完全一致,dump数据的比对才有意义,seed_all并不能保证模型输入完全一致,如下表所示场景需要保证输入的一致性。 + +| 场景 | 固定方法 | +| --------------- | ------------- | +| 数据集的shuffle | 关闭shuffle。 | +| dropout | 关闭dropout。 | + +关闭shuffle示例: + +```python +train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size = batch_size, + shuffle = False, + num_workers = num_workers +) +``` + +关闭dropout: + +在使用from ptdbg import *后,工具会自动将torch.nn.functional.dropout、torch.nn.functional.dropout2d、torch.nn.functional.dropout3d、torch.nn.Dropout、torch.nn.Dropout2d、torch.nn.Dropout3d的接口参数p置为0。 + +### set_dump_path + +**功能说明** + +设置dump数据目录。建议在seed_all函数之后调用且需要保证训练进程能够调用该函数;多卡时须保证每个进程都能调用该函数。 + +dump操作必选。 + +**函数原型** + +```python +set_dump_path(fpath=None, dump_tag='ptdbg_dump') +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------- | ------------------------------------------------------------ | -------- | +| fpath | 设置dump数据目录路径。参数示例:'./dump_path'。dump_path须为已存在目录。
默认在指定的dump_path路径下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当set_dump_switch函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。 | 是 | +| dump_tag | 设置dump数据目录名称。参数示例:dump_tag='dump_conv2d'。默认dump数据目录命名为ptdbg_dump_{version}。
{version}为当前安装ptdbg_ascend工具版本。目录结构参见“**dump数据存盘说明**”。
配置该参数会将生成的`ptdbg_dump_{version}`目录名称变更为dump_tag配置的值,如`dump_conv2d_{version}`。 | 否 | + +**函数示例** + +- 示例1:设置dump数据目录路径 + + ```python + set_dump_path('./dump_path') + ``` + +- 示例2:设置dump数据目录名称 + + ```python + set_dump_path('./dump_path', dump_tag='dump_conv2d') + ``` + + +若以相同的dump数据目录多次dump,则会因同名导致覆盖;多次dump建议配置不同的dump_tag。 + +### register_hook + +**功能说明** + +注册工具钩子函数。在set_dump_path之后调用。 + +dump操作必选。 + +**函数原型** + +```python +register_hook(model, hook, overflow_nums=overflow_nums, dump_mode=dump_mode, dump_config=dump_config_file, rank=0) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------- | ------------------------------------------------------------ | -------- | +| model | model对象。 | 是 | +| hook | 注册工具的dump和溢出检测钩子。可取值overflow_check和acc_cmp_dump,二选一。 | 是 | +| overflow_nums | 控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止。 | 否 | +| dump_mode | 控制针对溢出API的dump模式。可取值"api"或"acl",配置acl时表示dump ACL级别的溢出数据,此时set_dump_path参数不生效,dump数据目录由dump_config的.json文件配置,参数示例:dump_mode="acl"。默认不配置,即dump API级别的溢出数据。 | 否 | +| dump_config | acl dump的配置文件。dump_mode="acl"时,该参数必选;dump_mode="api"时,该参数不选。参数示例:dump_config='./dump.json'。 | 否 | +| rank | 控制dump数据保存的rank目录名称。参数示例:rank=1。默认不配置,即自动读取dump数据所属的卡并保存在该卡对应的rank目录下。目录结构参见“**dump数据存盘说明**”。
多卡情况下,可能出现工具识别rank出错,导致dump数据保存到错误的rank目录下,此时需要根据“**[rank_id获取方法](https://gitee.com/ascend/tools/tree/master/ptdbg_ascend/doc/rank_id获取方法.md)**”配置该参数,以获取正确的rank_id;工具可正确识别rank_id时无须配置该参数。 | 否 | + +**函数示例** + +- 示例1:注册工具钩子函数 + + ```python + register_hook(model, acc_cmp_dump) + ``` + +- 示例2:dump指定API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + ``` + + 需要配置set_dump_switch的mode="acl"以及scope指定为前向或反向API,请参见“**set_dump_switch”**的示例。 + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置dump数据目录。 + +- 示例3:溢出检测dump + + ```python + register_hook(model, overflow_check, overflow_nums=3) + ``` + + dump执行时会在set_dump_path的fpath参数指定的目录下生成ptdbg_dump_{version}目录,保存溢出数据。 + + 多卡场景时,需要检测到至少有一张卡溢出次数达到overflow_nums时,训练结束。 + + 仅支持NPU环境。 + +- 示例4:dump指定API的ACL级别溢出数据 + + ```python + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + ``` + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置溢出数据目录。 + + 仅支持NPU环境。 + +### set_dump_switch + +**功能说明** + +设置dump范围。建议在register_hook函数之后的脚本内任意位置插入,但进行精度问题排查建议参照“场景化示例 > 单卡场景精度比对”章节的顺序,先从第一个迭代开始的位置调用并dump整网数据。 + +dump操作必选。 + +**函数原型** + +```python +def set_dump_switch(switch, mode="all", scope=[], api_list=[], filter_switch=Const.ON, dump_mode=["all"]): +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| --------------- | ------------------------------------------------------------ | -------- | +| switch | dump开关。可取值"ON"或"OFF"。须在选定dump开始的位置配置set_dump_switch("ON");dump结束的位置设置set_dump_switch("OFF"),不设置OFF则表示dump从set_dump_switch("ON")开始的所有数据。 | 是 | +| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为空。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。 | 否 | +| scope或api_list | dump范围。根据model配置的模式选择dump的API范围。参数示例:scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward"]、api_list=["relu"]。默认为空。 | 否 | +| filter_switch | 开启dump 
bool和整型的tensor以及浮点、bool和整型的标量。可取值"ON"或"OFF"。参数示例:filter_switch="OFF"。默认不配置,即filter_switch="ON",表示不dump上述数据。 | 否 | +| dump_mode | dump数据过滤。可取值“all”、“forward”、“backward”、input和output,表示仅保存dump的数据中文件名包含“forward”、“backward”、input或output的前向、反向、输入或输出的.npy文件。参数示例dump_mode=["backward"]或dump_mode=["forward", "backward"]。默认为all,即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。 | 否 | + +**推荐配置** + +```python +set_dump_switch("ON", mode="api_stack", filter_switch="OFF") +``` + +开启dump数据和堆栈模式,同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量。 + +**函数示例** + +set_dump_switch可配置多中dump模式,示例如下: + +说明:以下均以dump部分API数据为例,API名可以从首次dump整网数据的结果csv文件中的NPU Name或Bench Name列获取。 + +- 示例1:dump指定API列表 + + ```python + set_dump_switch("ON", mode="list", scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward", "Torch_relu_3_backward"]) + ``` + +- 示例2:dump指定范围 + + ```python + set_dump_switch("ON", mode="range", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]) + ``` + +- 示例3:STACK模式,只dump堆栈信息 + + ```python + set_dump_switch("ON", mode="stack", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]) + ``` + +- 示例4:dump指定前向API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Tensor_permute_1_forward"]) + ``` + + 需要配置register_hook的dump_mode='acl'和dump_config配置文件。 + +- 示例4:dump指定反向API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) + set_backward_input(["./npu_dump/dump_conv2d_v2.0/rank0/dump/Functional_conv2d_1_backward_input.0.npy"]) + ``` + + 需要配置register_hook的dump_mode='acl'和dump_config配置文件,并通过set_backward_input设置反向API输入的.npy文件。 + +- 示例5:dump指定某一类API的API级别输入输出数据 + + ```python + set_dump_switch("ON", mode="api_list", api_list=["relu"]) + ``` + + mode="api_list"时不配置scope。 + +- 示例6:dump全部API级别输入输出数据以及相应堆栈信息 + + ```python + set_dump_switch("ON", mode="api_stack") + 
``` + + mode="api_stack"时不配置scope。 + +- 示例7: dump全部API级别输入输出数据并包含bool和整型的tensor以及浮点、bool和整型的标量,默认不配置为ON,会过滤bool和整型数据 + + ```python + set_dump_switch("ON", filter_switch="OFF") + ``` + + 配置filter_switch="OFF"同时也可以配置mode、scope和api_list,除dump ACL级别数据。 + +- 示例8:仅保存dump的数据文件名包含“backward”的反向.npy文件 + + ```python + set_dump_switch("ON", dump_mode=["backward"]) + ``` + + +以上示例均不set_dump_switch("OFF"),表示从set_dump_switch("ON")插入的位置开始到整体训练结束均进行示例中配置的范围dump;若在脚本中插入set_dump_switch("OFF"),则dump操作在此结束。 + +### set_overflow_check_switch + +**功能说明** + +置溢出检测范围。默认不配置该函数,全量进行溢出检测。 + +仅支持NPU环境。 + +**函数原型** + +```python +set_overflow_check_switch(switch, filter_switch='ON') +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------- | ------------------------------------------------------------ | -------- | +| switch, | 检测开关。可取值"ON"或"OFF"。如果只在特定的step溢出检测,则在期望溢出检测的step位置开始前插入set_overflow_check_switch("ON"),在step结束的位置插入set_overflow_check_switch("OFF")。 | 是 | +| filter_switch | 开启dump bool和整型的tensor以及浮点、bool和整型的标量。可取值"ON"或"OFF"。参数示例:filter_switch="OFF"。默认不配置,即filter_switch="ON",表示不dump上述数据。 | 否 | + +**函数示例** + +- 示例1:指定范围溢出检测 + + ```python + register_hook(model, overflow_check) + set_overflow_check_switch("ON") + + ... + + set_overflow_check_switch("OFF") + ``` + + 该场景set_dump_path不生效,dump执行时会在当前目录自动生成ptdbg_dump_{version}目录,保存溢出数据。 + +- 示例2:前向API的ACL级别范围溢出检测 + + ```python + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + set_overflow_check_switch("ON") + + ... 
+ + set_overflow_check_switch("OFF") + ``` + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置溢出数据目录。 + +### set_backward_input + +**功能说明** + +设置反向ACL级别dump时需要的反向输入的.npy文件。 + +**函数原型** + +```python +set_backward_input(backward_input) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional_conv2d_1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional_conv2d_1、backward和input字段的.npy文件。 | 是 | + +**函数示例** + +```python +register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') +set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) +set_backward_input(["./npu_dump/dump_conv2d_v2.0/rank0/dump/Functional_conv2d_1_backward_input.0.npy"]) +``` + +### dump.json配置文件说明 + +**dump.json配置示例** + +```python +{ + "dump": + { + "dump_list":[], + "dump_path":"./dump/output", + "dump_mode":"all", + "dump_op_switch":"on" + } +} +``` + +**dump.json参数说明** + +| 字段名 | 说明 | +| -------------- | ------------------------------------------------------------ | +| dump_list | 待dump数据的API模型。为空,无需配置。 | +| dump_path | dump数据文件存储到运行环境的目录,主要用于指定ACL dump数据路径。支持配置绝对路径或相对路径。dump_path须为已存在目录。 | +| dump_mode | dump数据模式,配置如下:
- output:dump API的输出数据。默认值。
- input:dump API的输入数据。
- all:dump API的输入、输出数据。 | +| dump_op_switch | 单API模型dump数据开关,配置如下: * off:关闭单API模型dump,默认值。 * on:开启单API模型dump。 | + +**dump目录说明** + +配置register_hook的dump_config后,采集的dump数据会在{dump_path}/{time}/{deviceid}/{model_id}目录下生成,例如“/home/HwHiAiUser/output/20200808163566/0/0” + +```bash +├── 20230131172437 +│   └── 1 +│   ├── 0 +│   │   ├── Add.Add.45.0.1675157077183551 +│   │   ├── Cast.trans_Cast_0.31.0.1675157077159449 +│   │   ├── Cast.trans_Cast_5.43.0.1675157077180129 +│   │   ├── MatMul.MatMul.39.0.1675157077172961 +│   │   ├── Mul.Mul.29.0.1675157077155731 +│   │   ├── NPUAllocFloatStatus.NPUAllocFloatStatus.24.0.1675157077145262 +│   │   ├── TransData.trans_TransData_1.33.0.1675157077162791 +│   │   └── TransData.trans_TransData_4.41.0.1675157077176648 +│   ├── 1701737061 +│   │   └── Cast.trans_Cast_2.35.0.1675157077166214 +│   ├── 25 +│   │   └── NPUClearFloatStatus.NPUClearFloatStatus.26.0.1675157077150342 +│   └── 68 +│   └── TransData.trans_TransData_3.37.0.1675157077169473 +``` + +### dump数据存盘说明 + +dump结果目录结构示例如下: + +```bash +├── dump_path +│ └── ptdbg_dump_{version} +│ ├── rank0 +│ │ ├── dump +| | | ├── Tensor_permute_1_forward.npy +| | | ... +| | | └── Fcuntion_linear_5_backward_output.npy +│ │ └── dump.pkl +│ ├── rank1 +| | ├── dump +| | | └── ... +| | └── dump.pkl +│ ├── ... 
+│ | +| └── rank7 +``` + +其中ptdbg_dump_{version}为未设置set_dump_path的dump_tag参数时的默认命名;rank为设备上各卡的ID,每张卡上dump的数据会生成对应dump目录,可由register_hook函数的rank参数控制rank目录名称。 + +**精度比对dump场景** + +精度比对dump场景的结果如下: + +* dump.pkl文件:包含dump数据的API名称、dtype、 shape以及各数据的max、min、mean统计信息。 + +* dump目录:目录下为npy格式的dump数据。 + + npy文件保存的前缀和PyTorch对应关系如下 + + | 前缀 | Torch模块 | + | ---------- | ------------------- | + | Tensor | torch.Tensor | + | Torch | torch | + | Functional | torch.nn.functional | + | NPU | NPU亲和算子 | + | VF | torch._VF | + +当set_dump_switch配置mode参数(例如:mode="api_stack" )时,dump结果的文件名会添加api_stack前缀,dump结果如下: + +* api_stack_dump.pkl +* api_stack_dump目录 + +**溢出检测dump场景** + +register_hook设置了overflow_check时,检测API溢出,dump结果的文件名固定为Overflow_info_{timestamp},dump结果如下: + +* Overflow_info_{timestamp}.pkl +* Overflow_info_{timestamp}目录 + +## CPU或GPU与NPU精度数据比对 + +### 总体说明 + +- 本节主要介绍CPU或GPU与NPU精度数据比对的函数以及示例。 + +- 比对函数均通过单独创建精度比对脚本执行,可支持单卡和多卡场景的精度数据比对。 + +- 工具性能:比对数据量较小时(参考值单份文件小于10GB),参考比对速度0.1GB/s;比对数据量较大时,参考比对速度0.3GB/s。 + 推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。 + + 用户环境性能弱于标准约束或非独占使用的比对速度酌情向下浮动。比对速度的计算方式:两份比对文件大小/比对耗时。 + +### 约束 + +- NPU自研API,在CPU或GPU若没有对应的API,该API的dump数据不比对。 + +- NPU与CPU或GPU的计算结果误差可能会随着模型的执行不断累积,最终会出现同一个API因为输入的数据差异较大而无法比对的情况。 + +- CPU或GPU与NPU中两个相同的API会因为调用次数不同导致无法比对或比对到错误的API,不影响整体运行,该API忽略。 + +### compare_distributed + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,支持单卡和多卡,可同时比对多卡的dump数据。多机场景需要每个设备单独执行比对操作。可自动检索和匹配对应卡和进程所dump的数据文件,再调用compare进行比对。单机单卡时与compare函数二选一。 + +**函数原型** + +```python +compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| npu_dump_dir | 配置NPU环境下的dump目录,即set_dump_path函数的dump_tag参数对应的目录名称。参数示例:'./npu_dump/dump_conv2d_v2.0'。 | 是 | +| bench_dump_dir | 配置CPU、GPU或NPU环境下的dump目录,即set_dump_path函数的dump_tag参数对应的目录名称。参数示例:'./gpu_dump/dump_conv2d_v2.0'。 | 是 | +| output_path | 
配置比对结果csv文件存盘目录。需要预先创建output_path目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_rank{npu_ID}-rank{cpu/gpu/npu_ID}_{timestamp}.csv`。 | 是 | +| **kwargs | 支持compare的所有可选参数。 | 否 | + +**函数示例** + +创建比对脚本,例如compare_distributed.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +compare_distributed('./npu_dump/ptdbg_dump_v2.0', './gpu_dump/ptdbg_dump_v2.0', './output') +``` + +### compare + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,仅支持单机单卡。 + +**函数原型** + +```python +compare(input_param, output_path, stack_mode=False, auto_analyze=True, suffix='', fuzzy_match=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------ | ------------------------------------------------------------ | -------- | +| input_param | 配置dump数据文件及目录。配置参数包括:
- "npu_pkl_path":指定NPU dump目录下的.pkl文件。参数示例:"npu_pkl_path": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl"。必选。
- "bench_pkl_path":指定CPU、GPU或NPU dump目录下的.pkl文件。参数示例:"bench_pkl_path": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl"。必选。
- "npu_dump_data_dir":"指定NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump"。必选。
- "bench_dump_data_dir":"指定CPU、GPU或NPU dump目录下的dump数据目录。参数示例:"npu_dump_data_dir": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump"。必选。
- "is_print_compare_log":配置是否开启日志打屏。可取值True或False。可选。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.csv`。 | 是 | +| stack_mode | 配置stack_mode的开关。仅当dump数据时配置set_dump_switch的mode="api_stack"时需要开启。参数示例:stack_mode=True,默认为False。 | 否 | +| auto_analyze | 自动精度分析,开启后工具自动针对比对结果进行分析,识别到第一个精度不达标节点(在比对结果文件中的“Accuracy Reached or Not”列显示为No),并给出问题可能产生的原因(打屏展示并生成advisor_{timestamp}.txt文件)。可取值True或False,参数示例:auto_analyze=False,默认为True。 | 否 | +| suffix | 标识比对结果的文件名。配置的suffix值在比对结果文件名的compare_result和{timestamp}中间插入,例如:`compare_result_{suffix}_{timestamp}`。默认为空。 | 否 | +| fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。可取值True或False,参数示例:fuzzy_match=True,默认为False。 | 否 | + +**函数示例** + +单机单卡场景下创建比对脚本,例如compare.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +dump_result_param={ +"npu_pkl_path": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl", +"bench_pkl_path": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl", +"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump", +"bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump", +"is_print_compare_log": True +} +compare(dump_result_param, "./output", stack_mode=True) +``` + +### parse + +**功能说明** + +解析并提取dump信息中的堆栈信息及数据统计信息。 + +**函数原型** + +```python +parse(pkl_file, moudule_name_prefix) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------------- | ------------------------------------------------------------ | -------- | +| pkl_file | 指定dump数据文件中的pkl文件名。参数示例:"./npu_dump/ptdbg_dump_v2.0/rank0/dump.pkl"。 | 是 | +| moudule_name_prefix | 指定待提取的API接口前缀。参数示例:"Torch_norm_1_forward"。 | 是 | + +**函数示例** + +创建堆栈信息及数据统计信息提取脚本,例如parse.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +parse("./npu_dump/ptdbg_dump_v2.0/rank0/dump.pkl", "Torch_batch_normal_1_forward") +``` + +### 计算精度评价指标 + +PyTorch精度比对是以CPU或GPU的计算结果为标杆,计算Cosine(余弦相似度)、MaxAbsErr(最大绝对误差)和MaxRelativeErr(最大相对误差),根据这两个结果判断API在运行时是否存在精度问题。 + +计算精度评价指标: + 
+1. Cosine:通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0。 + +2. MaxAbsErr:当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001。 + +3. MaxRelativeErr:当最大相对误差越接近0表示其计算的误差越小。 + + 当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象。 + +精度比对结果csv文件中只需要通过Accuracy Reached or Not来判断计算精度是否达标,判断标准如下: + +1. Cosine < 0.99 且 MaxAbsError > 0.001时,精度不达标,标记为“No”。 +2. Cosine < 0.9,精度不达标,标记为“No”。 +3. MaxAbsError > 1,精度不达标,标记为“No”。 +4. 其余情况下记为精度达标,标记为“Yes”。 + +## FAQ + +[FAQ](https://gitee.com/ascend/tools/tree/master/ptdbg_ascend/doc/FAQ.md) diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v3.2.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v3.2.md" new file mode 100644 index 0000000000000000000000000000000000000000..8603e80ed06706888a2b5e238581b68dd2e37b54 --- /dev/null +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/ptdbg_ascend\347\262\276\345\272\246\345\267\245\345\205\267\345\212\237\350\203\275\350\257\264\346\230\216_v3.2.md" @@ -0,0 +1,1464 @@ +# **PyTorch精度工具使用指南** + +本文主要介绍PyTorch精度工具精度工具ptdbg_ascend的使用以及精度比对场景示例。 + +ptdbg_ascend工具的原理及安装请参见《[PyTorch精度工具](https://gitee.com/kun_8/att_1/blob/master/debug/accuracy_tools/ptdbg_ascend/README.md)》。 + +## PyTorch精度比对总体流程 + +1. 准备CPU或GPU训练工程。 + +2. 在环境下安装ptdbg_ascend工具。 + +3. 在训练脚本内插入ptdbg_ascend工具dump接口。 + +4. 执行训练dump数据。 + +5. 将CPU或GPU训练工程迁移为NPU训练工程。 + + 请参见《[PyTorch模型迁移和训练指南](https://www.hiascend.com/document/detail/zh/canncommercial/63RC1/modeldevpt/ptmigr/ptmigr_0001.html)》。 + +6. 在NPU环境下安装ptdbg_ascend工具。 + +7. 在NPU训练脚本内插入ptdbg_ascend工具dump接口。 + +8. NPU环境下执行训练dump数据。 + +9. 创建并配置精度比对脚本,例如compare.py。 + +10. 执行CPU或GPU dump与NPU dump数据的精度比对。 + +11. 
比对结果分析。 + +## 场景化示例 + +本章节主要介绍通过ptdbg_ascend工具进行精度比对和分析,主要使用“**CPU或GPU及NPU精度数据dump**”和“**CPU或GPU与NPU精度数据比对**”章节中介绍的ptdbg_ascend工具接口。 + +### 单卡场景精度比对 + +**精度分析建议** + +PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: + +1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 +2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 +3. 范围比对:对不符合精度标准的API重新dump。 +4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 +5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 +6. 重复1~5步,直到不存在精度问题为止。 + +**精度分析示例** + +1. dump整网数据。 + + 分别dump CPU或GPU以及NPU数据,在PyTorch训练脚本插入dump接口,示例代码如下(下面以NPU为例,CPU或GPU dump基本相同): + + ```python + from ptdbg_ascend import * + + # 在main函数开始前固定随机数 + seed_all() + + # 配置dump数据目录路径和名称 + set_dump_path("./npu_dump", dump_tag='all') + + # 注册dump回调函数 + register_hook(model, acc_cmp_dump) + + ... + + # 在第一个迭代开始的位置开启dump和堆栈模式,同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="api_stack", filter_switch="OFF") + + ... + + # 在第一个迭代结束的位置关闭dump + set_dump_switch("OFF") + ``` + +2. 比对整网数据。 + + 第1步中的NPU dump数据文件为npu_dump.pkl,假设NPU dump npy数据目录为npu_dump,GPU dump数据文件为gpu_dump.pkl,GPU dump npy数据目录为gpu_dump。 + + 创建并配置精度比对脚本,以创建compare.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + dump_result_param={ + "npu_pkl_path": "./npu_dump/all_v2.0/rank0/api_stack_dump.pkl", + "bench_pkl_path": "./gpu_dump/all_v2.0/rank0/api_stack_dump.pkl", + "npu_dump_data_dir": "./npu_dump/all_v2.0/rank0/api_stack_dump", + "bench_dump_data_dir": "./gpu_dump/all_v2.0/rank0/api_stack_dump", + "is_print_compare_log": True + } + compare(dump_result_param, "./output") + ``` + + 执行比对: + + ```bash + python3 compare.py + ``` + + 在output目录下生成结果文件,包括:`compare_result_{timestamp}.csv`和`advisor_{timestamp}.txt` + +3. 找出存在问题的API。 + + 1. 根据`advisor_{timestamp}.txt`或打屏信息的提示,可找到存在精度问题的算子(Suspect Nodes)和专家建议(Expert Advice) + + ![auto_analyze_log](img/auto_analyze_log.png) + + 2. 根据第2步结果文件`compare_result_{timestamp}.csv`中的Accuracy Reached or No字段显示为NO的API,针对该API执行后续比对操作,分析该API存在的精度问题。 + +4. 
(可选)提取指定API的堆栈信息和dump数据统计信息。 + + 通过parse接口可以清晰的显示特定API的堆栈信息和dump数据统计信息,结合堆栈信息分析代码中可能存在的精度问题。 + + 创建并配置提取脚本,以创建parse.py为例,示例代码如下: + + ```python + from ptdbg_ascend import * + + # 提取dump信息中第1次调用的API:Torch_batch_normal的堆栈信息及数据统计信息 + parse("./npu_dump/all_v2.0/rank0/api_stack_dump.pkl", "Torch_batch_normal_1_forward") + ``` + + 执行提取: + + ```bash + python3 parse.py + ``` + + + +5. (可选)指定API dump数据。 + + - dump指定前向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + + # 固定随机数,开启确定性计算 + seed_all(mode=True) + set_dump_path("./dump_path", dump_tag='forward') + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + + # dump指定前向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="acl", scope=["Tensor_permute_1_forward"], filter_switch="OFF") + + ... + + set_dump_switch("OFF") + ``` + + - dump指定反向API的ACL级别数据 + + ```python + from ptdbg_ascend import * + + # 固定随机数,开启确定性计算 + seed_all(mode=True) + set_dump_path("./dump_path", dump_tag='backward') + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + + # dump指定反向API的ACL级别数据、bool和整型的tensor以及浮点、bool和整型的标量 + set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"], filter_switch="OFF") + set_backward_input(["./npu_dump/all_v2.0/rank0/api_stack_dump/Functional_conv2d_1_backward_input.0.npy"]) + + ... + + set_dump_switch("OFF") + ``` + +6. (可选)重新比对。 + + 根据第4或5步的dump数据重新配置compare.py并执行比对,可以对单API模型进行问题复现。 + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 + +### 多卡场景精度比对 + +精度工具支持多卡场景的精度比对,多卡场景的dump步骤与单卡场景完全一致,请参见“**单卡场景精度比对**”章节,不同的是多卡数据精度比对时需要使用“compare_distributed”函数进行比对。如下示例: + +说明:多机多卡场景需要每个设备单独执行比对操作。 + +假设NPU dump npy数据目录为npu_dump/dump_conv2d_v1.0,GPU dump npy数据目录为gpu_dump/dump_conv2d_v1.0。 + +1. 
创建比对脚本,例如compare_distributed.py,拷贝如下代码。 + + ```python + from ptdbg_ascend import * + compare_distributed('./npu_dump/ptdbg_dump_v2.0', './gpu_dump/ptdbg_dump_v2.0', './output') + ``` + +2. 执行比对: + + ```bash + python3 compare_distributed.py + ``` + +两次运行须用相同数量的卡,传入`compare_distributed`的两个文件夹下须有相同个数的rank文件夹,且不包含其他无关文件,否则将无法比对。 + +**多卡set_dump_path注意事项** + +多卡一般为多进程,须保证每个进程都正确调用set_dump_path,或把set_dump_path插入到import语句后,如: + +```python +from ptdbg_ascend import * +seed_all() +set_dump_path('./dump_resnet') +``` + +如此可保证set_dump_path在每个进程都被调用。 + +**多卡register_hook注意事项** + +register_hook需要在set_dump_path之后调用,也需要在每个进程上被调用,建议在搬运模型数据到卡之后调用。识别方法如下: + +- 找到训练代码中遍历epoch的for循环或遍历数据集的for循环,把register_hook放到循环开始前即可。 +- 找到训练代码中调用DDP或者DistributedDataParallel的代码行,把register_hook放到该代码行所在的代码块之后。 +- 若代码中均无以上两种情况,需要保证register_hook在模型定义之后插入,并配置rank参数。rank参数获取rank_id请参见“**[rank_id获取方法](https://gitee.com/kun_8/att_1/blob/master/debug/accuracy_tools/ptdbg_ascend/doc/rank_id获取方法.md)**”。 + +### NPU vs NPU精度比对 + +对于NPU vs NPU场景,是针对同一模型,进行迭代(模型、API版本升级或设备硬件升级)时存在的精度下降问题,对比相同模型在迭代前后版本的API计算数值,进行问题定位。 + +一般情况下迭代涉及NPU自定义算子,因此,可以仅dump NPU自定义算子进行比对。比对精度问题分析请参见“**单卡场景精度比对**”章节。 + +工具当前支持dump NPU自定义算子如下: + +| 序号 | NPU自定义算子 | +| :--- | ----------------------------------- | +| 1 | torch_npu.one_ | +| 2 | torch_npu.npu_sort_v2 | +| 3 | torch_npu.npu_transpose | +| 4 | torch_npu.npu_broadcast | +| 5 | torch_npu.npu_dtype_cast | +| 6 | torch_npu.empty_with_format | +| 7 | torch_npu.npu_one_hot | +| 8 | torch_npu.npu_stride_add | +| 9 | torch_npu.npu_ps_roi_pooling | +| 10 | torch_npu.npu_roi_align | +| 11 | torch_npu.npu_nms_v4 | +| 12 | torch_npu.npu_iou | +| 13 | torch_npu.npu_nms_with_mask | +| 14 | torch_npu.npu_pad | +| 15 | torch_npu.npu_bounding_box_encode | +| 16 | torch_npu.npu_bounding_box_decode | +| 17 | torch_npu.npu_batch_nms | +| 18 | torch_npu.npu_slice | +| 19 | torch_npu._npu_dropout | +| 20 | torch_npu.npu_indexing | +| 21 | torch_npu.npu_ifmr | +| 22 | torch_npu.npu_max | +| 23 | 
torch_npu.npu_scatter | +| 24 | torch_npu.npu_layer_norm_eval | +| 25 | torch_npu.npu_alloc_float_status | +| 26 | torch_npu.npu_get_float_status | +| 27 | torch_npu.npu_clear_float_status | +| 28 | torch_npu.npu_confusion_transpose | +| 29 | torch_npu.npu_bmmV2 | +| 30 | torch_npu.fast_gelu | +| 31 | torch_npu.npu_sub_sample | +| 32 | torch_npu.npu_deformable_conv2d | +| 33 | torch_npu.npu_mish | +| 34 | torch_npu.npu_anchor_response_flags | +| 35 | torch_npu.npu_yolo_boxes_encode | +| 36 | torch_npu.npu_grid_assign_positive | +| 37 | torch_npu.npu_normalize_batch | +| 38 | torch_npu.npu_masked_fill_range | +| 39 | torch_npu.npu_linear | +| 40 | torch_npu.npu_bert_apply_adam | +| 41 | torch_npu.npu_giou | +| 42 | torch_npu.npu_ciou | +| 43 | torch_npu.npu_ciou_backward | +| 44 | torch_npu.npu_diou | +| 45 | torch_npu.npu_diou_backward | +| 46 | torch_npu.npu_sign_bits_pack | +| 47 | torch_npu.npu_sign_bits_unpack | + +### 溢出检测场景 + +溢出检测是针对NPU的PyTorch API,检测是否存在溢出的情况。当前仅支持识别aicore浮点溢出。 + +溢出检测原理:针对溢出阶段,开启acl dump模式,重新对溢出阶段执行,落盘数据。 + +建议按照如下步骤操作: + +1. 在NPU环境下安装ptdbg_ascend工具。 + +2. 在NPU训练脚本内插入ptdbg_ascend工具溢出检测接口。 + + - 示例1:全量溢出检测 + + ```python + from ptdbg_ascend import * + seed_all() + ... + # 设置检测到3次溢出后退出训练 + register_hook(model, overflow_check, overflow_nums=3) + + ... + ``` + + 多卡使用时各卡单独计算溢出次数。 + + - 示例2:dump指定API的ACL级别溢出数据 + + ```python + from ptdbg_ascend import * + seed_all() + ... + # dump指定API的ACL级别溢出数据 + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + + # 在期望溢出检测的step位置开始前打开溢出检测开关 + set_overflow_check_switch("ON") + + ... + + # 在step结束的位置关闭溢出检测开关 + set_overflow_check_switch("OFF") + + ... + ``` + + - 示例3:dump指定反向API的ACL级别的溢出数据 + + 1. 进行全量溢出检测 + + ```python + from ptdbg_ascend import * + seed_all() + ... + # 设置检测到3次溢出后退出训练 + register_hook(model, overflow_check) + + ... + ``` + + 2. dump指定反向API的ACL级别的溢出数据 + + ```python + from ptdbg_ascend import * + seed_all() + ... 
+ # dump指定反向API的ACL级别溢出数据 + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) + set_backward_input(["./npu_dump/ptdbg_dump_v2.0/rank0/dump/Functional_conv2d_1_backward_input.0.npy"]) + ``` + + 针对前向溢出API,可以通过overflow_nums,配置允许的溢出次数,并将每次溢出API的全部ACL数据dump下来,到达指定溢出次数后停止,停止后会看到堆栈打印包含如下字段。 + + ```bash + ValueError: [overflow xxx times]: dump file is saved in 'xxxxx.pkl'. + ``` + + 其中xxx times为用户设置的次数,xxxxx.pkl为文件生成路径。 + +3. NPU环境下执行训练dump溢出数据。 + + 针对输入正常但输出存在溢出的API,会训练执行目录下将溢出的API信息dump并保存为`forward_info_{pid}.json`和`backward_info_{pid}.json`,通过 [Ascend模型精度预检工具](https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/api_accuracy_checker)对json文件进行解析,输出溢出API为正常溢出还是非正常溢出,从而帮助用户快速判断。 + + 精度预检工具执行命令如下: + + ``` + cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut + python run_overflow_check.py -forward ./forward_info_0.json -backward ./backward_info_0.json + ``` + +**注意事项** + +* dump_mode="acl"场景下,会增加npu的内存消耗,请谨慎开启。 +* 部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致功能异常。 + +## debugger方式dump和溢出检测(推荐) + +### PrecisionDebugger模块 + +**功能说明** + +PrecisionDebugger模块包含dump和溢出检测功能的总体配置项。可以指定dump目录,设置dump或溢出检测功能,指定dump的卡和迭代。 + +可以在from ptdbg_ascend import *和模型初始化之间的任意位置添加该模块。 + +**原型** + +```python +PrecisionDebugger(dump_path=None, hook_name=None, rank=None): +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| --------- | ------------------------------------------------------------ | -------- | +| dump_path | 设置dump数据目录路径,参数示例:"./dump_path"。dump_path的父目录须为已存在目录。
默认在指定的dump_path路径下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当**configure_hook**函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。 | 是 | +| hook_name | dump模式,可取值dump和overflow_check,表示dump和溢出检测功能,二选一。 | 是 | +| rank | 指定对某张卡上的数据进行dump或溢出检测,默认未配置(表示dump所有卡的数据),须根据实际卡的Rank ID配置。 | 否 | + +### configure_hook函数(可选) + +**功能说明** + +设置dump范围。 + +建议在**PrecisionDebugger**模块与模型初始化之间的任意位置添加,不添加此函数时默认使用mode="api_stack" dump整网数据。 + +**原型** + +dump: + +```python +debugger.configure_hook(mode="api_stack", scope=[], api_list=[], filter_switch="ON", acl_config=None, backward_input=[], input_output_mode=["all"]) +``` + +溢出检测: + +```python +debugger.configure_hook(mode=None, acl_config=None, overflow_nums=1) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为api_stack。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。 | 否 | +| scope或api_list | dump范围。根据model配置的模式选择dump的API范围,mode="api_list"时,需要配置api_list=[],其他模式有需要时配置scope=[]。参数示例:scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward"]、api_list=["relu"]。默认为空。 | 否 | +| filter_switch | 开启dump bool和整型的tensor以及浮点、bool和整型的标量。可取值"ON"或"OFF"。参数示例:filter_switch="OFF"。默认不配置,即filter_switch="ON",表示不dump上述数据。 | 否 | +| acl_config | acl dump的配置文件。mode="acl"时,该参数必选;mode为其他值时,该参数不选。参数示例:acl_config='./dump.json'。dump.json配置文件详细介绍请参见“**dump.json配置文件说明**”。 | 否 | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional_conv2d_1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional_conv2d_1、backward和input字段的.npy文件。 | 否 | +| input_output_mode | dump数据过滤。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的.npy文件。参数示例input_output_mode=["backward"]或input_output_mode=["forward", "backward"]。默认为all,即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。 | 否 | +| overflow_nums | 
控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止。 | 否 | + +**函数示例** + +configure_hook可配置多种dump模式,示例如下: + +说明:以下均以dump部分API数据为例,API名可以从首次dump整网数据的结果csv文件中的NPU Name或Bench Name列获取。 + +- 示例1:dump指定API列表 + + ```python + debugger.configure_hook(mode="list", scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward", "Torch_relu_3_backward"]) + ``` + +- 示例2:dump指定范围 + + ```python + debugger.configure_hook(mode="range", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]) + ``` + +- 示例3:STACK模式,只dump堆栈信息 + + ```python + debugger.configure_hook(mode="stack", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]) + ``` + +- 示例4:dump指定前向API的ACL级别数据 + + ```python + debugger.configure_hook(mode="acl", scope=["Tensor_permute_1_forward"], acl_config="./dump.json") + ``` + +- 示例4:dump指定反向API的ACL级别数据 + + ```python + debugger.configure_hook(mode="acl", scope=["Functional_conv2d_1_backward"], acl_config="./dump.json", backward_input=["./npu_dump/dump_conv2d_v2.0/rank0/dump/Functional_conv2d_1_backward_input.0.npy"]) + ``` + +- 示例5:dump指定某一类API的API级别输入输出数据 + + ```python + debugger.configure_hook(mode="api_list", api_list=["relu"]) + ``` + + mode="api_list"时不配置scope。 + +- 示例6:dump全部API级别输入输出数据以及相应堆栈信息 + + ```python + debugger.configure_hook(mode="api_stack") + ``` + + mode="api_stack"时不配置scope。 + +- 示例7: dump全部API级别输入输出数据并包含bool和整型的tensor以及浮点、bool和整型的标量,默认不配置为ON,会过滤bool和整型数据 + + ```python + debugger.configure_hook(filter_switch="OFF") + ``` + + 配置filter_switch="OFF"同时也可以配置mode、scope和api_list,除dump ACL级别数据。 + +- 示例8:仅保存dump的数据文件名包含“backward”的反向.npy文件 + + ```python + debugger.configure_hook(input_output_mode=["backward"]) + ``` + +- 示例9:溢出检测dump + + ```python + debugger.configure_hook(overflow_nums=1) + ``` + + dump执行时会在**PrecisionDebugger**模块的dump_path参数指定的目录下生成ptdbg_dump_{version}目录,保存溢出数据。 + + 多卡场景时,需要检测到至少有一张卡溢出次数达到overflow_nums时,训练结束。 + + 仅支持NPU环境。 + +- 示例10:dump指定API的ACL级别溢出数据 + + ```python + 
debugger.configure_hook(mode="acl", acl_config="./dump.json") + ``` + + 该场景**PrecisionDebugger**模块的dump_path参数不生效,由acl_config中的dump.json文件配置溢出数据目录。 + + 仅支持NPU环境。 + +### start函数 + +**功能说明** + +dump或溢出检测启动函数。 + +在模型初始化之后的任意位置添加。 + +**原型** + +```python +debugger.start() +``` + +该函数为类函数,可以使用debugger.start()也可以使用PrecisionDebugger.start()。 + +### stop函数 + +**功能说明** + +dump或溢出检测停止函数。 + +在**start**函数之后的任意位置添加。 + +**原型** + +```python +debugger.stop() +``` + +该函数为类函数,可以使用debugger.stopt()也可以使用PrecisionDebugger.stop()。 + +### 示例代码 + +- 示例1:开启dump + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump") + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + ... + + debugger.stop() + ``` + +- 示例2:开启溢出检测dump + + ```python + from ptdbg_ascend import * + debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="overflow_check") + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + ... 
+ + debugger.stop() + ``` + +## CPU或GPU及NPU精度数据dump + +### 总体说明 + +- 本节主要介绍CPU或GPU及NPU精度数据dump所需要的函数以及示例。 + +- ptdbg_ascend工具默认情况下仅dump PyTorch模型的API输入输出数据进行精度比对,若在比对结果中发现某个API下可能存在ACL的精度问题,那么可以选择dump该API的ACL级别数据进行精度分析。 + +- 某些torch api的输出不是Tensor类型的数据。对于此类API的反向过程进行ACL dump,工具会在运行日志中给出对应的Warning(is not of tensor type and cannot be automatically derived)提示。如若想要进行该类API反向ACL dump,可以通过手动构建单API用例的方式进行ACL dump,具体用例可参见“**[反向ACL dump用例说明](https://gitee.com/kun_8/att_1/blob/master/debug/accuracy_tools/ptdbg_ascend/doc/%E5%8F%8D%E5%90%91ACL%20dump%E7%94%A8%E4%BE%8B%E8%AF%B4%E6%98%8E.md)**”。 + +- 工具性能:dump数据量较小时(小于5G),参考dump速度0.1GB/s;dump数据量较大时,参考dump速度0.2GB/s。 + 推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。 + + 用户环境性能弱于标准约束或非独占使用的比对速度酌情向下浮动。Dump速度的计算方式:Dump数据量/(单个step添加Dump耗时-原始单个step耗时)。 + +### 约束 +- 进行CPU或GPU数据dump时,请安装torch包而非torch_npu包,避免工具无法识别使用场景,导致失败。 + +- TASK_QUEUE_ENABLE环境变量会导致API下发和执行异步进行,因此在ACL dump前需要将TASK_QUEUE_ENABLE关闭,即export TASK_QUEUE_ENABLE=0。 + +- 不建议在PyTorch训练脚本中同时添加dump接口和性能数据采集(如Ascend PyThon Profiler)接口,二者可能相互影响导致数据不准确。 + +### seed_all + +**功能说明** + +固定随机数。通过固定随机数保证模型的输入或输出一致。在训练主函数开始前调用,避免随机数固定不全。 + +dump操作必选。 + +**函数原型** + +```python +seed_all(seed=1234, mode=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------ | ------------------------------------------------------------ | -------- | +| seed | 随机数种子。参数示例:seed=1000。默认值为:1234。 | 否 | +| mode | 确定性计算模式。可配置True或False。参数示例:mode=True。默认为False。
即使在相同的硬件和输入下,API多次执行的结果也可能不同,开启确定性计算是为了保证在相同的硬件和输入下,API多次执行的结果相同。
确定性计算会导致API执行性能降低,建议在发现模型多次执行结果不同的情况下开启。
rnn类算子、ReduceSum、ReduceMean等算子可能与确定性计算存在冲突,若开启确定性计算后多次执行的结果不相同,则考虑存在这些算子。 | 否 | + +**函数示例** + +seed_all函数的随机数种子,取默认值即可,无须配置;第二个参数默认关闭,不开启确定性计算时也无须配置。 + +- 示例1:仅固定随机数,不开启确定性计算 + + ```python + seed_all() + ``` + +- 示例2:固定随机数,开启确定性计算 + + ```python + seed_all(mode=True) + ``` + +**固定随机数范围** + +seed_all函数可固定随机数的范围如下表。 + +| API | 固定随机数 | +| ---------------------------------------- | --------------------------- | +| os.environ['PYTHONHASHSEED'] = str(seed) | 禁止Python中的hash随机化 | +| random.seed(seed) | 设置random随机生成器的种子 | +| np.random.seed(seed) | 设置numpy中随机生成器的种子 | +| torch.manual_seed(seed) | 设置当前CPU的随机种子 | +| torch.cuda.manual_seed(seed) | 设置当前GPU的随机种子 | +| torch.cuda.manual_seed_all(seed) | 设置所有GPU的随机种子 | +| torch_npu.npu.manual_seed(seed) | 设置当前NPU的随机种子 | +| torch_npu.npu.manual_seed_all(seed) | 设置所有NPU的随机种子 | +| torch.backends.cudnn.enable=False | 关闭cuDNN | +| torch.backends.cudnn.benchmark=False | cuDNN确定性地选择算法 | +| torch.backends.cudnn.deterministic=True | cuDNN仅使用确定性的卷积算法 | + +需要保证CPU或GPU以及NPU的模型输入完全一致,dump数据的比对才有意义,seed_all并不能保证模型输入完全一致,如下表所示场景需要保证输入的一致性。 + +| 场景 | 固定方法 | +| --------------- | ------------- | +| 数据集的shuffle | 关闭shuffle。 | +| dropout | 关闭dropout。 | + +关闭shuffle示例: + +```python +train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size = batch_size, + shuffle = False, + num_workers = num_workers +) +``` + +关闭dropout: + +在使用from ptdbg import *后,工具会自动将torch.nn.functional.dropout、torch.nn.functional.dropout2d、torch.nn.functional.dropout3d、torch.nn.Dropout、torch.nn.Dropout2d、torch.nn.Dropout3d的接口参数p置为0。 + +### set_dump_path + +**功能说明** + +设置dump数据目录。建议在seed_all函数之后调用且需要保证训练进程能够调用该函数;多卡时须保证每个进程都能调用该函数。 + +dump操作必选。 + +**函数原型** + +```python +set_dump_path(fpath=None, dump_tag='ptdbg_dump') +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------- | ------------------------------------------------------------ | -------- | +| fpath | 设置dump数据目录路径。参数示例:'./dump_path'。dump_path须为已存在目录。
默认在指定的dump_path路径下生成`ptdbg_dump_{version}`目录,并在该目录下生成`dump.pkl`文件以及`dump`数据文件保存目录。
当set_dump_switch函数配置了mode参数时,`dump.pkl`文件以及`dump`数据文件保存目录名称添加mode参数值为前缀,详情请参见“**dump数据存盘说明**”。 | 是 | +| dump_tag | 设置dump数据目录名称。参数示例:dump_tag='dump_conv2d'。默认dump数据目录命名为ptdbg_dump_{version}。
{version}为当前安装ptdbg_ascend工具版本。目录结构参见“**dump数据存盘说明**”。
配置该参数会将生成的`ptdbg_dump_{version}`目录名称变更为dump_tag配置的值,如`dump_conv2d_{version}`。 | 否 | + +**函数示例** + +- 示例1:设置dump数据目录路径 + + ```python + set_dump_path('./dump_path') + ``` + +- 示例2:设置dump数据目录名称 + + ```python + set_dump_path('./dump_path', dump_tag='dump_conv2d') + ``` + + +若以相同的dump数据目录多次dump,则会因同名导致覆盖;多次dump建议配置不同的dump_tag。 + +### register_hook + +**功能说明** + +注册工具钩子函数。在set_dump_path之后调用。 + +dump操作必选。 + +**函数原型** + +```python +register_hook(model, hook, overflow_nums=overflow_nums, dump_mode=dump_mode, dump_config=dump_config_file, rank=0) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------- | ------------------------------------------------------------ | -------- | +| model | model对象。 | 是 | +| hook | 注册工具的dump和溢出检测钩子。可取值overflow_check和acc_cmp_dump,二选一。 | 是 | +| overflow_nums | 控制溢出次数,表示第N次溢出时,停止训练,过程中检测到溢出API对应ACL数据均dump。参数示例:overflow_nums=3。配置overflow_check时可配置,默认不配置,即检测到1次溢出,训练停止。 | 否 | +| dump_mode | 控制针对溢出API的dump模式。可取值"api"或"acl",配置acl时表示dump ACL级别的溢出数据,此时set_dump_path参数不生效,dump数据目录由dump_config的.json文件配置,参数示例:dump_mode="acl"。默认不配置,即dump API级别的溢出数据。 | 否 | +| dump_config | acl dump的配置文件。dump_mode="acl"时,该参数必选;dump_mode="api"时,该参数不选。参数示例:dump_config='./dump.json'。 | 否 | +| rank | 控制dump数据保存的rank目录名称。参数示例:rank=1。默认不配置,即自动读取dump数据所属的卡并保存在该卡对应的rank目录下。目录结构参见“**dump数据存盘说明**”。
多卡情况下,可能出现工具识别rank出错,导致dump数据保存到错误的rank目录下,此时需要根据“**[rank_id获取方法](https://gitee.com/kun_8/att_1/blob/master/debug/accuracy_tools/ptdbg_ascend/doc/rank_id获取方法.md)**”配置该参数,以获取正确的rank_id;工具可正确识别rank_id时无须配置该参数。 | 否 | + +**函数示例** + +- 示例1:注册工具钩子函数 + + ```python + register_hook(model, acc_cmp_dump) + ``` + +- 示例2:dump指定API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + ``` + + 需要配置set_dump_switch的mode="acl"以及scope指定为前向或反向API,请参见“**set_dump_switch”**的示例。 + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置dump数据目录。 + +- 示例3:溢出检测dump + + ```python + register_hook(model, overflow_check, overflow_nums=3) + ``` + + dump执行时会在set_dump_path的fpath参数指定的目录下生成ptdbg_dump_{version}目录,保存溢出数据。 + + 多卡场景时,需要检测到至少有一张卡溢出次数达到overflow_nums时,训练结束。 + + 仅支持NPU环境。 + +- 示例4:dump指定API的ACL级别溢出数据 + + ```python + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + ``` + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置溢出数据目录。 + + 仅支持NPU环境。 + +### set_dump_switch + +**功能说明** + +设置dump范围。建议在register_hook函数之后的脚本内任意位置插入,但进行精度问题排查建议参照“场景化示例 > 单卡场景精度比对”章节的顺序,先从第一个迭代开始的位置调用并dump整网数据。 + +dump操作必选。 + +**函数原型** + +```python +def set_dump_switch(switch, mode="all", scope=[], api_list=[], filter_switch="ON", dump_mode=["all"]): +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| --------------- | ------------------------------------------------------------ | -------- | +| switch | dump开关。可取值"ON"或"OFF"。须在选定dump开始的位置配置set_dump_switch("ON");dump结束的位置设置set_dump_switch("OFF")。 | 是 | +| mode | dump模式。可取值"all"、"list"、"range"、"stack"、"acl"、"api_list"、"api_stack",各参数含义请参见本节的“**函数示例**”。参数示例:mode="list"。默认为all。该参数配置值将作为dump数据文件名的前缀,详情请参见“**dump数据存盘说明**”。 | 否 | +| scope或api_list | dump范围。根据model配置的模式选择dump的API范围。参数示例:scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward"]、api_list=["relu"]。默认为空。 | 否 | +| filter_switch | 开启dump bool和整型的tensor以及浮点、bool和整型的标量。可取值"ON"或"OFF"。参数示例:filter_switch="OFF"。默认不配置,即filter_switch="ON",表示不dump上述数据。 | 否 
| +| dump_mode | dump数据过滤。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的.npy文件。参数示例dump_mode=["backward"]或dump_mode=["forward", "backward"]。默认为all,即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。 | 否 | + +**推荐配置** + +```python +set_dump_switch("ON", mode="api_stack", filter_switch="OFF") +``` + +开启dump数据和堆栈模式,同时为保证数据完整性开启dump bool和整型的tensor以及浮点、bool和整型的标量。 + +**函数示例** + +set_dump_switch可配置多种dump模式,示例如下: + +说明:以下均以dump部分API数据为例,API名可以从首次dump整网数据的结果csv文件中的NPU Name或Bench Name列获取。 + +- 示例1:dump指定API列表 + + ```python + set_dump_switch("ON", mode="list", scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward", "Torch_relu_3_backward"]) + ``` + +- 示例2:dump指定范围 + + ```python + set_dump_switch("ON", mode="range", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]) + ``` + +- 示例3:STACK模式,只dump堆栈信息 + + ```python + set_dump_switch("ON", mode="stack", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]) + ``` + +- 示例4:dump指定前向API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Tensor_permute_1_forward"]) + ``` + + 需要配置register_hook的dump_mode='acl'和dump_config配置文件。 + +- 示例4:dump指定反向API的ACL级别数据 + + ```python + register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') + set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) + set_backward_input(["./npu_dump/dump_conv2d_v2.0/rank0/dump/Functional_conv2d_1_backward_input.0.npy"]) + ``` + + 需要配置register_hook的dump_mode='acl'和dump_config配置文件,并通过set_backward_input设置反向API输入的.npy文件。 + +- 示例5:dump指定某一类API的API级别输入输出数据 + + ```python + set_dump_switch("ON", mode="api_list", api_list=["relu"]) + ``` + + mode="api_list"时不配置scope。 + +- 示例6:dump全部API级别输入输出数据以及相应堆栈信息 + + ```python + set_dump_switch("ON", mode="api_stack") + ``` + + mode="api_stack"时不配置scope。 + +- 示例7: 
dump全部API级别输入输出数据并包含bool和整型的tensor以及浮点、bool和整型的标量,默认不配置为ON,会过滤bool和整型数据 + + ```python + set_dump_switch("ON", filter_switch="OFF") + ``` + + 配置filter_switch="OFF"同时也可以配置mode、scope和api_list,除dump ACL级别数据。 + +- 示例8:仅保存dump的数据文件名包含“backward”的反向.npy文件 + + ```python + set_dump_switch("ON", dump_mode=["backward"]) + ``` + +以上示例均需要在结束dump的位置插入set_dump_switch("OFF")。 + +set_dump_switch配置mode为all或api_stack时,结束dump后,在dump目录下会自动生成compare_data.py比对脚本模板,示例如下: + +```python +from ptdbg_ascend import compare + +pkl_path = "%s" +dump_data_dir = "%s" + +dump_path_param = { + "npu_pkl_path": , + "bench_pkl_path": , + "npu_dump_data_dir": , + "bench_dump_data_dir": , + "is_print_compare_log": True +} + +compare(dump_path_param, output_path="", stack_mode="%s") +``` + +pkl_path和dump_data_dir字段会自动识别pkl和dump目录的路径,用户需要判断当前dump的环境是NPU、CPU或GPU,并将pkl_path和dump_data_dir字段填入下方dump_path_param函数对应的字段中,例如当前设备为NPU,那么填写方式如下: + +```python +from ptdbg_ascend import compare + +pkl_path = "%s" +dump_data_dir = "%s" + +dump_path_param = { + "npu_pkl_path": pkl_path, + "bench_pkl_path": , + "npu_dump_data_dir": dump_data_dir, + "bench_dump_data_dir": , + "is_print_compare_log": True +} + +compare(dump_path_param, output_path="", stack_mode="%s") +``` + +此时,另一侧数据的路径,需要用户另外识别并填入。 + +### set_overflow_check_switch + +**功能说明** + +置溢出检测范围。默认不配置该函数,全量进行溢出检测。 + +仅支持NPU环境。 + +**函数原型** + +```python +set_overflow_check_switch(switch, filter_switch='ON') +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------- | ------------------------------------------------------------ | -------- | +| switch, | 检测开关。可取值"ON"或"OFF"。如果只在特定的step溢出检测,则在期望溢出检测的step位置开始前插入set_overflow_check_switch("ON"),在step结束的位置插入set_overflow_check_switch("OFF")。 | 是 | +| filter_switch | 开启dump bool和整型的tensor以及浮点、bool和整型的标量。可取值"ON"或"OFF"。参数示例:filter_switch="OFF"。默认不配置,即filter_switch="ON",表示不dump上述数据。 | 否 | + +**函数示例** + +- 示例1:指定范围溢出检测 + + ```python + register_hook(model, overflow_check) + set_overflow_check_switch("ON") + + ... 
+ + set_overflow_check_switch("OFF") + ``` + + 该场景set_dump_path不生效,dump执行时会在当前目录自动生成ptdbg_dump_{version}目录,保存溢出数据。 + +- 示例2:前向API的ACL级别范围溢出检测 + + ```python + register_hook(model, overflow_check, dump_mode='acl', dump_config='./dump.json') + set_overflow_check_switch("ON") + + ... + + set_overflow_check_switch("OFF") + ``` + + 该场景set_dump_path不生效,由dump_config中的dump.json文件配置溢出数据目录。 + +### set_backward_input + +**功能说明** + +设置反向ACL级别dump时需要的反向输入的.npy文件。 + +**函数原型** + +```python +set_backward_input(backward_input) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的.npy文件。例如若需要dump Functional_conv2d_1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional_conv2d_1、backward和input字段的.npy文件。 | 是 | + +**函数示例** + +```python +register_hook(model, acc_cmp_dump, dump_mode='acl', dump_config='./dump.json') +set_dump_switch("ON", mode="acl", scope=["Functional_conv2d_1_backward"]) +set_backward_input(["./npu_dump/dump_conv2d_v2.0/rank0/dump/Functional_conv2d_1_backward_input.0.npy"]) +``` + +### dump.json配置文件说明 + +**dump.json配置示例** + +```python +{ + "dump": + { + "dump_list":[], + "dump_path":"./dump/output", + "dump_mode":"all", + "dump_op_switch":"on" + } +} +``` + +**dump.json参数说明** + +| 字段名 | 说明 | +| -------------- | ------------------------------------------------------------ | +| dump_list | 待dump数据的API模型。为空,无需配置。 | +| dump_path | dump数据文件存储到运行环境的目录,主要用于指定ACL dump数据路径。支持配置绝对路径或相对路径。dump_path须为已存在目录。 | +| dump_mode | dump数据模式,配置如下:
- output:dump API的输出数据。默认值。
- input:dump API的输入数据。
- all:dump API的输入、输出数据。 | +| dump_op_switch | 单API模型dump数据开关,配置如下: * off:关闭单API模型dump,默认值。 * on:开启单API模型dump。 | + +**dump目录说明** + +配置register_hook的dump_config后,采集的dump数据会在{dump_path}/{time}/{deviceid}/{model_id}目录下生成,例如“/home/HwHiAiUser/output/20200808163566/0/0” + +```bash +├── 20230131172437 +│   └── 1 +│   ├── 0 +│   │   ├── Add.Add.45.0.1675157077183551 +│   │   ├── Cast.trans_Cast_0.31.0.1675157077159449 +│   │   ├── Cast.trans_Cast_5.43.0.1675157077180129 +│   │   ├── MatMul.MatMul.39.0.1675157077172961 +│   │   ├── Mul.Mul.29.0.1675157077155731 +│   │   ├── NPUAllocFloatStatus.NPUAllocFloatStatus.24.0.1675157077145262 +│   │   ├── TransData.trans_TransData_1.33.0.1675157077162791 +│   │   └── TransData.trans_TransData_4.41.0.1675157077176648 +│   ├── 1701737061 +│   │   └── Cast.trans_Cast_2.35.0.1675157077166214 +│   ├── 25 +│   │   └── NPUClearFloatStatus.NPUClearFloatStatus.26.0.1675157077150342 +│   └── 68 +│   └── TransData.trans_TransData_3.37.0.1675157077169473 +``` + +### dump数据存盘说明 + +dump结果目录结构示例如下: + +```bash +├── dump_path +│ └── ptdbg_dump_{version} +│ ├── rank0 +│ │ ├── dump +| | | ├── Tensor_permute_1_forward.npy +| | | ... +| | | └── Fcuntion_linear_5_backward_output.npy +│ │ └── dump.pkl +│ ├── rank1 +| | ├── dump +| | | └── ... +| | └── dump.pkl +│ ├── ... 
+│ | +| └── rank7 +``` + +其中ptdbg_dump_{version}为未设置set_dump_path的dump_tag参数时的默认命名;rank为设备上各卡的ID,每张卡上dump的数据会生成对应dump目录,可由register_hook函数的rank参数控制rank目录名称。 + +**精度比对dump场景** + +精度比对dump场景的结果如下: + +* dump.pkl文件:包含dump数据的API名称、dtype、 shape以及各数据的max、min、mean统计信息。 + +* dump目录:目录下为npy格式的dump数据。 + + npy文件保存的前缀和PyTorch对应关系如下 + + | 前缀 | Torch模块 | + | ---------- | ------------------- | + | Tensor | torch.Tensor | + | Torch | torch | + | Functional | torch.nn.functional | + | NPU | NPU亲和算子 | + | VF | torch._VF | + +当set_dump_switch或configure_hook配置mode参数(例如:mode="api_stack" )时,dump结果的文件名会添加api_stack前缀,dump结果如下: + +* api_stack_dump.pkl +* api_stack_dump目录 + +**溢出检测dump场景** + +register_hook设置了overflow_check时,检测API溢出,dump结果的文件名固定为Overflow_info_{timestamp},dump结果如下: + +* Overflow_info_{timestamp}.pkl +* Overflow_info_{timestamp}目录 + +## CPU或GPU与NPU精度数据比对 + +### 总体说明 + +- 本节主要介绍CPU或GPU与NPU精度数据比对的函数以及示例。 + +- 比对函数均通过单独创建精度比对脚本执行,可支持单卡和多卡场景的精度数据比对。 + +- 工具性能:比对数据量较小时(参考值单份文件小于10GB),参考比对速度0.1GB/s;比对数据量较大时,参考比对速度0.3GB/s。 + 推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。 + + 用户环境性能弱于标准约束或非独占使用的比对速度酌情向下浮动。比对速度的计算方式:两份比对文件大小/比对耗时。 + +### 约束 + +- NPU自研API,在CPU或GPU若没有对应的API,该API的dump数据不比对。 + +- NPU与CPU或GPU的计算结果误差可能会随着模型的执行不断累积,最终会出现同一个API因为输入的数据差异较大而无法比对的情况。 + +- CPU或GPU与NPU中两个相同的API会因为调用次数不同导致无法比对或比对到错误的API,不影响整体运行,该API忽略。 + +### compare_distributed + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,支持单卡和多卡,可同时比对多卡的dump数据。多机场景需要每个设备单独执行比对操作。可自动检索和匹配对应卡和进程所dump的数据文件,再调用compare进行比对。单机单卡时与compare函数二选一。 + +**函数原型** + +```python +compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| npu_dump_dir | 配置NPU环境下的dump目录,即set_dump_path函数的dump_tag参数对应的目录名称。参数示例:'./npu_dump/dump_conv2d_v2.0'。 | 是 | +| bench_dump_dir | 配置CPU、GPU或NPU环境下的dump目录,即set_dump_path函数的dump_tag参数对应的目录名称。参数示例:'./gpu_dump/dump_conv2d_v2.0'。 | 是 | +| output_path | 
配置比对结果csv文件存盘目录。需要预先创建output_path目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_rank{npu_ID}-rank{cpu/gpu/npu_ID}_{timestamp}.csv`。 | 是 | +| **kwargs | 支持compare的所有可选参数。 | 否 | + +**函数示例** + +创建比对脚本,例如compare_distributed.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +compare_distributed('./npu_dump/ptdbg_dump_v2.0', './gpu_dump/ptdbg_dump_v2.0', './output') +``` + +### compare + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,仅支持单机单卡。 + +**函数原型** + +```python +compare(input_param, output_path, stack_mode=False, auto_analyze=True, suffix='', fuzzy_match=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------ | ------------------------------------------------------------ | -------- | +| input_param | 配置dump数据文件及目录。配置参数包括:
- "npu_pkl_path":指定NPU dump目录下的.pkl文件。参数示例:"npu_pkl_path": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl"。必选。
- "bench_pkl_path":指定CPU、GPU或NPU dump目录下的.pkl文件。参数示例:"bench_pkl_path": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl"。必选。
- "npu_dump_data_dir":指定NPU dump目录下的dump数据目录。
- "bench_dump_data_dir":指定CPU、GPU或NPU dump目录下的dump数据目录。参数示例:"bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump"。必选。
- "is_print_compare_log":配置是否开启日志打屏。可取值True或False。可选。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.csv`。 | 是 | +| stack_mode | 配置stack_mode的开关。仅当dump数据时配置set_dump_switch的mode="api_stack"时需要开启。参数示例:stack_mode=True,默认为False。 | 否 | +| auto_analyze | 自动精度分析,开启后工具自动针对比对结果进行分析,识别到第一个精度不达标节点(在比对结果文件中的“Accuracy Reached or Not”列显示为No),并给出问题可能产生的原因(打屏展示并生成advisor_{timestamp}.txt文件)。可取值True或False,参数示例:auto_analyze=False,默认为True。 | 否 | +| suffix | 标识比对结果的文件名。配置的suffix值在比对结果文件名的compare_result和{timestamp}中间插入,例如:`compare_result_{suffix}_{timestamp}`。默认为空。 | 否 | +| fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。可取值True或False,参数示例:fuzzy_match=True,默认为False。 | 否 | + +**函数示例** + +单机单卡场景下创建比对脚本,例如compare.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +dump_result_param={ +"npu_pkl_path": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl", +"bench_pkl_path": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl", +"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump", +"bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump", +"is_print_compare_log": True +} +compare(dump_result_param, "./output", stack_mode=True) +``` + +### parse + +**功能说明** + +解析并提取dump信息中的堆栈信息及数据统计信息。 + +**函数原型** + +```python +parse(pkl_file, moudule_name_prefix) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------------- | ------------------------------------------------------------ | -------- | +| pkl_file | 指定dump数据文件中的pkl文件名。参数示例:"./npu_dump/ptdbg_dump_v2.0/rank0/dump.pkl"。 | 是 | +| moudule_name_prefix | 指定待提取的API接口前缀。参数示例:"Torch_norm_1_forward"。 | 是 | + +**函数示例** + +创建堆栈信息及数据统计信息提取脚本,例如parse.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```python +from ptdbg_ascend import * +parse("./npu_dump/ptdbg_dump_v2.0/rank0/dump.pkl", "Torch_batch_normal_1_forward") +``` + +### 计算精度评价指标 + +PyTorch精度比对是以CPU或GPU的计算结果为标杆,计算Cosine(余弦相似度)、MaxAbsErr(最大绝对误差)和MaxRelativeErr(最大相对误差),根据这两个结果判断API在运行时是否存在精度问题。 + +计算精度评价指标: + 
+1. Cosine:通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0。 + +2. MaxAbsErr:当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001。 + +3. MaxRelativeErr:当最大相对误差越接近0表示其计算的误差越小。 + + 当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象。 + +精度比对结果csv文件中只需要通过Accuracy Reached or Not来判断计算精度是否达标,判断标准如下: + +1. Cosine < 0.99 且 MaxAbsError > 0.001时,精度不达标,标记为“No”。 +2. Cosine < 0.9,精度不达标,标记为“No”。 +3. MaxAbsError > 1,精度不达标,标记为“No”。 +4. 其余情况下记为精度达标,标记为“Yes”。 + +## ptdbg_ascend.parse数据解析功能 + +ptdbg_ascend.parse为命令行交互式界面解析工具,提供更多的数据解析功能并且展示结果。 + +主要的使用场景包括: + +- 支持指定ACL层级算子数据比对。 +- 支持指定ACL层级算子数据转换及展示。 +- 支持交互式指定pkl文件中API对应dump数据查看。 +- 支持API进行可选层级比对和打印(统计级和像素级)。 + +安装ptdbg_ascend工具后,可以通过使用命令 **python -m ptdbg_ascend.parse** 进入交互式界面,可在parse的界面中执行Shell命令,以及上述场景的相关解析命令。Ctrl+C可以退出该界面。 + +### ACL层级算子数据比对 + +- 依赖:CANN包中的msaccucmp工具。 + +- 输入以下比对命令进行数据比对。 + + ```bash + vc -m [*my_dump_path*] -g [*golden_dump_path*] (-out) [*output_path*] + ``` + + | 参数名称 | 说明 | 是否必选 | + | -------- | ------------------------------------------------------------ | -------- | + | -m | 待比对dump数据目录。 | 是 | + | -g | dump数据目录。 | 是 | + | -out | 结果输出目录。 | 否 | + | -asc | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py。 | 否 | + + - 输出结果:result_{timestamp}.csv文件。 + - 若指定-out参数需要用户传入输出路径,并且路径需要已存在。 + - 若未指定输出目录或指定目录不存在, 则比对结束后将结果保存在默认目录 “./parse_data/comapre_result”中,比对结束后会打印log提示输出结果存放路径。 + +**示例** + +```bash +# 传入待比对数据目录以及标杆数据目录 +Parse >>> vc -m ./my_dump_path -g ./golden_data_path +...... +# 比对结果打印 +[INFO] The comparison result have been written to "./parse_data/compare_result/result_20230818104735.csv". +[INFO] The command was completed and took 6 seconds. +[INFO] Compare finished!! 
+``` + +### ACL算子数据的npy转换 + +- 依赖:CANN包中的msaccucmp工具。 + +- 输入以下转换命令进行数据转换, 将ACL级别dump数据转为npy文件。 + + ```bash + dc -n [*file_name/file_path*] (-out) [*output_path*] + ``` + + | 参数名称 | 说明 | 是否必选 | + | -------- | ------------------------------------------------------------ | -------- | + | -n | 需转换的dump数据文件或dump数据文件目录。 | 是 | + | -out | 结果输出目录。 | 否 | + | -asc | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py | 否 | + + [^]: 若传入单个dump文件,则转换单个文件,若传入dump文件目录则转换目录下所有dump文件。 + + - 输出结果:npy文件。 + - 若指定-out参数需要用户传入输出路径,并且路径需要已存在。 + - 若未指定输出目录或指定目录不存在, 则比对结束后将结果保存在默认目录 “./parse_data/convert_result”中,比对结束后会打印log提示输出结果存放路径及转换结果。 + +- 输入以下命令,展示npy数据统计信息。 + + ```bash + pt -n [*file_path*] + ``` + + | 参数名称 | 说明 | 是否必选 | + | -------- | ------------- | -------- | + | -n | npy文件路径。 | 是 | + + 打印统计信息:shape, dtype, max, min和mean。 + +**示例1** + +```bash +# 传入需转换的dump文件目录 +Parse >>> dc -n ./dump_data/ +...... +# 转换结果 +╭──────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ SrcFile: ./dump_data/ +│ - Add.fp32_vars_add_2fp32_vars_Relu_9.31.5.1636595794731103.input.0.npy │ +│ - Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.output.0.npy │ +│ - Add.fp32_vars_add_2fp32_vars_Relu_9.31.5.1636595794731103.input.1.npy │ +│ - Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.1.npy │ +│ - Add.fp32_vars_add_3fp32_vars_Relu_12.40.5.1636595794846124.input.1.npy │ +│ - Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.0.npy │ +│ - Add.fp32_vars_add_3fp32_vars_Relu_12.40.5.1636595794846124.input.0.npy │ +│ - Add.fp32_vars_add_2fp32_vars_Relu_9.31.5.1636595794731103.output.0.npy │ +│ - Add.fp32_vars_add_3fp32_vars_Relu_12.40.5.1636595794846124.output.0.npy │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + +**示例2** + +```bash +# 查看某个dump数据块的数据信息 +# 默认会将数据中的tensor保存成 txt +Parse >>> pt -n 
./parse_data/dump_convert/Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.output.0.npy +...... +# 打印统计信息 +[Shape: (1, 16, 56, 56, 16)] [Dtype: float16] [Max: 452.0] [Min: -408.5] [Mean: -3.809] +Path: ./parse_data/dump_convert/Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.0.npy +TextFile:./parse_data/dump_convert/Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.0.npy.txt +``` + +### pkl文件中指定API的dump数据信息查看 + +- 输入以下命令,解析并输出pkl文件中指定api的统计信息。 + + ```bash + pk -f [*pkl_path*] -n [*api_name*] + ``` + + | 参数名称 | 说明 | 是否必选 | + | -------- | ----------------- | -------- | + | -f | 指定pkl文件路径。 | 是 | + | -n | 指定API名称。 | 是 | + + - 输出结果:打印统计信息(shape, dtype, max和min mean)。 + - 若pkl文件中存在相应的堆栈信息,则会打印堆栈信息。 + +**示例** + +```bash +# 传入pkl文件及api名称 +Parse >>> pk -f ./torch_dump/ptdbg_v3.2/rank0/api_stack_dump.pkl -n Functional_conv2d_0_forward +...... +# 打印统计信息及堆栈(pkl文件不包含堆栈则不会打印堆栈) + +Statistic Info: + [Functional_conv2d_0_forward_input.0][dtype: torch.float32][shape: [2, 1, 2, 2]][max: 1.576936960220337][min: -0.9757485389709473][mean: 0.4961632490158081] + [Functional_conv2d_0_forward_input.1][dtype: torch.float32][shape: [2, 1, 2, 2]][max: 0.20064473152160645][min: -0.47102075815200806][mean: -0.20796933770179749] + [Functional_conv2d_0_forward_input.2][dtype: torch.float32][shape: [2]][max: 0.17380613088607788][min: -0.16853803396224976][mean: 0.0026340484619140625] + [Functional_conv2d_0_forward_output][dtype: torch.float32][shape: [2, 2, 1, 1]][max: 0.02364911139011383][min: -1.762906551361084][mean: -0.6710853576660156] +``` + +### API可选层级比对 + +- 输入以下命令, 进行统计级和像素级比对。 + + ```bash + cn -m [*my_data *.npy*] -g [*gloden *.npy*] (-p) [*num*] (-al) [*atol*] (-rl) [*rtol*] + ``` + + - 统计级比对:对tensor整体进行余弦值及相对误差的计算。 + - 像素级比对:对输入的两个npy文件进行逐元素比对。若两个tensor对应元素的相对误差或绝对误差大于**误差阈值**(-al和-rl配置)则被标记为错误数据。 + + | 参数名称 | 说明 | 是否必选 | + | -------- | ----------------------------------------------- | -------- | + | -m | 待比对数据。 | 是 | + | -g | 标杆数据。 | 是 | + | -p 
| 设置比对结束后打印错误元素的个数,默认值20。 | 否 | + | -al | 判定数据存在精度问题的绝对误差阈值,默认0.001。 | 否 | + | -rl | 判定数据存在精度问题的相对误差阈值,默认0.001。 | 否 | + | -s | 将npy文件保存成txt文件,用于查看,默认开启。 | 否 | + + 输出结果: + + - 统计级比对结果。 + - 两个文件的统计信息(shape, dtype, max, min和mean)。 + - 错误数据打印表格。 + +**示例** + +```bash +# 对比两个tensor的数据 +Parse >>> cn -m Add.InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.323.1619494134703053.output.0.npy -g InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.0.1619492699305998.npy -p 10 -s -al 0.002 -rl 0.005 + Error Item Table Top Item Table +┏━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┏━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ +┃ Index ┃ Left ┃ Right ┃ Diff ┃ ┃ Index ┃ Left ┃ Right ┃ Diff ┃ +┡━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ ┡━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ +│ 155 │ 0.024600908 │ 0.022271132 │ 0.002329776 │ │ 0 │ -0.9206961 │ -0.9222216 │ 0.0015255213 │ +│ 247 │ 0.015752593 │ 0.017937578 │ 0.0021849852 │ │ 1 │ -0.6416973 │ -0.64051837 │ 0.0011789203 │ +│ 282 │ -0.0101207765 │ -0.007852031 │ 0.0022687456 │ │ 2 │ -0.35383835 │ -0.35433492 │ 0.0004965663 │ +│ 292 │ 0.019581757 │ 0.02240482 │ 0.0028230622 │ │ 3 │ -0.18851271 │ -0.18883198 │ 0.00031927228 │ +│ 640 │ -0.06593232 │ -0.06874806 │ 0.0028157383 │ │ 4 │ -0.43508735 │ -0.43534422 │ 0.00025686622 │ +│ 1420 │ 0.09293677 │ 0.09586689 │ 0.0029301196 │ │ 5 │ 1.4447614 │ 1.4466647 │ 0.0019032955 │ +│ 1462 │ -0.085207745 │ -0.088047795 │ 0.0028400496 │ │ 6 │ -0.3455438 │ -0.3444429 │ 0.0011008978 │ +│ 1891 │ -0.03433288 │ -0.036525503 │ 0.002192624 │ │ 7 │ -0.6560242 │ -0.6564579 │ 0.0004336834 │ +│ 2033 │ 0.06828873 │ 0.07139922 │ 0.0031104907 │ │ 8 │ -2.6964858 │ -2.6975214 │ 0.0010356903 │ +│ 2246 │ -0.06376442 │ -0.06121233 │ 0.002552092 │ │ 9 │ -0.73746175 │ -0.73650354 │ 0.00095820427 │ +└───────┴───────────────┴──────────────┴──────────────┘ └───────┴─────────────┴─────────────┴───────────────┘ 
+╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ Left: | +│ |- NpyFile: ./dump/temp/decode/Add.InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.323.1619494134703053.output.0.npy | +│ |- TxtFile: ./dump/temp/decode/Add.InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.323.1619494134703053.output.0.npy.txt | +│ |- NpySpec: [Shape: (32, 8, 8, 320)] [Dtype: float32] [Max: 5.846897] [Min: -8.368301] [Mean: -0.72565556] | +│ DstFile: │ +│ |- NpyFile: ./dump/cpu/InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.0.1619492699305998.npy | +│ |- TxtFile: ./dump/cpu/InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.0.1619492699305998.npy.txt | +│ |- NpySpec: [Shape: (32, 8, 8, 320)] [Dtype: float32] [Max: 5.8425903] [Min: -8.374472] [Mean: -0.7256237] │ +│ NumCnt: 655360 │ +│ AllClose: False │ +│ CosSim: 0.99999493 │ +│ ErrorPer: 0.023504638671875 (rl= 0.005, al= 0.002) │ +╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + +## FAQ + +[FAQ](https://gitee.com/kun_8/att_1/blob/master/debug/accuracy_tools/ptdbg_ascend/doc/FAQ.md) diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/rank_id\350\216\267\345\217\226\346\226\271\346\263\225.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/rank_id\350\216\267\345\217\226\346\226\271\346\263\225.md" new file mode 100644 index 0000000000000000000000000000000000000000..f084d990308b7adf86af49e919e872ab948ed817 --- /dev/null +++ "b/debug/accuracy_tools/ptdbg_ascend/doc/rank_id\350\216\267\345\217\226\346\226\271\346\263\225.md" @@ -0,0 +1,38 @@ +# rank_id获取方法 + +## **通过环境变量获取** + +当前进程的rank_id可能保存在环境变量中,比如`LOCAL_RANK`。则可以通过如下示例来检查当前进程的rank_id: + +```python +import os +print("Local rank is: ", os.environ.get('LOCAL_RANK')) +``` + +若打印结果显示该环境变量被配置过,如: + +```python +# 以单机8卡为例 +Local rank is: 0 +Local rank is: 2 +Local rank is: 3 +Local rank is: 1 +Local rank is: 4 +Local 
rank is: 5
+Local rank is: 6
+Local rank is: 7
+```
+
+那么将该环境变量作为rank传参即可自动获取到rank_id,如:
+
+```python
+register_hook(model, acc_cmp_dump, rank=os.environ.get('LOCAL_RANK'))
+```
+
+## **通过命令行参数获取**
+
+若通过命令行参数传入rank_id,比如`--local_rank`。那么可以在代码中找到`args.local_rank` 来作为rank参数值。如:
+
+```python
+register_hook(model, acc_cmp_dump, rank=args.local_rank)
+```
diff --git "a/debug/accuracy_tools/ptdbg_ascend/doc/\345\217\215\345\220\221ACL dump\347\224\250\344\276\213\350\257\264\346\230\216.md" "b/debug/accuracy_tools/ptdbg_ascend/doc/\345\217\215\345\220\221ACL dump\347\224\250\344\276\213\350\257\264\346\230\216.md"
new file mode 100644
index 0000000000000000000000000000000000000000..bf8989e8a47bfb919619c8f877fe38b1b84c4215
--- /dev/null
+++ "b/debug/accuracy_tools/ptdbg_ascend/doc/\345\217\215\345\220\221ACL dump\347\224\250\344\276\213\350\257\264\346\230\216.md"
@@ -0,0 +1,42 @@
+### 反向ACL dump用例说明
+
+当前昇腾AI处理器上的PyTorch框架通过torch_npu.npu中的init_dump(),set_dump()和finalize_dump()接口来进行ACL级别的数据采集。首先init_dump()会进行初始化dump配置,然后通过set_dump()接口传入配置文件来配置dump参数,最后通过finalize_dump来结束dump。 下面将以torch.sort运算的反向过程为例,介绍反向ACL数据dump的方法。
+
+```python
+import numpy as np
+import torch
+import torch_npu
+torch_npu.npu.set_device("npu:0")
+input = torch.tensor(np.load(input_path)).requires_grad_().npu()
+grad = torch.tensor(np.load(grad_path)).requires_grad_().npu()
+b, c = torch.sort(input)
+torch_npu.npu.init_dump()
+torch_npu.npu.set_dump("dump.json")
+torch_npu.npu.synchronize()
+b.backward(grad)
+torch_npu.npu.synchronize()
+torch_npu.npu.finalize_dump()
+```
+
+- input_path是该API前向运算的输入,可以通过ACL dump的API名称获得。如想要对Torch_sort_0_backward进行ACL dump,则该反向API对应的前向过程输入为Torch_sort_0_forward_input.0.npy。
+- grad_path是该API反向运算的输入,同理可以通过期望dump的API名称获得。
+
+- b, c是torch.sort的输出,分别表示排序后的tensor和排序后tensor中的各元素在原始tensor中的位置。对torch.sort进行反向时,需要对b进行backward。
+
+**dump.json配置**
+
+```json
+{
+ "dump":
+ {
+ "dump_list": [],
+ "dump_path": "/home/HwHiAiUser/dump/output",
+ "dump_mode": "all",
+ "dump_op_switch": "on"
+
} +} +``` + +**查看dump数据** + +采集的dump数据会在{dump_path}/{time}/{device_id}/{model_id}/{data_index}目录下生成。 diff --git a/debug/accuracy_tools/ptdbg_ascend/figures/advisor_summary.png b/debug/accuracy_tools/ptdbg_ascend/figures/advisor_summary.png new file mode 100644 index 0000000000000000000000000000000000000000..317584f559dd8557bb29773428b33c4962bfd8a3 Binary files /dev/null and b/debug/accuracy_tools/ptdbg_ascend/figures/advisor_summary.png differ diff --git a/debug/accuracy_tools/ptdbg_ascend/figures/auto_analyze_log.png b/debug/accuracy_tools/ptdbg_ascend/figures/auto_analyze_log.png new file mode 100644 index 0000000000000000000000000000000000000000..999b47f97ef5661316c7e61dbdc93c87996259f3 Binary files /dev/null and b/debug/accuracy_tools/ptdbg_ascend/figures/auto_analyze_log.png differ diff --git a/debug/accuracy_tools/ptdbg_ascend/figures/compare_struct.png b/debug/accuracy_tools/ptdbg_ascend/figures/compare_struct.png new file mode 100644 index 0000000000000000000000000000000000000000..4faf1dd73f48082e8b2e5876f92bac451ddd9558 Binary files /dev/null and b/debug/accuracy_tools/ptdbg_ascend/figures/compare_struct.png differ diff --git a/debug/accuracy_tools/ptdbg_ascend/figures/h5_file_struct.png b/debug/accuracy_tools/ptdbg_ascend/figures/h5_file_struct.png new file mode 100644 index 0000000000000000000000000000000000000000..dff8ccaa4a8e04c061f08d08464194c34b9b9c10 Binary files /dev/null and b/debug/accuracy_tools/ptdbg_ascend/figures/h5_file_struct.png differ diff --git a/debug/accuracy_tools/ptdbg_ascend/figures/module_compare.png b/debug/accuracy_tools/ptdbg_ascend/figures/module_compare.png new file mode 100644 index 0000000000000000000000000000000000000000..2e1ea564eb191807034afd8aceac92b29b62a086 Binary files /dev/null and b/debug/accuracy_tools/ptdbg_ascend/figures/module_compare.png differ diff --git a/debug/accuracy_tools/ptdbg_ascend/figures/op_compare.png b/debug/accuracy_tools/ptdbg_ascend/figures/op_compare.png new file mode 100644 index 
0000000000000000000000000000000000000000..e2b0459196d7545d915649d81bb2cf12e7f034bc Binary files /dev/null and b/debug/accuracy_tools/ptdbg_ascend/figures/op_compare.png differ diff --git a/debug/accuracy_tools/ptdbg_ascend/img/module_compare.png b/debug/accuracy_tools/ptdbg_ascend/img/module_compare.png new file mode 100644 index 0000000000000000000000000000000000000000..2e1ea564eb191807034afd8aceac92b29b62a086 Binary files /dev/null and b/debug/accuracy_tools/ptdbg_ascend/img/module_compare.png differ diff --git a/debug/accuracy_tools/ptdbg_ascend/img/op_compare.png b/debug/accuracy_tools/ptdbg_ascend/img/op_compare.png new file mode 100644 index 0000000000000000000000000000000000000000..e2b0459196d7545d915649d81bb2cf12e7f034bc Binary files /dev/null and b/debug/accuracy_tools/ptdbg_ascend/img/op_compare.png differ diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/MANIFEST.in b/debug/accuracy_tools/ptdbg_ascend/src/python/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..4d9d5ffe4a256e8d4063df82568223185455a93c --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/MANIFEST.in @@ -0,0 +1,3 @@ +recursive-include * *.py +recursive-include * *.yaml +recursive-include * *.template diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1fedc0b746be14bd9abfec02538415f361fe5b68 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/__init__.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" + +from .compare.acc_compare import compare, parse +from .compare.distributed_compare import compare_distributed +from .dump.dump import acc_cmp_dump +from .overflow_check.overflow_check import overflow_check +from .overflow_check.utils import set_overflow_check_switch +from .dump.utils import set_dump_path, set_dump_switch, set_backward_input +from .hook_module.register_hook import register_hook +from .common.utils import seed_all +from .common.version import __version__ +from .debugger.debugger_config import DebuggerConfig +from .debugger.precision_debugger import PrecisionDebugger +seed_all() + +__all__ = ["register_hook", "set_dump_path", "set_dump_switch", "set_overflow_check_switch", "seed_all", + "acc_cmp_dump", "overflow_check", "compare", "parse", "compare_distributed", "set_backward_input", + "PrecisionDebugger", "DebuggerConfig"] diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py new file mode 100644 index 0000000000000000000000000000000000000000..4f597b7c4df6095ea9b9fb6e31fa8d831a53d70f --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os +import pandas as pd + +from .advisor_result import AdvisorResult +from .advisor_const import AdvisorConst +from ..common import utils +from ..common.utils import CompareException, CompareConst, Const +from ..common.utils import print_info_log, print_warn_log, print_error_log + + +class Advisor: + """ + Class for generate advisor + """ + + def __init__(self, input_file, out_path=""): + self.input_file = os.path.realpath(input_file) + self.out_path = os.path.realpath(out_path) + + def _parse_input_file(self): + if not self.input_file.endswith(".csv"): + print_error_log("Advisor only support csv file from ptdbg_ascend result.") + raise CompareException(CompareException.INVALID_FILE_ERROR) + try: + df = pd.read_csv(self.input_file, on_bad_lines='skip') + except OSError as os_err: + print_error_log('Failed to parse the input file %s. %s' + % (self.input_file, str(os_err))) + raise CompareException(CompareException.PARSE_FILE_ERROR) from os_err + data_columns = df.columns.values + if not {CompareConst.ACCURACY, CompareConst.NPU_NAME}.issubset(data_columns): + print_error_log('Compare result file does not contain %s, %s columns.' 
% (CompareConst.ACCURACY, + CompareConst.NPU_NAME)) + raise CompareException(CompareException.INVALID_FILE_ERROR) + df.reset_index(inplace=True) + # The value of index is consistent with the line number of csv, csv file first line is 2 + df.iloc[:, 0] += 2 + return df + + def _check_result_file(self): + utils.check_file_or_directory_path(self.input_file) + utils.check_file_size(self.input_file, Const.ONE_GB) + + @staticmethod + def filter_data(pd_data): + """ + filter some apis cannot be fixed + """ + result = pd_data[~pd_data[CompareConst.NPU_NAME].str.contains(AdvisorConst.BATCH_NORM)] + return result + + @staticmethod + def gen_advisor_message(node_name): + if AdvisorConst.FORWARD in node_name: + if AdvisorConst.INPUT in node_name: + message = AdvisorConst.FORWARD_INPUT_SUGGEST + else: + message = AdvisorConst.FORWARD_OUTPUT_SUGGEST + else: + if AdvisorConst.INPUT in node_name: + message = AdvisorConst.BACKWARD_INPUT_SUGGEST + else: + message = AdvisorConst.BACKWARD_OUTPUT_SUGGEST + return message + + def gen_advisor_result(self, pd_data): + first_failing_data = pd_data.iloc[0] + node_name = first_failing_data[CompareConst.NPU_NAME] + index = first_failing_data['index'] + message = self.gen_advisor_message(node_name) + print_warn_log("Find %s accuracy not reached, the line is %s" % (node_name, index)) + result = AdvisorResult(node_name, index, message) + return result + + def analyze_unmatched(self, analyze_data): + accuracy_unmatched = analyze_data[analyze_data[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_UNMATCH] + num_unmatch = len(accuracy_unmatched) + if num_unmatch != 0: + for i in range(len(accuracy_unmatched)): + item = analyze_data.iloc[i] + print_warn_log("The tensor name matches but the shape or dtype does not match: {}"\ + .format(item[CompareConst.NPU_NAME])) + + def analysis(self): + self._check_result_file() + analyze_data = self._parse_input_file() + print_info_log("Start analyzing the comparison result: %s" % self.input_file) + 
self.analyze_unmatched(analyze_data) + accuracy_not_reached = analyze_data[analyze_data[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_NO] + failing_data = self.filter_data(accuracy_not_reached) + if failing_data.empty: + print_info_log("All data from api input/output accuracy reached") + result = AdvisorResult(AdvisorConst.NO_ERROR_API, AdvisorConst.NO_ERROR_API, AdvisorConst.NO_ERR_SUGGEST) + else: + result = self.gen_advisor_result(failing_data) + message_list = result.print_advisor_log() + result.gen_summary_file(self.out_path, message_list) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_const.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_const.py new file mode 100644 index 0000000000000000000000000000000000000000..1a6b5ea4b775235adc5e6c34e2f93be367ae3bf6 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_const.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + + +class AdvisorConst: + """ + Class for advisor const + """ + + # text symbol + NEW_LINE = "\n" + COLON = ": " + + # advisor summary key + SUSPECT_NODES = "Suspect Nodes" + LINE = "Line" + ADVISOR_SUGGEST = "Expert Advice" + + NO_ERROR_API = "NA" + + # advisor message + NO_ERR_SUGGEST = "All data in comparison result meets the accuracy requirements." + FORWARD_INPUT_SUGGEST = "1. Analyze the model to view the input source.\n" \ + "2. Check whether an inplace API causes the output result to overwrite the input result. That is, the fault is actually caused by a computation error.\n" \ + "3. The fault may be caused by memory corruption and further analysis is required." + FORWARD_OUTPUT_SUGGEST = "This is a forward API computation error. Check the computation implementation." + BACKWARD_INPUT_SUGGEST = "Check whether the forward computation result is affected." + BACKWARD_OUTPUT_SUGGEST = "This is a backward API computation error. Check the computation implementation." + + # cannot be fixed api + BATCH_NORM = "batch_norm" + + # name keyword + INPUT = "input" + OUTPUT = "output" + FORWARD = "forward" + BACKWARD = "backward" diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py new file mode 100644 index 0000000000000000000000000000000000000000..95a6774763c5d70dccd0dc67131c329dd1d15480 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/advisor/advisor_result.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import os +import time + +from .advisor_const import AdvisorConst +from ..common.utils import Const +from ..common.utils import print_info_log, print_error_log + + +class AdvisorResult: + """ + Class for generate advisor result + """ + + def __init__(self, node, line, message): + self.suspect_node = node + self.line = line + self.advisor_message = message + + @staticmethod + def gen_summary_file(out_path, message_list): + file_name = 'advisor_{}.txt'.format(time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + result_file = os.path.join(out_path, file_name) + try: + with os.fdopen(os.open(result_file, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as output_file: + output_file.truncate(0) + message_list = [message + AdvisorConst.NEW_LINE for message in message_list] + output_file.writelines(message_list) + except IOError as io_error: + print_error_log("Failed to save %s, the reason is %s." 
% (result_file, io_error)) + else: + print_info_log("The advisor summary is saved in: %s" % result_file) + + def print_advisor_log(self): + print_info_log("The summary of the expert advice is as follows: ") + message_list = [AdvisorConst.LINE + AdvisorConst.COLON + str(self.line), + AdvisorConst.SUSPECT_NODES + AdvisorConst.COLON + self.suspect_node, + AdvisorConst.ADVISOR_SUGGEST + AdvisorConst.COLON + self.advisor_message] + for message in message_list: + print_info_log(message) + return message_list diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/compare_script.template b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/compare_script.template new file mode 100644 index 0000000000000000000000000000000000000000..91565b3c87fa504ca96e7ebfd03f140f648a64c7 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/compare_script.template @@ -0,0 +1,14 @@ +from ptdbg_ascend import compare + +pkl_path = "%s" +dump_data_dir = "%s" + +dump_path_param = { + "npu_pkl_path": , + "bench_pkl_path": , + "npu_dump_data_dir": , + "bench_dump_data_dir": , + "is_print_compare_log": True +} + +compare(dump_path_param, output_path="", stack_mode=%s) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fa66cafaa1269a643df4437c8e2c4841f6267b9e --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py @@ -0,0 +1,649 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import collections +import os +import random +import re +import shutil +import stat +import subprocess +import sys +import time +from datetime import datetime, timezone +from functools import wraps +from pathlib import Path +import numpy as np +import torch + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +torch_without_guard_version_list = ['2.1'] +for version in torch_without_guard_version_list: + if torch.__version__.startswith(version): + torch_without_guard_version = True + break + else: + torch_without_guard_version = False + +if not is_gpu and not torch_without_guard_version: + from torch_npu.utils.device_guard import torch_device_guard as torch_npu_device_guard + +device = collections.namedtuple('device', ['type', 'index']) +prefixes = ['api_stack', 'list', 'range', 'acl'] + + +class Const: + """ + Class for const + """ + MODEL_TYPE = ['.onnx', '.pb', '.om'] + DIM_PATTERN = r"^(-?[0-9]+)(,-?[0-9]+)*" + SEMICOLON = ";" + COLON = ":" + EQUAL = "=" + COMMA = "," + DOT = "." 
+ DUMP_RATIO_MAX = 100 + SUMMERY_DATA_NUMS = 256 + FLOAT_EPSILON = np.finfo(float).eps + SUPPORT_DUMP_MODE = ['api', 'acl'] + ON = 'ON' + OFF = 'OFF' + BACKWARD = 'backward' + FORWARD = 'forward' + + # dump mode + ALL = "all" + LIST = "list" + RANGE = "range" + STACK = "stack" + ACL = "acl" + API_LIST = "api_list" + API_STACK = "api_stack" + DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] + + API_PATTERN = r"^[A-Za-z0-9]+[_]+([A-Za-z0-9]+[_]*[A-Za-z0-9]+)[_]+[0-9]+[_]+[A-Za-z0-9]+" + WRITE_FLAGS = os.O_WRONLY | os.O_CREAT + WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR + + PKL_SUFFIX = ".pkl" + NUMPY_SUFFIX = ".npy" + ONE_GB = 1 * 1024 * 1024 * 1024 + TEN_GB = 10 * 1024 * 1024 * 1024 + FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$' + FILE_NAME_LENGTH = 255 + DIRECTORY_LENGTH = 4096 + + +class CompareConst: + """ + Class for compare module const + """ + # compare result column name + NPU_NAME = "NPU Name" + BENCH_NAME = "Bench Name" + NPU_DTYPE = "NPU Tensor Dtype" + BENCH_DTYPE = "Bench Tensor Dtype" + NPU_SHAPE = "NPU Tensor Shape" + BENCH_SHAPE = "Bench Tensor Shape" + NPU_MAX = "NPU max" + NPU_MIN = "NPU min" + NPU_MEAN = "NPU mean" + BENCH_MAX = "Bench max" + BENCH_MIN = "Bench min" + BENCH_MEAN = "Bench mean" + COSINE = "Cosine" + MAX_ABS_ERR = "MaxAbsErr" + MAX_RELATIVE_ERR = "MaxRelativeErr" + ACCURACY = "Accuracy Reached or Not" + STACK = "NPU_Stack_Info" + ERROR_MESSAGE = "Err_message" + + # compare result data + NAN = 'Nan' + SHAPE_UNMATCH = 'shape unmatched' + DTYPE_UNMATCH = 'dtype unmatched' + + # accuracy standards + COS_THRESHOLD = 0.99 + MAX_ABS_ERR_THRESHOLD = 0.001 + COS_MAX_THRESHOLD = 0.9 + MAX_ABS_ERR_MAX_THRESHOLD = 1 + ACCURACY_CHECK_YES = "Yes" + ACCURACY_CHECK_NO = "No" + ACCURACY_CHECK_UNMATCH = "Unmatched" + + # error message + NO_BENCH = "No bench data matched." 
+ + # compare const + FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble] + + +class VersionCheck: + """ + Class for TorchVersion + """ + V1_8 = "1.8" + V1_11 = "1.11" + V2_0 = "2.0" + V2_1 = "2.1" + + @staticmethod + def check_torch_version(version): + torch_version = torch.__version__ + if torch_version.startswith(version): + return True + else: + return False + + +class CompareException(Exception): + """ + Class for Accuracy Compare Exception + """ + NONE_ERROR = 0 + INVALID_PATH_ERROR = 1 + OPEN_FILE_ERROR = 2 + CLOSE_FILE_ERROR = 3 + READ_FILE_ERROR = 4 + WRITE_FILE_ERROR = 5 + INVALID_FILE_ERROR = 6 + PERMISSION_ERROR = 7 + INDEX_OUT_OF_BOUNDS_ERROR = 8 + NO_DUMP_FILE_ERROR = 9 + INVALID_DATA_ERROR = 10 + INVALID_PARAM_ERROR = 11 + INVALID_DUMP_RATIO = 12 + INVALID_DUMP_FILE = 13 + UNKNOWN_ERROR = 14 + INVALID_DUMP_MODE = 15 + PARSE_FILE_ERROR = 16 + INVALID_COMPARE_MODE = 17 + OVER_SIZE_FILE_ERROR = 18 + + def __init__(self, code, error_info: str = ""): + super(CompareException, self).__init__() + self.code = code + self.error_info = error_info + + def __str__(self): + return self.error_info + +class DumpException(CompareException): + pass + + +def make_dump_path_if_not_exists(dump_path): + # 之前应该已经验证过dump_path的上层文件夹存在 + dump_root, dump_dir = os.path.split(dump_path) + if not os.path.exists(dump_path): + Path(dump_path).mkdir(mode=0o750, exist_ok=True) + else: + if not os.path.isdir(dump_path): + print_error_log((f"{dump_path} already exists and is not a directory.")) + + +def _print_log(level, msg): + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) + pid = os.getgid() + print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg) + sys.stdout.flush() + + +def print_info_log(info_msg): + """ + Function Description: + print info log. + Parameter: + info_msg: the info message. 
+ """ + _print_log("INFO", info_msg) + + +def print_error_log(error_msg): + """ + Function Description: + print error log. + Parameter: + error_msg: the error message. + """ + _print_log("ERROR", error_msg) + + +def print_warn_log(warn_msg): + """ + Function Description: + print warn log. + Parameter: + warn_msg: the warning message. + """ + _print_log("WARNING", warn_msg) + + +def check_mode_valid(mode, scope=[], api_list=[]): + mode_check = { + Const.ALL: lambda: None, + Const.RANGE: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end].") if len(scope) != 2 else None, + Const.LIST: lambda: ValueError("set_dump_switch, scope param set invalid, it's should not be an empty list.") if len(scope) == 0 else None, + Const.STACK: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end] or [].") if len(scope) > 2 else None, + Const.ACL: lambda: ValueError("set_dump_switch, scope param set invalid, only one api name is supported in acl mode.") if len(scope) != 1 else None, + Const.API_LIST: lambda: ValueError("Current dump mode is 'api_list', but the content of api_list parameter is empty or valid.") if not isinstance(api_list, list) or len(api_list) < 1 else None, + Const.API_STACK: lambda: None, + } + if mode not in Const.DUMP_MODE: + msg = "Current mode '%s' is not supported. 
Please use the field in %s" % \ + (mode, Const.DUMP_MODE) + raise CompareException(CompareException.INVALID_DUMP_MODE, msg) + + if mode_check[mode]() is not None: + raise mode_check[mode]() + +def check_switch_valid(switch): + if switch not in ["ON", "OFF"]: + raise ValueError("Please set switch with 'ON' or 'OFF'.") + +def check_dump_mode_valid(dump_mode): + if not isinstance(dump_mode, list): + print_warn_log("Please set dump_mode as a list.") + dump_mode = [dump_mode] + if not all(mode in ["all", "forward", "backward", "input", "output"] for mode in dump_mode): + raise ValueError("Please set dump_mode as a list containing one or more of the following: 'all', 'forward', 'backward', 'input', 'output'.") + if 'input' not in dump_mode and 'output' not in dump_mode: + dump_mode.extend(['input', 'output']) + if 'forward' not in dump_mode and 'backward' not in dump_mode: + dump_mode.extend(['forward', 'backward']) + if 'all' in dump_mode or set(["forward", "backward", "input", "output"]).issubset(set(dump_mode)): + return ['all'] + return dump_mode + +def check_summary_only_valid(summary_only): + if not isinstance(summary_only, bool): + print_error_log("Params auto_analyze only support True or False.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return summary_only + +def check_compare_param(input_parma, output_path, stack_mode=False, auto_analyze=True, + fuzzy_match=False): # 添加默认值来让不传参时能通过参数检查 + if not (isinstance(input_parma, dict) and isinstance(output_path, str) + and isinstance(stack_mode, bool) and isinstance(fuzzy_match, bool)): + print_error_log("Invalid input parameters") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + if not isinstance(auto_analyze, bool): + print_error_log("Params auto_analyze only support True or False.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + check_file_or_directory_path(input_parma.get("npu_pkl_path"), False) + check_file_or_directory_path(input_parma.get("bench_pkl_path"), 
False) + check_file_or_directory_path(input_parma.get("npu_dump_data_dir"), True) + check_file_or_directory_path(input_parma.get("bench_dump_data_dir"), True) + check_file_or_directory_path(output_path, True) + npu_pkl = open(input_parma.get("npu_pkl_path"), "r") + bench_pkl = open(input_parma.get("bench_pkl_path"), "r") + check_file_mode(npu_pkl.name, bench_pkl.name, stack_mode) + _check_pkl(npu_pkl, input_parma.get("npu_pkl_path")) + _check_pkl(bench_pkl, input_parma.get("bench_pkl_path")) + return npu_pkl, bench_pkl + + +def check_file_or_directory_path(path, isdir=False): + """ + Function Description: + check whether the path is valid + Parameter: + path: the path to check + isdir: the path is dir or file + Exception Description: + when invalid data throw exception + """ + if isdir: + if not os.path.exists(path): + print_error_log('The path {} is not exist.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not os.path.isdir(path): + print_error_log('The path {} is not a directory.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not os.access(path, os.W_OK): + print_error_log( + 'The path {} does not have permission to write. Please check the path permission'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + else: + if not os.path.isfile(path): + print_error_log('{} is an invalid file or non-exist.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + check_file_valid(path) + + if not os.access(path, os.R_OK): + print_error_log( + 'The path {} does not have permission to read. 
Please check the path permission'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def _check_pkl(pkl_file_handle, file_name): + tensor_line = pkl_file_handle.readline() + if len(tensor_line) == 0: + print_error_log("dump file {} have empty line!".format(file_name)) + raise CompareException(CompareException.INVALID_DUMP_FILE) + pkl_file_handle.seek(0, 0) + + +def is_starts_with(string, prefixes): + return any(string.startswith(prefix) for prefix in prefixes) + + +def check_file_mode(npu_pkl, bench_pkl, stack_mode): + npu_pkl_name = os.path.split(npu_pkl)[-1] + bench_pkl_name = os.path.split(bench_pkl)[-1] + + if not is_starts_with(npu_pkl_name, prefixes) and not is_starts_with(bench_pkl_name, prefixes): + if stack_mode: + print_error_log("The current file does not contain stack information, please turn off the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + elif is_starts_with(npu_pkl_name, prefixes) and is_starts_with(bench_pkl_name, prefixes): + if not stack_mode: + print_error_log("The current file contains stack information, please turn on the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + else: + print_error_log("The dump mode of the two files is not same, please check the dump files") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + + +def check_file_size(input_file, max_size): + try: + file_size = os.path.getsize(input_file) + except OSError as os_error: + print_error_log('Failed to open "%s". %s' % (input_file, str(os_error))) + raise CompareException(CompareException.INVALID_FILE_ERROR) + if file_size > max_size: + print_error_log('The size (%d) of %s exceeds (%d) bytes, tools not support.' 
def check_file_not_exists(file_path):
    """Remove file_path (file, link or directory) if it currently exists."""
    if os.path.exists(file_path) or os.path.islink(file_path):
        remove_path(file_path)


def remove_path(path):
    """Delete a file, link or directory tree; no-op when path does not exist."""
    if not os.path.exists(path):
        return
    try:
        if os.path.islink(path) or os.path.isfile(path):
            os.remove(path)
        else:
            shutil.rmtree(path)
    except PermissionError:
        print_error_log("Failed to delete {}. Please check the permission.".format(path))
        raise CompareException(CompareException.INVALID_PATH_ERROR)


def get_dump_data_path(dump_dir):
    """
    Function Description:
        traverse directories and obtain the absolute path of dump data
    Parameter:
        dump_dir: dump data directory
    Return Value:
        dump data path, and whether any file exists under it
    """
    check_file_or_directory_path(dump_dir, True)
    dump_data_path, file_is_exist = None, False
    # Stop at the first directory that actually contains files; otherwise the
    # last visited directory is reported with file_is_exist == False.
    for dir_path, _, files in os.walk(dump_dir):
        dump_data_path = dir_path
        if len(files) != 0:
            file_is_exist = True
            break
    return dump_data_path, file_is_exist


def get_api_name_from_matcher(name):
    """Extract the api name via Const.API_PATTERN; '' when there is no match."""
    match = re.compile(Const.API_PATTERN).match(name)
    return match.group(1) if match else ""


def modify_dump_path(dump_path, mode):
    """Prefix the dump file name with the mode unless mode is Const.ALL."""
    if mode == Const.ALL:
        return dump_path
    dir_name, base_name = os.path.split(dump_path)
    return os.path.join(dir_name, mode + "_" + base_name)


def create_directory(dir_path):
    """
    Function Description:
        creating a directory with specified permissions
    Parameter:
        dir_path: directory path
    Exception Description:
        when invalid data throw exception
    """
    if os.path.exists(dir_path):
        return
    try:
        os.makedirs(dir_path, mode=0o700)
    except OSError as ex:
        print_error_log(
            'Failed to create {}.Please check the path permission or disk space .{}'.format(dir_path, str(ex)))
        raise CompareException(CompareException.INVALID_PATH_ERROR)
def execute_command(cmd):
    """
    Function Description:
        run the given command (a list of argv tokens) and stream its output
    Parameter:
        cmd: command
    Exception Description:
        when invalid command throw exception
    """
    print_info_log('Execute command:%s' % cmd)
    process = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    while process.poll() is None:
        line = process.stdout.readline()
        line = line.strip()
        if line:
            print(line)
    if process.returncode != 0:
        print_error_log('Failed to execute command:%s' % " ".join(cmd))
        raise CompareException(CompareException.INVALID_DATA_ERROR)


def save_numpy_data(file_path, data):
    """Save `data` to file_path with np.save, creating parent dirs as needed.

    Fixed: uses exist_ok=True (no exists()/makedirs() race) and tolerates a
    bare file name, which previously crashed on makedirs('').
    """
    dir_name = os.path.dirname(file_path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    np.save(file_path, data)


def parse_arg_value(values):
    """
    parse dynamic arg value of atc cmdline, e.g. '1,2;3,4' -> [[1, 2], [3, 4]]
    """
    return [parse_value_by_comma(item) for item in values.split(Const.SEMICOLON)]


def parse_value_by_comma(value):
    """
    parse value by comma, like '1,2,4,8' -> [1, 2, 4, 8]; only non-negative
    integers and -1 are accepted
    """
    value_list = []
    for value_str in value.split(Const.COMMA):
        value_str = value_str.strip()
        if value_str.isdigit() or value_str == '-1':
            value_list.append(int(value_str))
        else:
            print_error_log("please check your input shape.")
            raise CompareException(CompareException.INVALID_PARAM_ERROR)
    return value_list


def get_data_len_by_shape(shape):
    """Product of the dims in shape; -1 (with an error log) when any dim is -1."""
    if -1 in shape:
        print_error_log("please check your input shape, one dim in shape is -1.")
        return -1
    data_len = 1
    for item in shape:
        data_len *= item
    return data_len


def add_time_as_suffix(name):
    """Append a '_YYYYmmddHHMMSS.csv' suffix (local time) to name."""
    return '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time())))


def get_time():
    """Current UTC time as 'YYYYmmdd_HHMMSS'."""
    return datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")


def format_value(value):
    """Format a number with six decimal places as a string."""
    return '{:.6f}'.format(value)
def torch_device_guard(func):
    """Wrap func with torch_npu's device guard on NPU; pass through unchanged
    on GPU or on torch versions that do not need the guard."""
    if is_gpu or torch_without_guard_version:
        return func
    # Parse args/kwargs matched torch.device objects

    @torch_npu_device_guard
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)
    return wrapper


def seed_all(seed=1234, mode=False):
    """Seed python, numpy and torch RNGs; `mode` toggles torch deterministic
    algorithms."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.use_deterministic_algorithms(mode)
    if is_gpu:
        torch.cuda.manual_seed_all(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        # Fixed: was `torch.backends.cudnn.enable = False`, which silently
        # created a new attribute instead of disabling cuDNN.
        torch.backends.cudnn.enabled = False
        torch.backends.cudnn.benchmark = False
    else:
        torch_npu.npu.manual_seed_all(seed)
        torch_npu.npu.manual_seed(seed)


def get_process_rank(model):
    """Best-effort lookup of the rank (device index) from a model's parameters.

    Returns (rank, found): (0, False) when the model has no parameters or
    lives on CPU, else (device.index, True).
    """
    print_info_log("Rank id is not provided. Trying to get the rank id of the model.")
    try:
        device = next(model.parameters()).device
    except StopIteration:
        print_warn_log('There is no parameter in the model. Fail to get rank id.')
        return 0, False
    if device.type == 'cpu':
        print_warn_log("Warning: the debugger is unable to get the rank id. "
                       "This may cause the dumpped data to be corrupted in the "
                       "case of distributed training. (You may ignore this if you are using only one card.) "
                       "Transfer the model to npu or gpu before register_hook() to avoid this warning.")
        return 0, False
    else:
        return device.index, True
" + "Transfer the model to npu or gpu before register_hook() to avoid this warning.") + return 0, False + else: + return device.index, True + + +def parameter_adapter(func): + + @wraps(func) + def inner(self, *args, **kwargs): + if self.op_name_ == "__getitem__" and len(args) > 1 and isinstance(args[1], torch.Tensor): + input = args[0] + indices = args[1] + if indices.dtype == torch.uint8: + indices = indices.bool() + if indices.dtype == torch.bool: + if indices.shape == input.shape: + return getattr(torch._C._VariableFunctionsClass, "masked_select")(input, indices) + else: + indices = getattr(torch._C._VariableFunctionsClass, "nonzero")(indices, as_tuple=True) + return getattr(torch._C._TensorBase, "__getitem__")(input, indices) + elif indices.dtype != torch.bool: + if len(indices.shape) == 1: + return func(self, input, indices.tolist()) + elif len(indices.shape) == 2: + result = [func(self, input, index) for index in indices.tolist()] + return getattr(torch._C._VariableFunctionsClass, "stack")(result, 0) + else: + res = [input[tensor_index] for tensor_index in indices] + return getattr(torch._C._VariableFunctionsClass, "stack")(res, 0) + return func(self, *args, **kwargs) + return inner + + +def generate_compare_script(dump_path, pkl_file_path, dump_switch_mode): + template_path = os.path.join(os.path.dirname(__file__), "compare_script.template") + pkl_dir = os.path.dirname(pkl_file_path) + compare_script_path = os.path.join(pkl_dir, "compare_data.py") + is_api_stack = "True" if dump_switch_mode == Const.API_STACK else "False" + + try: + with open(template_path, 'r') as ftemp, \ + os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout: + code_temp = ftemp.read() + fout.write(code_temp % (pkl_file_path, dump_path, is_api_stack)) + except OSError: + print_error_log(f"Failed to open file. 
def check_is_npu():
    """True when running on NPU (i.e. not a GPU environment)."""
    return not is_gpu


def check_file_valid(file_path):
    """Reject soft links, over-long paths, paths with special characters and
    oversized pkl (>1GB) / numpy (>10GB) files; raises CompareException."""
    if os.path.islink(file_path):
        print_error_log('The file path {} is a soft link.'.format(file_path))
        raise CompareException(CompareException.INVALID_PATH_ERROR)

    real_path = os.path.realpath(file_path)
    too_long = (len(real_path) > Const.DIRECTORY_LENGTH
                or len(os.path.basename(file_path)) > Const.FILE_NAME_LENGTH)
    if too_long:
        print_error_log('The file path length exceeds limit.')
        raise CompareException(CompareException.INVALID_PATH_ERROR)

    if not re.match(Const.FILE_PATTERN, real_path):
        print_error_log('The file path {} contains special characters.'.format(file_path))
        raise CompareException(CompareException.INVALID_PATH_ERROR)

    if os.path.isfile(file_path):
        file_size = os.path.getsize(file_path)
        if file_path.endswith(Const.PKL_SUFFIX) and file_size > Const.ONE_GB:
            print_error_log('The file {} size is greater than 1GB.'.format(file_path))
            raise CompareException(CompareException.INVALID_PATH_ERROR)
        if file_path.endswith(Const.NUMPY_SUFFIX) and file_size > Const.TEN_GB:
            print_error_log('The file {} size is greater than 10GB.'.format(file_path))
            raise CompareException(CompareException.INVALID_PATH_ERROR)
def correct_data(result):
    """Snap cosine values above 0.99999 to '1.0'; NAN passes through untouched."""
    if result == CompareConst.NAN:
        return result
    return '1.0' if float(result) > 0.99999 else result


def cosine_similarity(n_value, b_value):
    """Cosine similarity between two flat arrays; returns (value, message).

    Scalars are unsupported; an all-zero side or NaN data yields
    CompareConst.NAN plus an explanatory message.
    """
    np.seterr(divide='ignore', invalid='ignore')
    if len(n_value) == 1:
        return "unsupported", "This tensor is scalar."
    num = n_value.dot(b_value)
    a_norm = np.linalg.norm(n_value)
    b_norm = np.linalg.norm(b_value)
    if a_norm <= Const.FLOAT_EPSILON and b_norm <= Const.FLOAT_EPSILON:
        return '1.0', ''
    if a_norm <= Const.FLOAT_EPSILON:
        return CompareConst.NAN, 'Cannot compare by Cosine Similarity, All the data is Zero in npu dump data.'
    if b_norm <= Const.FLOAT_EPSILON:
        return CompareConst.NAN, 'Cannot compare by Cosine Similarity, All the data is Zero in Bench dump data.'
    cos = num / (a_norm * b_norm)
    if np.isnan(cos):
        return CompareConst.NAN, 'Cannot compare by Cosine Similarity, the dump data has NaN.'
    return correct_data(format_value(cos)), ''
def get_rmse(n_value, b_value):
    """Root-mean-square error between the two flat arrays; NAN marker on nan."""
    rmse = np.linalg.norm(n_value - b_value) / np.sqrt(len(n_value))
    if np.isnan(rmse):
        rmse = CompareConst.NAN
    return rmse, ""


def get_mape(n_value, b_value):
    """Mean absolute percentage error as 'x.xxxx%'; NAN marker on nan."""
    mape_val = np.sum(np.abs((n_value - b_value) / b_value)) / len(b_value) * 100
    mape = CompareConst.NAN if np.isnan(mape_val) else str(round(mape_val, 4)) + '%'
    return mape, ""


def get_max_abs_err(n_value, b_value):
    """Maximum absolute error, formatted to six decimal places."""
    max_value = np.max(np.abs(n_value - b_value))
    return format_value(max_value), ""


def get_max_relative_err(n_value, b_value):
    """Maximum relative error |n - b| / b, with eps substituted where b == 0.

    Fixed: the float branch now operates on copies, so the caller's arrays
    are no longer mutated in place by the eps adjustment.
    """
    np.seterr(divide='ignore', invalid='ignore')
    if b_value.dtype in CompareConst.FLOAT_TYPE:
        n_value, b_value = n_value.copy(), b_value.copy()
        zero_mask = (b_value == 0)
        b_value[zero_mask] += np.finfo(b_value.dtype).eps
        n_value[zero_mask] += np.finfo(b_value.dtype).eps
    else:
        # astype(float) already yields fresh arrays, so in-place tweaks are safe.
        n_value, b_value = n_value.astype(float), b_value.astype(float)
        zero_mask = (b_value == 0)
        b_value[zero_mask] += np.finfo(float).eps
        n_value[zero_mask] += np.finfo(float).eps
    relative_err = np.divide((n_value - b_value), b_value)
    max_relative_err = np.max(np.abs(relative_err))
    if np.isnan(max_relative_err):
        message = 'Cannot compare by MaxRelativeError, the data contains nan in dump data.'
        return CompareConst.NAN, message
    return format_value(max_relative_err), ""


def check_op(npu_dict, bench_dict, fuzzy_match):
    """Decide whether two op entries match (exactly, or fuzzily via rename)."""
    a_op_name = npu_dict["op_name"]
    b_op_name = bench_dict["op_name"]
    struct_match = check_struct_match(npu_dict, bench_dict)
    if not fuzzy_match:
        return a_op_name == b_op_name and struct_match
    try:
        is_match = fuzzy_check_op(a_op_name, b_op_name)
    except Exception:
        # Fixed: the `return` used to live in a `finally` block, which also
        # swallowed any exception escaping this handler.
        print_warn_log("%s and %s can not fuzzy match." % (a_op_name, b_op_name))
        is_match = False
    return is_match and struct_match
def check_struct_match(npu_dict, bench_dict):
    """True when input/output structs match exactly, or match after allowing
    float16 <-> float32 dtype substitution."""
    npu_struct_in = npu_dict.get("input_struct")
    bench_struct_in = bench_dict.get("input_struct")
    npu_struct_out = npu_dict.get("output_struct")
    bench_struct_out = bench_dict.get("output_struct")
    if npu_struct_in == bench_struct_in and npu_struct_out == bench_struct_out:
        return True
    if len(npu_struct_in) == 0 or len(bench_struct_in) == 0 or len(npu_struct_in) != len(bench_struct_in):
        return False
    return (check_type_shape_match(npu_struct_in, bench_struct_in)
            and check_type_shape_match(npu_struct_out, bench_struct_out))


def check_type_shape_match(npu_struct, bench_struct):
    """Pairwise compare (dtype, shape) tuples; float16/float32 interchangeable.

    Note: empty structs yield False, matching the original initialization.
    """
    shape_type_match = False
    for (npu_type, npu_shape), (bench_type, bench_shape) in zip(npu_struct, bench_struct):
        type_match = npu_type == bench_type
        if not type_match:
            # fp16 vs fp32 in either order is treated as a match.
            type_match = {npu_type, bench_type} == {"torch.float16", "torch.float32"}
        shape_type_match = (npu_shape == bench_shape) and type_match
        if not shape_type_match:
            return False
    return shape_type_match


def fuzzy_check_op(npu_name_list, bench_name_list):
    """All op names must fuzzily match pairwise; empty/unequal lists fail."""
    if len(npu_name_list) == 0 or len(bench_name_list) == 0 or len(npu_name_list) != len(bench_name_list):
        return False
    return all(fuzzy_check_name(npu_name, bench_name)
               for npu_name, bench_name in zip(npu_name_list, bench_name_list))


def fuzzy_check_name(npu_name, bench_name):
    """Two names match when equal after dropping the call counter around
    'forward'/'backward'; other names must match exactly."""
    for process in ("forward", "backward"):
        if process in npu_name and process in bench_name:
            return rename_api(npu_name, process) == rename_api(bench_name, process)
    return npu_name == bench_name
def rename_api(npu_name, process):
    """Strip the call-counter segment around `process` from an op name,
    e.g. 'Functional_relu_0_forward_input.0' -> 'Functional_relu_input.0'."""
    npu_split = npu_name.split(process)
    torch_func_index, in_out = npu_split[0], npu_split[1]
    return str(torch_func_index.rsplit("_", 2)[0]) + str(in_out)


def merge_tensor(tensor_list):
    """Fold one api's dump lines into a dict of names, structs, summaries
    and stack info; a 'stack_info' line terminates the api."""
    op_dict = {"op_name": [], "input_struct": [], "output_struct": [],
               "summery": [], "stack_info": []}

    for tensor in tensor_list:
        if "stack_info" in tensor[0]:
            op_dict["stack_info"].append(tensor[1])
            break
        op_dict["op_name"].append(tensor[0])
        if "input" in tensor[0]:
            op_dict["input_struct"].append((tensor[3], tensor[4]))
        elif "output" in tensor[0]:
            op_dict["output_struct"].append((tensor[3], tensor[4]))

        if tensor[1] <= Const.DUMP_RATIO_MAX:
            op_dict["summery"].append(tensor[5])

    return op_dict


def read_op(ops_queue, pkl_file_handle, stack_mode):
    """Read the next api's lines from the pkl handle into ops_queue.

    Returns False when the handle is exhausted before an api is completed.
    """
    tensor_list = []
    read_err = False
    read_output_flag = {"last_line": False, "curr_line": False}
    # An api ends after its 'output' lines (or 'stack_info' in stack mode).
    end_flag = "stack_info" if stack_mode is True else "output"

    while True:
        curr_pos = pkl_file_handle.tell()
        tensor_line = pkl_file_handle.readline()
        if len(tensor_line) == 0 and not read_output_flag.get("curr_line"):
            read_err = True
            break
        if tensor_line == '\n':
            continue
        if len(tensor_line) != 0:
            tensor_data = json.loads(tensor_line)
            read_output_flag["last_line"] = read_output_flag.get("curr_line")
            read_output_flag["curr_line"] = tensor_data[0].find(end_flag) != -1

        if (read_output_flag.get("last_line") and not read_output_flag.get("curr_line")) \
                or (len(tensor_line) == 0 and read_output_flag.get("curr_line")):  # end of file scenario
            ops_queue.append(merge_tensor(tensor_list))
            # the pos of the handle needs to restore to the start of the next api.
            pkl_file_handle.seek(curr_pos, 0)
            break
        tensor_list.append(tensor_data)

    return not read_err
def match_op(npu_queue, bench_queue, fuzzy_match):
    """Find the first matching (npu, bench) pair, preferring matches involving
    the newest element of either queue; (-1, -1) when nothing matches."""
    for b_index, b_op in enumerate(bench_queue[0: -1]):
        if check_op(npu_queue[-1], b_op, fuzzy_match):
            return len(npu_queue) - 1, b_index
    if check_op(npu_queue[-1], bench_queue[-1], fuzzy_match):
        return len(npu_queue) - 1, len(bench_queue) - 1
    for n_index, n_op in enumerate(npu_queue[0: -1]):
        if check_op(n_op, bench_queue[-1], fuzzy_match):
            return n_index, len(bench_queue) - 1
    return -1, -1


def get_accuracy(result, n_dict, b_dict):
    """Append per-tensor comparison rows for a matched npu/bench op pair."""
    index_out = 0
    npu_stack_info = n_dict.get("stack_info", None)
    bench_stack_info = b_dict.get("stack_info", None)

    for index, n_name in enumerate(n_dict["op_name"]):
        b_name = b_dict["op_name"][index]
        if n_name.find("input") != -1:
            n_struct = n_dict["input_struct"][index]
            b_struct = b_dict["input_struct"][index]
        else:
            n_struct = n_dict["output_struct"][index_out]
            b_struct = b_dict["output_struct"][index_out]
            index_out += 1

        # Cosine / abs-err / rel-err cells are filled later by worker processes.
        result_item = [n_name, b_name, n_struct[0], b_struct[0], n_struct[1], b_struct[1], " ", " ", " "]
        result_item.extend(n_dict.get("summery")[index])
        result_item.extend(b_dict.get("summery")[index])
        result_item.append(CompareConst.ACCURACY_CHECK_YES)
        result_item.append("")
        # Stack info is only attached to the first row of the op.
        if npu_stack_info and bench_stack_info and index == 0:
            result_item.extend(npu_stack_info)

        result.append(result_item)


def _do_multi_process(input_parma, result_path):
    """Run compare_ops across worker processes; log-and-return on file errors."""
    try:
        _handle_multi_process(compare_ops, input_parma, result_path, multiprocessing.Manager().RLock())
    except FileNotFoundError:
        print("File not Found. compare failed!")
    except IOError:
        print("IOEError. compare failed!")
def read_dump_path(result_path):
    """Read the result csv and map npu dump name -> [npu name, bench name]."""
    try:
        csv_pd = pd.read_csv(result_path)
        npu_dump_name_list = csv_pd.iloc[0:, 0].tolist()
        bench_dump_name_list = csv_pd.iloc[0:, 1].tolist()
        op_name_mapping_dict = {}
        for npu_dump_name, bench_dump_name in zip(npu_dump_name_list, bench_dump_name_list):
            op_name_mapping_dict[npu_dump_name] = [npu_dump_name, bench_dump_name]
        return op_name_mapping_dict
    except FileNotFoundError as error:
        print(error)
        raise FileNotFoundError(error)
    except IOError as error:
        print(error)
        raise IOError(error)


def _handle_multi_process(func, input_parma, result_path, lock):
    """Shard op names round-robin over ~half the cores and run func per shard."""
    process_num = int((multiprocessing.cpu_count() + 1) / 2)
    op_name_mapping_dict = read_dump_path(result_path)
    op_names = [[] for _ in range(process_num)]
    all_op_names = list(op_name_mapping_dict.keys())
    for i, op_name in enumerate(all_op_names):
        op_names[i % process_num].append(op_name)
    all_tasks = []
    pool = multiprocessing.Pool(process_num)

    def err_call(args):
        # On worker failure: stop the pool and drop the partial result file.
        try:
            pool.terminate()
            if os.path.exists(result_path):
                os.remove(result_path)
            sys.exit(args)
        except SystemExit:
            # Fixed typo in the message: 'season' -> 'reason'.
            print('multiprocess compare failed! reason:{}'.format(args))

    for process_idx, fusion_op_names in enumerate(op_names):
        idx = [process_num, process_idx]
        task = pool.apply_async(func,
                                args=(idx, fusion_op_names, op_name_mapping_dict, result_path, lock, input_parma),
                                error_callback=err_call)
        all_tasks.append(task)
    pool.close()
    pool.join()
def compare_ops(idx, fusion_op_names, dump_path_dict, result_path, lock, input_parma):
    """Worker: compare each op's npu/bench npy pair, then persist the rows."""
    cos_result = []
    max_err_result = []
    max_relative_err_result = []
    err_mess = []
    is_print_compare_log = input_parma.get("is_print_compare_log")
    for op_name in fusion_op_names:
        if is_print_compare_log:
            # Fixed typo in the log: 'comapre' -> 'compare'.
            print("start compare: {}".format(op_name))
        cos_sim, max_abs_err, max_relative_err, err_msg = compare_by_op(op_name, dump_path_dict, input_parma)
        if is_print_compare_log:
            print("[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}".format(
                op_name, cos_sim, max_abs_err, max_relative_err, err_msg))
        cos_result.append(cos_sim)
        max_err_result.append(max_abs_err)
        max_relative_err_result.append(max_relative_err)
        err_mess.append(err_msg)
    _save_cmp_result(idx, cos_result, max_err_result, max_relative_err_result, err_mess, result_path, lock)


def _save_cmp_result(idx, cos_result, max_err_result, max_relative_err_result, err_msg, result_path, lock):
    """Write one worker's rows into the shared result csv under the lock.

    idx is [process_num, process_idx]; because ops were sharded round-robin,
    this worker owns every (i * process_num + process_idx)-th row.
    """
    lock.acquire()
    try:
        csv_pd = pd.read_csv(result_path, dtype=str)
        process_num, process_idx = idx
        for i, _ in enumerate(cos_result):
            process_index = i * process_num + process_idx
            csv_pd.loc[process_index, CompareConst.COSINE] = cos_result[i]
            csv_pd.loc[process_index, CompareConst.MAX_ABS_ERR] = max_err_result[i]
            csv_pd.loc[process_index, CompareConst.MAX_RELATIVE_ERR] = max_relative_err_result[i]
            csv_pd.loc[process_index, CompareConst.ERROR_MESSAGE] = err_msg[i]
            csv_pd.loc[process_index, CompareConst.ACCURACY] = check_accuracy(cos_result[i], max_err_result[i])
        # Write once after all rows are updated; the lock serializes writers.
        csv_pd.to_csv(result_path, index=False)
    except FileNotFoundError as error:
        print(error)
        raise FileNotFoundError(error)
    except IOError as error:
        print(error)
        raise IOError(error)
    finally:
        lock.release()
def check_accuracy(cos, max_abs_err):
    """Grade a row (Yes / No / NaN / Unmatch) from cosine and max abs error."""
    if cos == CompareConst.SHAPE_UNMATCH:
        return CompareConst.ACCURACY_CHECK_UNMATCH
    if cos == CompareConst.NAN or max_abs_err == CompareConst.NAN:
        return CompareConst.NAN
    if cos == "N/A" or max_abs_err == "N/A":
        return CompareConst.ACCURACY_CHECK_NO
    try:
        cos, max_abs_err = float(cos), float(max_abs_err)
    except ValueError:
        print_warn_log("Cosine or MaxAbsErr can not get float value.")
        return CompareConst.NAN
    if cos < CompareConst.COS_THRESHOLD and max_abs_err > CompareConst.MAX_ABS_ERR_THRESHOLD:
        return CompareConst.ACCURACY_CHECK_NO
    if cos < CompareConst.COS_MAX_THRESHOLD or max_abs_err > CompareConst.MAX_ABS_ERR_MAX_THRESHOLD:
        return CompareConst.ACCURACY_CHECK_NO
    return CompareConst.ACCURACY_CHECK_YES


def compare_by_op(op_name, op_name_mapping_dict, input_parma):
    """Load the npu/bench npy pair for op_name and compute
    (cosine, max_abs_err, max_relative_err, err_msg)."""
    npu_bench_name_list = op_name_mapping_dict[op_name]
    if npu_bench_name_list[1] == CompareConst.NAN:
        return CompareConst.NAN, CompareConst.NAN, CompareConst.NAN, CompareConst.NO_BENCH
    try:
        n_path = os.path.join(input_parma.get("npu_dump_data_dir"), npu_bench_name_list[0] + ".npy")
        b_path = os.path.join(input_parma.get("bench_dump_data_dir"), npu_bench_name_list[1] + ".npy")
        check_file_valid(n_path)
        check_file_valid(b_path)
        n_value = np.load(n_path)
        b_value = np.load(b_path)
    except IOError as error:
        return CompareConst.NAN, CompareConst.NAN, CompareConst.NAN, "Dump file:{} not found.".format(error.filename)
    if len(n_value.shape) == 0:
        # 0-d tensors: only abs/relative error are meaningful.
        if n_value.dtype == bool:
            n_value = n_value.astype(float)
            b_value = b_value.astype(float)
        max_abs_err, _ = get_max_abs_err(n_value, b_value)
        max_relative_err, _ = get_max_relative_err(n_value, b_value)
        return "unsupported", max_abs_err, max_relative_err, "This is type of scalar data, can not compare."
    if n_value.size == 0:
        return "unsupported", 0, 0, "This is empty data, can not compare."
    if n_value.shape != b_value.shape:
        return CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH, \
               "Shape of NPU and bench Tensor do not match. Skipped."
    if n_value.dtype != b_value.dtype:
        print_warn_log("Dtype of NPU and bench Tensor do not match:{}".format(op_name))
        err_msg = " Dtype of NPU and bench Tensor do not match."
    else:
        err_msg = ""

    n_value, b_value = handle_inf_nan(n_value, b_value)
    if n_value is CompareConst.NAN or b_value is CompareConst.NAN:
        return "N/A", "N/A", "N/A", "The position of inf or nan in NPU and bench Tensor do not match."

    n_value = n_value.reshape(-1).astype(float)
    b_value = b_value.reshape(-1).astype(float)
    # Fixed: err_msg was unconditionally reset to "" here, which discarded the
    # dtype-mismatch note set above.
    cos_sim, message = cosine_similarity(n_value, b_value)

    max_abs_err, _ = get_max_abs_err(n_value, b_value)
    max_relative_err, message = get_max_relative_err(n_value, b_value)

    if not err_msg:
        err_msg += message
    else:
        err_msg = err_msg + ' ' + message

    if npu_bench_name_list[0] != npu_bench_name_list[1]:
        err_msg += " Fuzzy matching data, the comparison accuracy may be affected."
    return cos_sim, max_abs_err, max_relative_err, err_msg
def handle_inf_nan(n_value, b_value):
    """Zero out inf/nan entries when they sit at identical positions in both
    arrays (in place); return (NAN, NAN) sentinels when the positions differ."""
    n_inf, b_inf = np.isinf(n_value), np.isinf(b_value)
    n_nan, b_nan = np.isnan(n_value), np.isnan(b_value)
    if not (np.any(n_inf) or np.any(b_inf) or np.any(n_nan) or np.any(b_nan)):
        return n_value, b_value
    if not (np.array_equal(n_inf, b_inf) and np.array_equal(n_nan, b_nan)):
        return CompareConst.NAN, CompareConst.NAN
    n_value[n_inf] = 0
    b_value[b_inf] = 0
    n_value[n_nan] = 0
    b_value[b_nan] = 0
    return n_value, b_value


def compare(input_parma, output_path, **kwargs):
    """Entry point: validate arguments, then run the full comparison."""
    if kwargs.get('suffix'):
        print_error_log("Argument 'suffix' is not supported for compare.")
        raise CompareException(CompareException.INVALID_PARAM_ERROR)
    try:
        npu_pkl, bench_pkl = check_compare_param(input_parma, output_path, **kwargs)
    except CompareException as error:
        print_error_log('Compare failed. Please check the arguments and do it again!')
        sys.exit(error.code)
    compare_core(input_parma, output_path, npu_pkl, bench_pkl, **kwargs)


def compare_core(input_parma, output_path, npu_pkl, bench_pkl, stack_mode=False, auto_analyze=True,
                 suffix='', fuzzy_match=False):
    """Match ops from both pkl files, write the result csv, fill the numeric
    columns in worker processes and optionally run the advisor."""
    result = compare_process(npu_pkl, bench_pkl, stack_mode, fuzzy_match)
    npu_pkl.close()
    bench_pkl.close()

    columns = [CompareConst.NPU_NAME, CompareConst.BENCH_NAME, CompareConst.NPU_DTYPE, CompareConst.BENCH_DTYPE,
               CompareConst.NPU_SHAPE, CompareConst.BENCH_SHAPE, CompareConst.COSINE, CompareConst.MAX_ABS_ERR,
               CompareConst.MAX_RELATIVE_ERR,
               CompareConst.NPU_MAX, CompareConst.NPU_MIN, CompareConst.NPU_MEAN,
               CompareConst.BENCH_MAX, CompareConst.BENCH_MIN, CompareConst.BENCH_MEAN,
               CompareConst.ACCURACY, CompareConst.ERROR_MESSAGE]
    if stack_mode:
        columns.append(CompareConst.STACK)
    result_df = pd.DataFrame(result, columns=columns)

    file_name = add_time_as_suffix("compare_result" + suffix)
    file_path = os.path.join(os.path.realpath(output_path), file_name)
    check_file_not_exists(file_path)
    # Create the csv with owner rw / group r permissions.
    with os.fdopen(os.open(file_path, os.O_RDWR | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP),
                   'w+') as fout:
        result_df.to_csv(fout, index=False)

    _do_multi_process(input_parma, file_path)
    if auto_analyze:
        Advisor(file_path, output_path).analysis()
def parse(pkl_file, module_name_prefix):
    """Print stack traces and statistic lines for pkl entries whose name
    starts with module_name_prefix."""
    pkl_handle = open(pkl_file, "r")
    title_printed = False
    while True:
        pkl_line = pkl_handle.readline()
        if pkl_line == '\n':
            continue
        if len(pkl_line) == 0:
            break

        msg = json.loads(pkl_line)
        info_prefix = msg[0]
        if not info_prefix.startswith(module_name_prefix):
            continue

        if info_prefix.find("stack_info") != -1:
            print("\nTrace back({}):".format(msg[0]))
            for item in reversed(msg[1]):
                print(" File \"{}\", line {}, in {}".format(item[0], item[1], item[2]))
                print(" {}".format(item[3]))
            continue
        if len(msg) > 5:
            summery_info = " [{}][dtype: {}][shape: {}][max: {}][min: {}][mean: {}]" \
                .format(msg[0], msg[3], msg[4], msg[5][0], msg[5][1], msg[5][2])
            if not title_printed:
                print("\nStatistic Info:")
                title_printed = True
            print(summery_info)
    pkl_handle.close()


def compare_process(npu_pkl_handle, bench_pkl_handle, stack_mode, fuzzy_match):
    """Stream both pkl files, match ops between them and build result rows."""
    if fuzzy_match:
        print_warn_log("This task uses fuzzy matching, which may affect the accuracy of the comparison.")
    npu_ops_queue, bench_ops_queue, result = [], [], []
    while True:
        npu_file_flag = read_op(npu_ops_queue, npu_pkl_handle, stack_mode)
        bench_file_flag = read_op(bench_ops_queue, bench_pkl_handle, stack_mode)
        # Stop when both files are exhausted or either queue never filled.
        if (not npu_file_flag and not bench_file_flag) \
                or (len(npu_ops_queue) == 0 or len(bench_ops_queue) == 0):
            break
        n_match_point, b_match_point = match_op(npu_ops_queue, bench_ops_queue, fuzzy_match)
        if n_match_point == -1 and b_match_point == -1:
            continue
        n_match_data = npu_ops_queue[n_match_point]
        b_match_data = bench_ops_queue[b_match_point]
        # Ops queued before the matched npu op have no bench counterpart.
        for npu_data in npu_ops_queue[0: n_match_point]:
            get_un_match_accuracy(result, npu_data)
        get_accuracy(result, n_match_data, b_match_data)
        del npu_ops_queue[0: n_match_point + 1]
        del bench_ops_queue[0: b_match_point + 1]
    # Leftover npu ops at EOF are reported as unmatched.
    for npu_data in npu_ops_queue:
        get_un_match_accuracy(result, npu_data)
    return result
def get_un_match_accuracy(result, n_dict):
    """Append rows for an npu op that has no bench counterpart; every bench
    column is filled with a NAN placeholder."""
    index_out = 0
    npu_stack_info = n_dict.get("stack_info", None)
    bench_name = bench_type = bench_shape = CompareConst.NAN
    for index, n_name in enumerate(n_dict["op_name"]):
        if "input" in n_name:
            n_struct = n_dict["input_struct"][index]
        else:
            n_struct = n_dict["output_struct"][index_out]
            index_out += 1

        result_item = [n_name, bench_name, n_struct[0], bench_type, n_struct[1], bench_shape, " ", " ", " "]
        result_item.extend(n_dict.get("summery")[index])
        result_item.extend([CompareConst.NAN] * 3)
        result_item.append(CompareConst.NAN)       # accuracy verdict
        result_item.append(CompareConst.NO_BENCH)  # error message
        # Stack info is only attached to the first row of the op.
        if npu_stack_info and index == 0:
            result_item.extend(npu_stack_info)
        result.append(result_item)
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import os, sys +import re +from ..common.utils import print_error_log, CompareException, check_compare_param +from .acc_compare import compare_core + + +def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): + def check_and_return_dir_contents(dump_dir, prefix): + contents = os.listdir(dump_dir) + pattern = re.compile(f'^{prefix}[0-9]+$') + for name in contents: + match = pattern.match(name) + if match is None: + msg = (f"dump_dir contains '{name}'. Expected '{prefix}'. This name is not in the format of dump output. " + f"Please check and delete irrelevant files in {dump_dir} and try again.") + print_error_log(msg) + raise CompareException(CompareException.INVALID_PATH_ERROR) + return contents + + def extract_pkl_and_data_dir(dirname): + pkl_path, dump_data_dir, pkl_name, dump_data_dirname = '', '', '', '' + for fname in os.listdir(dirname): + full_path = os.path.join(dirname, fname) + if os.path.isdir(full_path): + dump_data_dir = full_path + dump_data_dirname = fname + elif full_path.endswith('.pkl'): + pkl_path = full_path + pkl_name = fname + # Provide robustness on invalid directory inputs + if not pkl_path: + print_error_log(f'No file is found in dump dir {dirname}. ') + raise CompareException(CompareException.NO_DUMP_FILE_ERROR) + if dump_data_dir == '': + print_error_log(f'No directory is found in dump dir {dirname}. 
') + raise CompareException(CompareException.NO_DUMP_FILE_ERROR) + name_body, ext = os.path.splitext(pkl_name) + pattern = re.compile(f'{name_body}$') + match = pattern.match(dump_data_dirname) + if match is None: + print_error_log('The names of pkl and directory do not match! ' + f'Please check the names and remove irrelevant files in {dirname}. ') + raise CompareException(CompareException.INVALID_FILE_ERROR) + return pkl_path, dump_data_dir + + + if kwargs.get('suffix'): + print_error_log("Argument 'suffix' is not supported for compare_distributed.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + # get the ranks and match by order + npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank')) + bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank')) + if len(npu_ranks) != len(bench_ranks): + print_error_log('The number of ranks in the two runs are different. ' + 'Unable to match the ranks. Please use another folder to compare ' + 'or use compare() api and manually match the ranks.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + for nr, br in zip(npu_ranks, bench_ranks): + n_dir = os.path.join(npu_dump_dir, nr) + b_dir = os.path.join(bench_dump_dir, br) + npu_pkl_path, npu_dump_data_dir = extract_pkl_and_data_dir(n_dir) + bench_pkl_path, bench_dump_data_dir = extract_pkl_and_data_dir(b_dir) + dump_result_param = { + 'npu_pkl_path': npu_pkl_path, + 'bench_pkl_path': bench_pkl_path, + 'npu_dump_data_dir': npu_dump_data_dir, + 'bench_dump_data_dir': bench_dump_data_dir, + 'is_print_compare_log':True + } + try: + npu_pkl, bench_pkl = check_compare_param(dump_result_param, output_path, **kwargs) + except CompareException as error: + print_error_log('Compare failed. 
Please check the arguments and do it again!') + sys.exit(error.code) + compare_core(dump_result_param, output_path, npu_pkl, bench_pkl, suffix=f'_{nr}-{br}', **kwargs) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/__init__.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/debugger_config.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/debugger_config.py new file mode 100644 index 0000000000000000000000000000000000000000..7c92c3c1245cfb9687ed5a37f5491badaf3ee936 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/debugger_config.py @@ -0,0 +1,33 @@ +import os +from ..common.utils import print_warn_log + + +class DebuggerConfig: + def __init__(self, dump_path, hook_name, rank=None, step=[0]): + self.dump_path = dump_path + self.hook_name = hook_name + self.rank = rank + self.step = step + if self.step: + self.step.sort() + self.check() + + def check(self): + dump_root = os.path.split(self.dump_path)[0] + if not os.path.exists(dump_root): + raise ValueError("dump path {} does not exist".format(dump_root)) + if self.hook_name not in ["dump", "overflow_check"]: + raise ValueError("hook_name should be in ['dump', 'overflow_check']".format(self.hook_name)) + if self.rank is not None and not isinstance(self.rank, int): + raise ValueError("rank {} should be int".format(self.rank)) + elif isinstance(self.rank, int): + print_warn_log(f"Rank argument is provided. 
Only rank {self.rank} data will be dumpped.") + if not isinstance(self.step, list): + raise ValueError("step {} should be list".format(self.step)) + if len(self.step) == 0: + raise ValueError("step {} should not be empty".format(self.step)) + for s in self.step: + if not isinstance(s, int): + raise ValueError("step element {} should be int".format(s)) + return True + diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py new file mode 100644 index 0000000000000000000000000000000000000000..e9db590c67860d18cfab3b4ec3cb945e77030e79 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py @@ -0,0 +1,86 @@ +import os +from ..common.utils import Const, make_dump_path_if_not_exists, print_error_log, print_info_log +from ..dump.dump import DumpUtil, acc_cmp_dump, write_to_disk +from ..dump.utils import set_dump_path, set_dump_switch_print_info, generate_dump_path_str, \ + set_dump_switch_config, set_backward_input +from ..overflow_check.utils import OverFlowUtil +from ..overflow_check.overflow_check import overflow_check +from ..hook_module.register_hook import register_hook_core +from ..hook_module.hook_module import HOOKModule +from .debugger_config import DebuggerConfig + + +class PrecisionDebugger: + first_start = True + hook_func = None + + # 提供两种使用方式:逐个传参和构造config后传config,看哪种使用方式更受欢迎,之后只保留一种 + def __init__(self, dump_path=None, hook_name=None, rank=None, step=[0], config=None): + if config is None: + if dump_path is None or hook_name is None: + err_msg = "You must provide dump_path and hook_name argument to PrecisionDebugger\ + when config is not provided." 
+ raise Exception(err_msg) + self.config = DebuggerConfig(dump_path, hook_name, rank, step) + else: + self.config = config + print_info_log("Debugger gets config, it will override preceding arguments.") + + self.configure_hook = self.get_configure_hook(config.hook_name) + self.configure_hook() + DumpUtil.target_iter = config.step + DumpUtil.target_rank = config.rank + make_dump_path_if_not_exists(config.dump_path) + set_dump_path(config.dump_path) + if config.hook_name == "overflow_check": + PrecisionDebugger.hook_func = overflow_check + else: + PrecisionDebugger.hook_func = acc_cmp_dump + + def get_configure_hook(self, hook_name): + if hook_name == "dump": + return self.configure_full_dump + elif hook_name == "overflow_check": + return self.configure_overflow_dump + else: + raise ValueError("hook name {} is not in ['dump', 'overflow_check']".format(hook_name)) + + def configure_full_dump(self, mode='api_stack', scope=[], api_list=[], filter_switch=Const.ON, + input_output_mode=[Const.ALL], acl_config=None, backward_input=[], summary_only=False): + set_dump_switch_config(mode=mode, scope=scope, api_list=api_list, + filter_switch=filter_switch, dump_mode=input_output_mode, summary_only=summary_only) + if mode == 'acl' and acl_config is None: + raise ValueError("acl_config must be configured when mode is 'acl'") + elif mode == 'acl' and acl_config is not None: + DumpUtil.dump_config = acl_config + if mode == 'acl' and 'backward' in scope and not backward_input: + raise ValueError("backward_input must be configured when mode is 'acl' and scope contains 'backward'") + elif mode == 'acl' and 'backward' in scope and backward_input: + set_backward_input(backward_input) + + def configure_overflow_dump(self, mode="api", acl_config=None, overflow_nums=1): + if mode == "acl": + DumpUtil.dump_switch_mode = mode + DumpUtil.dump_config = acl_config + if acl_config is None: + raise ValueError("acl_config must be configured when mode is 'acl'") + if isinstance(overflow_nums, int): 
+ OverFlowUtil.overflow_nums = overflow_nums + else: + raise ValueError("overflow_nums must be int") + + @classmethod + def start(cls): + if cls.first_start: + register_hook_core(cls.hook_func) + cls.first_start = False + DumpUtil.dump_switch = "ON" + dump_path_str = generate_dump_path_str() + set_dump_switch_print_info("ON", DumpUtil.dump_switch_mode, dump_path_str) + + @classmethod + def stop(cls): + DumpUtil.dump_switch = "OFF" + dump_path_str = generate_dump_path_str() + set_dump_switch_print_info("OFF", DumpUtil.dump_switch_mode, dump_path_str) + write_to_disk() diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py new file mode 100644 index 0000000000000000000000000000000000000000..2b1b5a967754ed08d789510ef6dd7ae0a7da8ada --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import inspect +import json +import os +import stat +import numpy as np +import torch +import threading + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +from .utils import DumpUtil, _set_dump_switch4api_list, make_dump_data_dir, get_tensor_rank, create_dirs_if_not_exist +from ..common.utils import print_warn_log, Const, print_info_log, modify_dump_path +from ..dump.utils import check_writable + +forward_init_status = False +backward_init_status = False + +backward_threading_id = 0 + +api_list = [] +thread_lock = threading.Lock() +pkl_name = "" +multi_output_apis = ["_sort_", "npu_flash_attention"] + +class DataInfo(object): + def __init__(self, data, save_data, summary_data, dtype, shape): + self.data = data + self.save_data = save_data + self.summary_data = summary_data + self.dtype = dtype + self.shape = shape + + +def get_not_float_tensor_info(data): + if data.numel() == 0 or data.dtype == torch.bool: + tensor_max = [] + tensor_min = [] + tensor_mean = [] + elif len(data.shape) == 0: + tensor_max = data.cpu().detach().float().numpy().tolist() + tensor_min = data.cpu().detach().float().numpy().tolist() + tensor_mean = data.cpu().detach().float().numpy().tolist() + else: + tensor_max = torch._C._VariableFunctionsClass.max(data).cpu().detach().float().numpy().tolist() + tensor_min = torch._C._VariableFunctionsClass.min(data).cpu().detach().float().numpy().tolist() + tensor_mean = torch._C._VariableFunctionsClass.mean(data.float()).cpu().detach().float().numpy().tolist() + return get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean) + + +def get_scalar_data_info(data): + summary_data = [data, data, data] + return DataInfo(data, data, summary_data, str(type(data)), str([])) + + +def get_float_tensor_info(data): + tensor_max = torch._C._VariableFunctionsClass.max(data).cpu().detach().float().numpy().tolist() + tensor_min = torch._C._VariableFunctionsClass.min(data).cpu().detach().float().numpy().tolist() + 
tensor_mean = torch._C._VariableFunctionsClass.mean(data).cpu().detach().float().numpy().tolist() + return get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean) + + +def get_tensor_data_info(data, tensor_max, tensor_min, tensor_mean): + summary_data = [] + saved_tensor = data.contiguous().cpu().detach() + if data.dtype == torch.bfloat16: + saved_numpy = saved_tensor.to(torch.float32).numpy() + else: + saved_numpy = saved_tensor.numpy() + summary_data.extend([tensor_max, tensor_min, tensor_mean]) + return DataInfo(data, saved_numpy, summary_data, str(data.dtype), tuple(data.shape)) + + +def json_dump_condition(prefix): + cur_threading_id = threading.current_thread().ident + global backward_threading_id + if not backward_threading_id and Const.BACKWARD in prefix: + backward_threading_id = cur_threading_id + return (Const.BACKWARD in prefix and backward_threading_id == cur_threading_id) or 'forward' in prefix + + +def dump_tensor(x, prefix, dump_step, dump_file_name): + global data_info + if isinstance(x, (tuple, list)) and x: + for i, item in enumerate(x): + dump_tensor(item, "{}.{}".format(prefix, i), dump_step, dump_file_name) + return + elif isinstance(x, torch.Tensor): + if x.is_meta: + print_info_log(f"Meta tensor {prefix} is skipped.") + return + if x.numel() == 0 or len(x.shape) == 0 or not x.is_floating_point(): + if DumpUtil.dump_filter_switch == Const.OFF: + data_info = get_not_float_tensor_info(x) + dump_data(dump_file_name, dump_step, prefix, data_info) + else: + return + else: + data_info = get_float_tensor_info(x) + dump_data(dump_file_name, dump_step, prefix, data_info) + + elif DumpUtil.dump_filter_switch == Const.OFF: + if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): + data_info = get_scalar_data_info(x) + dump_data(dump_file_name, dump_step, prefix, data_info) + + +def dump_data(dump_file_name, dump_step, prefix, data_info): + global api_list + thread_lock.acquire() + try: + if json_dump_condition(prefix): + 
output_path = os.path.join(DumpUtil.dump_data_dir, f'{prefix}.npy') + if not DumpUtil.summary_only: + np.save(output_path, data_info.save_data) + api_list.append([prefix, dump_step, [], data_info.dtype, data_info.shape, data_info.summary_data]) + except Exception as e: + print_warn_log("Dump data failed, error: {}".format(e)) + finally: + thread_lock.release() + + +def dump_stack_info(name_template, dump_file): + stack_str = [] + for (_, path, line, func, code, _) in inspect.stack()[3:]: + if code: + stack_line = [path, str(line), func, code[0].strip() if code else code] + else: + stack_line = [path, str(line), func, code] + stack_str.append(stack_line) + + prefix = name_template.format("stack_info") + if DumpUtil.dump_switch_mode in Const.DUMP_MODE: + if json_dump_condition(prefix): + if Const.ALL in DumpUtil.dump_mode: + api_list.append([prefix, stack_str]) + else: + for mode in DumpUtil.dump_mode: + if mode in prefix: + api_list.append([prefix, stack_str]) + else: + api_list.append([prefix, stack_str]) + + +def dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file): + if Const.BACKWARD in name_template and Const.FORWARD not in DumpUtil.dump_mode: + if 'input' in DumpUtil.dump_mode: + dump_tensor(out_feat, name_template.format("input"), dump_step, dump_file) + if 'output' in DumpUtil.dump_mode: + dump_tensor(in_feat, name_template.format("output"), dump_step, dump_file) + if Const.ALL in DumpUtil.dump_mode: + dump_tensor(out_feat, name_template.format("input"), dump_step, dump_file) + dump_tensor(in_feat, name_template.format("output"), dump_step, dump_file) + elif Const.BACKWARD not in name_template and Const.BACKWARD not in DumpUtil.dump_mode: + if 'input' in DumpUtil.dump_mode: + dump_tensor(in_feat, name_template.format("input"), dump_step, dump_file) + if 'output' in DumpUtil.dump_mode: + dump_tensor(out_feat, name_template.format("output"), dump_step, dump_file) + if Const.ALL in DumpUtil.dump_mode: + dump_tensor(in_feat, 
name_template.format("input"), dump_step, dump_file) + dump_tensor(out_feat, name_template.format("output"), dump_step, dump_file) + + +def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): + dump_file = DumpUtil.get_dump_path() + dump_file = modify_dump_path(dump_file, DumpUtil.dump_switch_mode) + _set_dump_switch4api_list(name) + if DumpUtil.get_dump_switch(): + rank = get_tensor_rank(in_feat, out_feat) + if DumpUtil.target_rank is not None: + if rank != DumpUtil.target_rank: + return + dump_file = create_dirs_if_not_exist(rank, dump_file) + global pkl_name + pkl_name = dump_file + if DumpUtil.dump_init_enable: + DumpUtil.dump_init_enable = False + DumpUtil.dump_data_dir = make_dump_data_dir(dump_file) \ + if DumpUtil.dump_switch_mode not in [Const.STACK, Const.ACL] and not DumpUtil.summary_only else "" + if os.path.exists(dump_file) and not os.path.isdir(dump_file): + check_writable(dump_file) + os.remove(dump_file) + + name_prefix = name + name_template = f"{name_prefix}" + "_{}" + if DumpUtil.dump_switch_mode in [Const.ALL, Const.API_LIST]: + dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file) + elif DumpUtil.dump_switch_mode == Const.API_STACK: + dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file) + dump_stack_info(name_template, dump_file) + elif DumpUtil.check_switch_scope(name_prefix): + if DumpUtil.dump_switch_mode == Const.ACL: + acl_dump(module, name, name_prefix) + elif DumpUtil.dump_switch_mode != Const.STACK: + dump_api_tensor(dump_step, in_feat, name_template, out_feat, dump_file) + dump_stack_info(name_template, dump_file) + + +def acl_dump(module, module_name, name_prefix): + if name_prefix in DumpUtil.backward_input: + dump_mode_backward_acl_dump(module, module_name, DumpUtil.backward_input.get(name_prefix)) + else: + forward_acl_dump(module, module_name) + + +def Op_Need_Trigger(module_name): + if 'Tensor___getitem___' in module_name: + return True + return False + + +def forward_acl_dump(module, 
module_name): + global forward_init_status + global backward_init_status + if not forward_init_status and not backward_init_status: + forward_init_status = True + torch_npu.npu.init_dump() + torch_npu.npu.set_dump(DumpUtil.dump_config) + torch_npu.npu.synchronize() + if Op_Need_Trigger(module_name): + module.forward(*module.input_args, **module.input_kwargs).cpu() + else: + module.forward(*module.input_args, **module.input_kwargs) + torch_npu.npu.synchronize() + torch_npu.npu.finalize_dump() + del module.input_args + del module.input_kwargs + forward_init_status = False + print_info_log("Dump %s op file." % module_name) + + +def acl_backward_dump_status(output, grad, module_name): + if isinstance(output, torch.Tensor): + output.backward(grad, retain_graph=True) + return True + + for api_name in multi_output_apis: + if api_name in module_name: + output[0].backward(grad, retain_graph=True) + return True + return False + + +def dump_mode_backward_acl_dump(module, module_name, grad_path): + global forward_init_status + global backward_init_status + module_name = module_name.replace(Const.FORWARD, Const.BACKWARD) + if not forward_init_status and not backward_init_status: + forward_init_status = True + module.input_args = list(module.input_args) + for i, data in enumerate(module.input_args): + if isinstance(data, torch.Tensor) and data.grad_fn: + module.input_args[i] = data.detach().requires_grad_() + output = module.forward(*module.input_args, **module.input_kwargs) + grad = torch.tensor(np.load(grad_path)).to("npu").requires_grad_() + torch_npu.npu.init_dump() + torch_npu.npu.set_dump(DumpUtil.dump_config) + torch_npu.npu.synchronize() + if not acl_backward_dump_status(output, grad, module_name): + print_warn_log("The output of {} is not of tensor type and cannot be automatically derived. 
" + "you can manually construct a single API backward case for ACL dump.".format(module_name)) + torch_npu.npu.synchronize() + torch_npu.npu.finalize_dump() + del module.input_args + del module.input_kwargs + forward_init_status = False + print_info_log("Dump %s op file." % module_name) + + +def acc_cmp_dump(name, **kwargs): + dump_step = kwargs.get('dump_step', 1) + pid = kwargs.get('pid') + if not pid: + return RuntimeError("Not get the specified process pid.") + + def acc_cmp_hook(module, in_feat, out_feat): + if pid == os.getpid(): + dump_acc_cmp(name, in_feat, out_feat, dump_step, module) + if hasattr(module, "input_args"): + del module.input_args + if hasattr(module, "input_kwargs"): + del module.input_kwargs + + return acc_cmp_hook + + +def write_to_disk(): + if api_list: + with open(pkl_name, 'a') as f: + try: + f.write('\n'.join(json.dumps(item) for item in api_list)) + f.write('\n') + except: + raise Exception("write to disk failed") + + +def get_pkl_file_path(): + return pkl_name diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bc2313448eb29a4b4eae2577acd0696d911d0044 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -0,0 +1,284 @@ +import os +import shutil +import sys +import re +from pathlib import Path +import torch + +from ..dump import dump +from ..common.utils import print_error_log, CompareException, DumpException, Const, get_time, print_info_log, \ + check_mode_valid, get_api_name_from_matcher, check_switch_valid, check_dump_mode_valid, check_summary_only_valid, generate_compare_script, \ + check_is_npu, check_file_valid + +from ..common.version import __version__ + +dump_count = 0 +range_begin_flag, range_end_flag = False, False + + +class DumpUtil(object): + dump_data_dir = None + dump_path = None + dump_switch = None + 
dump_switch_mode = Const.ALL # all, api_stack, list, stack... + dump_switch_scope = [] + dump_init_enable = False + dump_api_list = [] + dump_filter_switch = None + dump_mode = ['all'] + backward_input = {} + dump_dir_tag = 'ptdbg_dump' + dump_config = None + dataloader_iter = 0 + target_iter = None + target_rank = None + summary_only = False + + @staticmethod + def incr_iter_num_maybe_exit(): + if DumpUtil.target_iter is None: + return + if DumpUtil.dataloader_iter == DumpUtil.target_iter: + set_dump_switch("ON") + elif DumpUtil.dataloader_iter > DumpUtil.target_iter: + raise Exception("Ptdbg: exit after iteration {}".format(DumpUtil.target_iter)) + else: + set_dump_switch("OFF") + DumpUtil.dataloader_iter += 1 + + @staticmethod + def set_dump_path(save_path): + DumpUtil.dump_path = save_path + DumpUtil.dump_init_enable = True + + @staticmethod + def set_dump_config(dump_config): + DumpUtil.dump_config = dump_config + + @staticmethod + def set_dump_switch(switch, mode=None, scope=None, api_list=None, filter_switch=None, dump_mode=None, summary_only=False): + DumpUtil.dump_switch = switch + if mode is not None: + DumpUtil.dump_switch_mode = mode + DumpUtil.dump_init_enable = True + if scope is not None: + DumpUtil.dump_switch_scope = scope + if api_list is not None: + DumpUtil.dump_api_list = [api.lower() for api in api_list] + if filter_switch is not None: + DumpUtil.dump_filter_switch = filter_switch + if dump_mode is not None: + DumpUtil.dump_mode = dump_mode if isinstance(dump_mode, list) else [dump_mode] + + if mode == Const.ACL: + DumpUtil.dump_switch_scope = [api_name.replace("backward", "forward") for api_name in scope] + DumpUtil.summary_only = summary_only + + def check_list_or_acl_mode(name_prefix): + global dump_count + for item in DumpUtil.dump_switch_scope: + if name_prefix.startswith(item): + dump_count = dump_count + 1 + return True + + def check_range_mode(name_prefix): + global range_begin_flag + global range_end_flag + if 
name_prefix.startswith(DumpUtil.dump_switch_scope[0]): + range_begin_flag = True + return True + if name_prefix.startswith(DumpUtil.dump_switch_scope[1]): + range_end_flag = True + return True + if range_begin_flag and not range_end_flag: + return True + return False + + def check_stack_mode(name_prefix): + if len(DumpUtil.dump_switch_scope) == 0: + return True + elif len(DumpUtil.dump_switch_scope) == 1: + return name_prefix.startswith(DumpUtil.dump_switch_scope[0]) + elif len(DumpUtil.dump_switch_scope) == 2: + return DumpUtil.check_range_mode(name_prefix) + else: + print_error_log("dump scope is invalid, Please set the scope mode in" + " set_dump_switch with 'all', 'list', 'range', 'stack', 'acl', 'api_list'!") + return False + + check_mapper = { + Const.LIST: check_list_or_acl_mode, + Const.ACL: check_list_or_acl_mode, + Const.RANGE: check_range_mode, + Const.STACK: check_stack_mode + } + + @staticmethod + def check_switch_scope(name_prefix): + if DumpUtil.dump_switch_mode in DumpUtil.check_mapper: + check_func = DumpUtil.check_mapper[DumpUtil.dump_switch_mode] + return check_func(name_prefix) + return False + + @staticmethod + def get_dump_path(): + if DumpUtil.dump_path: + return DumpUtil.dump_path + + if DumpUtil.dump_switch_mode == Const.ALL: + raise RuntimeError("get_dump_path: the file path is empty," + " you must use set_dump_path to set a valid dump path!!!") + else: + dir_path = os.path.realpath("./") + dump_file_name = "scope_dump_{}_{}_{}.pkl".format( + DumpUtil.dump_switch_mode, DumpUtil.dump_switch_scope[0], get_time()) + DumpUtil.dump_path = os.path.join(dir_path, dump_file_name) + return DumpUtil.dump_path + + @staticmethod + def get_dump_switch(): + return DumpUtil.dump_switch == "ON" + + +def set_dump_path(fpath=None, dump_tag='ptdbg_dump'): + if fpath is None: + raise RuntimeError("set_dump_path '{}' error, please set a valid filename".format(fpath)) + return + check_file_valid(fpath) + real_path = os.path.realpath(fpath) + if not 
os.path.isdir(real_path): + print_error_log( + "set_dump_path '{}' error, the path is not a directory please set a valid directory.".format(real_path)) + raise DumpException(DumpException.INVALID_PATH_ERROR) + DumpUtil.set_dump_path(real_path) + DumpUtil.dump_dir_tag = dump_tag + + +def get_tensor_rank(in_feat, out_feat): + def get_tensor_rank_single(x): + if isinstance(x, (list, tuple)): + if len(x) > 0: + return get_tensor_rank_single(x[0]) + return None + elif isinstance(x, torch.Tensor): + device = x.device + if device.type == 'cpu': + return None + else: + return device.index + return None + in_rank = get_tensor_rank_single(in_feat) + if in_rank is None: + out_rank = get_tensor_rank_single(out_feat) + if out_rank is None: + return 0 + return out_rank + return in_rank + + +def create_dirs_if_not_exist(rank, dump_file): + dump_path, file_name = os.path.split(dump_file) + rank_dir = os.path.join(dump_path, f"rank{rank}") + dump_file = os.path.join(rank_dir, file_name) + if not os.path.isdir(rank_dir): + Path(rank_dir).mkdir(mode=0o750, exist_ok=True) + return dump_file + + +def generate_dump_path_str(): + if DumpUtil.dump_switch_mode == 'acl': + if DumpUtil.dump_config == '': + print_error_log("Please provide dump config for register hook before turning on dump switch!") + raise DumpException(DumpException.NONE_ERROR) + dump_path = f"according to dump config {DumpUtil.dump_config}" + else: + dump_dir, dump_file = os.path.split(DumpUtil.dump_path) + if not dump_file.endswith(".pkl"): + dump_dir = DumpUtil.dump_path + dump_path = f"to {dump_dir}" + return dump_path + + +def set_dump_switch(switch, mode=Const.ALL, scope=[], api_list=[], filter_switch=Const.ON, dump_mode=[Const.ALL], summary_only=False): + try: + check_switch_valid(switch) + except (CompareException, AssertionError) as err: + print_error_log(str(err)) + sys.exit() + DumpUtil.set_dump_switch(switch, summary_only=summary_only) + dump_path_str = generate_dump_path_str() + if switch == "OFF": + 
dump.write_to_disk() + if check_is_npu() and DumpUtil.dump_switch_mode in [Const.ALL, Const.API_STACK, Const.LIST, Const.RANGE]: + generate_compare_script(DumpUtil.dump_data_dir, dump.get_pkl_file_path(), DumpUtil.dump_switch_mode) + set_dump_switch_print_info(switch, mode, dump_path_str) + set_dump_switch_config(mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, dump_mode=dump_mode,summary_only=summary_only) + + +def set_dump_switch_config(mode=Const.ALL, scope=[], api_list=[], filter_switch=Const.ON, dump_mode=[Const.ALL], summary_only=False): + try: + check_mode_valid(mode, scope, api_list) + check_switch_valid(filter_switch) + dump_mode = check_dump_mode_valid(dump_mode) + summary_only = check_summary_only_valid(summary_only) + except (CompareException, AssertionError) as err: + print_error_log(str(err)) + sys.exit() + switch = DumpUtil.dump_switch + DumpUtil.set_dump_switch("OFF", mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, + dump_mode=dump_mode, summary_only=summary_only) + DumpUtil.dump_switch = switch + + +def set_dump_switch_print_info(switch, mode, dump_path_str): + global dump_count + if switch == "ON": + print_info_log(f"Dump switch is turned on. Dump data will be saved {dump_path_str}. ") + if mode == Const.LIST: + dump_count = 0 + else: + print_info_log(f"Dump switch is turned off. 
") + if mode == Const.LIST: + print_info_log("The number of matched dump is {}".format(dump_count)) + + +def _set_dump_switch4api_list(name): + if DumpUtil.dump_api_list: + api_name = get_api_name_from_matcher(name) + DumpUtil.dump_switch = "ON" if api_name in DumpUtil.dump_api_list else "OFF" + + +def set_backward_input(backward_input): + for index, api_name in enumerate(DumpUtil.dump_switch_scope): + DumpUtil.backward_input[api_name] = backward_input[index] + + +def make_dump_data_dir(dump_file_name): + dump_path, file_name = os.path.split(os.path.realpath(dump_file_name)) + name_body, name_extension = os.path.splitext(file_name) + output_dir = os.path.join(dump_path, f"{name_body}") + if not os.path.exists(output_dir): + Path(output_dir).mkdir(mode=0o750, exist_ok=True) + else: + shutil.rmtree(output_dir, ignore_errors=True) + Path(output_dir).mkdir(mode=0o750, exist_ok=True) + return output_dir + + +def make_dump_dirs(): + dump_file_name, dump_file_name_body = "dump.pkl", "dump" + dump_root_dir = DumpUtil.dump_path if DumpUtil.dump_path else "./" + tag_dir = os.path.join(dump_root_dir, DumpUtil.dump_dir_tag + f'_v{__version__}') + Path(tag_dir).mkdir(mode=0o750, parents=True, exist_ok=True) + DumpUtil.dump_dir = tag_dir + dump_file_path = os.path.join(tag_dir, dump_file_name) + DumpUtil.set_dump_path(dump_file_path) + + +def check_writable(dump_file): + if not os.access(dump_file, os.W_OK): + print_error_log( + 'The path {} does not have permission to write. 
Please check the path permission'.format( + dump_file)) + raise DumpException(DumpException.INVALID_PATH_ERROR) + diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/__init__.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py new file mode 100644 index 0000000000000000000000000000000000000000..0de75ffe9af22a6b4ecb7b44148eefd00357b68b --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/hook_module.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + + +import functools + +import torch +import torch.nn as nn +import torch.utils.hooks as full_hooks + +module_count = {} + + +class HOOKModule(nn.Module): + + def __init__(self, hook) -> None: + super(HOOKModule, self).__init__() + self.has_overflow = False + self.input_args = tuple() + self.input_kwargs = dict() + prefix = "" + if hasattr(self, "prefix_op_name_"): + prefix = self.prefix_op_name_ + + if prefix not in module_count: + module_count[prefix] = 1 + prefix += '0_' + else: + module_count[prefix] += 1 + prefix = prefix + str(module_count[prefix] - 1) + '_' + + self.register_forward_hook(hook(prefix + "forward")) + self.register_backward_hook(hook(prefix + "backward")) + + def __call__(self, *input, **kwargs): + full_backward_hooks, non_full_backward_hooks = [], [] + if len(self._backward_hooks) > 0: + full_backward_hooks, non_full_backward_hooks = self._get_backward_hooks() + for hook in self._forward_pre_hooks.values(): + result = hook(self, input) + if result is not None: + if not isinstance(result, tuple): + result = (result,) + input = result + bw_hook = None + if len(full_backward_hooks) > 0: + bw_hook = full_hooks.BackwardHook(self, full_backward_hooks) + input = bw_hook.setup_input_hook(input) + self.input_args = input + self.input_kwargs = kwargs + if torch._C._get_tracing_state(): + result = self._slow_forward(*input, **kwargs) + else: + result = self.forward(*input, **kwargs) + for hook in self._forward_hooks.values(): + hook_result = hook(self, input, result) + if hook_result is not None: + result = hook_result + if bw_hook: + result = bw_hook.setup_output_hook(result) + if len(non_full_backward_hooks) > 0: + var = result + while not isinstance(var, torch.Tensor): + if isinstance(var, dict): + var = next((v for v in var.values() if isinstance(v, torch.Tensor))) + elif isinstance(var, (list, tuple)): + if var: + var = var[0] + else: + return result + else: + return result + grad_fn = var.grad_fn + if grad_fn is not None: + for hook in 
non_full_backward_hooks: + wrapper = functools.partial(hook, self) + functools.update_wrapper(wrapper, hook) + grad_fn.register_hook(wrapper) + self._maybe_warn_non_full_backward_hook(input, result, grad_fn) + return result diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..85a5b3516d861a8a55ca1e2f64cc89d189e5b127 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/register_hook.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import functools +import os + +import torch + +from . import wrap_torch, wrap_functional, wrap_tensor, wrap_vf +from .hook_module import HOOKModule +from .wrap_functional import remove_dropout +from ..common.utils import check_file_or_directory_path, print_error_log, CompareException, Const, \ + print_info_log, print_warn_log, get_process_rank +from ..dump.utils import make_dump_dirs, DumpUtil +from ..overflow_check.utils import OverFlowUtil + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + from . 
import wrap_npu_custom + +make_dir_flag = True + + +def initialize_hook(hook): + wrap_tensor.wrap_tensor_ops_and_bind(hook) + for attr_name in dir(wrap_tensor.HOOKTensor): + if attr_name.startswith("wrap_"): + setattr(torch.Tensor, attr_name[5:], getattr(wrap_tensor.HOOKTensor, attr_name)) + + wrap_torch.wrap_torch_ops_and_bind(hook) + for attr_name in dir(wrap_torch.HOOKTorchOP): + if attr_name.startswith("wrap_"): + setattr(torch, attr_name[5:], getattr(wrap_torch.HOOKTorchOP, attr_name)) + + wrap_functional.wrap_functional_ops_and_bind(hook) + for attr_name in dir(wrap_functional.HOOKFunctionalOP): + if attr_name.startswith("wrap_"): + setattr(torch.nn.functional, attr_name[5:], getattr(wrap_functional.HOOKFunctionalOP, attr_name)) + + wrap_vf.wrap_vf_ops_and_bind(hook) + for attr_name in dir(wrap_vf.HOOKVfOP): + if attr_name.startswith("wrap_"): + setattr(torch._VF, attr_name[5:], getattr(wrap_vf.HOOKVfOP, attr_name)) + + if not is_gpu: + wrap_npu_custom.wrap_npu_ops_and_bind(hook) + for attr_name in dir(wrap_npu_custom.HOOKNpuOP): + if attr_name.startswith("wrap_"): + setattr(torch_npu, attr_name[5:], getattr(wrap_npu_custom.HOOKNpuOP, attr_name)) + +def add_clear_overflow(func): + first_module = True + def clear_overflow_wrapper(*args, **kwargs): + nonlocal first_module + if first_module: + torch_npu._C._clear_overflow_npu() + first_module = False + return func(*args, **kwargs) + return clear_overflow_wrapper + + +def register_hook(model, hook, **kwargs): + print_info_log("Please disable dataloader shuffle before running the program.") + OverFlowUtil.overflow_nums = kwargs.get('overflow_nums', 1) + dump_mode, dump_config_file = init_dump_config(kwargs) + if dump_mode == 'acl': + DumpUtil.dump_switch_mode = dump_mode + DumpUtil.dump_config = dump_config_file + register_hook_core(hook, **kwargs) + + +def register_hook_core(hook, **kwargs): + global make_dir_flag + + pid = os.getpid() + need_clear = True + if make_dir_flag: + make_dump_dirs() + make_dir_flag = 
False + hook_name = hook.__name__ + + if "overflow_check" in hook_name and not is_gpu: + if hasattr(torch_npu._C, "_enable_overflow_npu"): + torch_npu._C._enable_overflow_npu() + print_info_log("Enable overflow function success.") + else: + print_warn_log("Api '_enable_overflow_npu' is not exist, " + "the overflow detection function on milan platform maybe not work! " + "please check the version of software torch_npu.") + # In NPU scene, clear the overflow flag before overflow detection + if need_clear: + HOOKModule.__init__ = add_clear_overflow(HOOKModule.__init__) + elif "acc_cmp_dump" in hook_name: + remove_dropout() + + print_info_log("Start mounting the {} hook function to the model.".format(hook_name)) + hook = functools.partial(hook, dump_step=0, pid=pid) + print_info_log("The {} hook function is successfully mounted to the model.".format(hook_name)) + + initialize_hook(hook) + + +def init_dump_config(kwargs): + dump_mode = kwargs.get('dump_mode', "api") + dump_config = kwargs.get('dump_config') + dump_config_file = '' + if dump_mode not in Const.SUPPORT_DUMP_MODE: + print_error_log("dump_mode only support %s" % Const.SUPPORT_DUMP_MODE) + raise CompareException(CompareException.INVALID_PARAM_ERROR) + if dump_mode == "acl": + if dump_config is None: + print_error_log("dump_mode is acl mode, dump_config must be configured.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + dump_config_file = os.path.realpath(dump_config) + check_file_or_directory_path(dump_config_file) + if not dump_config.endswith(".json"): + print_error_log("dump_config must be configure json file.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return dump_mode, dump_config_file diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..cbc3dd21611abfa2c0ff22d217d367faee7dfac5 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml @@ -0,0 +1,1054 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# List of ops that register hooks + +functional: + - conv1d + - conv2d + - conv3d + - conv_transpose1d + - conv_transpose2d + - conv_transpose3d + - conv_tbc + - avg_pool1d + - avg_pool2d + - avg_pool3d + - fractional_max_pool2d_with_indices + - fractional_max_pool2d + - fractional_max_pool3d_with_indices + - fractional_max_pool3d + - max_pool1d_with_indices + - max_pool1d + - max_pool2d_with_indices + - max_pool2d + - max_pool3d_with_indices + - max_pool3d + - max_unpool1d + - max_unpool2d + - max_unpool3d + - lp_pool2d + - lp_pool1d + - adaptive_max_pool1d_with_indices + - adaptive_max_pool1d + - adaptive_max_pool2d_with_indices + - adaptive_max_pool2d + - adaptive_max_pool3d_with_indices + - adaptive_max_pool3d + - adaptive_avg_pool1d + - adaptive_avg_pool2d + - adaptive_avg_pool3d + - dropout + - alpha_dropout + - dropout2d + - dropout3d + - feature_alpha_dropout + - threshold + - threshold_ + - relu + - relu_ + - glu + - hardtanh + - hardtanh_ + - relu6 + - elu + - elu_ + - selu + - selu_ + - celu + - celu_ + - leaky_relu + - leaky_relu_ + - prelu + - rrelu + - rrelu_ + - logsigmoid + - gelu + - hardshrink + - tanhshrink + - softsign + 
- softplus + - softmin + - softmax + - gumbel_softmax + - log_softmax + - softshrink + - tanh + - sigmoid + - hardsigmoid + - linear + - bilinear + - silu + - hardswish + - embedding + - embedding_bag + - batch_norm + - instance_norm + - layer_norm + - group_norm + - local_response_norm + - ctc_loss + - nll_loss + - poisson_nll_loss + - gaussian_nll_loss + - kl_div + - cross_entropy + - binary_cross_entropy + - binary_cross_entropy_with_logits + - smooth_l1_loss + - l1_loss + - mse_loss + - margin_ranking_loss + - hinge_embedding_loss + - multilabel_margin_loss + - soft_margin_loss + - multilabel_soft_margin_loss + - cosine_embedding_loss + - multi_margin_loss + - pixel_shuffle + - pixel_unshuffle + - channel_shuffle + - upsample + - interpolate + - upsample_nearest + - upsample_bilinear + - grid_sample + - affine_grid + - pad + - pairwise_distance + - pdist + - cosine_similarity + - one_hot + - triplet_margin_loss + - triplet_margin_with_distance_loss + - normalize + - unfold + - fold + - multi_head_attention_forward + +tensor: + - __add__ + - __and__ + - __bool__ + - __div__ + - __eq__ + - __ge__ + - __gt__ + - __getitem__ + - __iadd__ + - __iand__ + - __idiv__ + - __ifloordiv__ + - __ilshift__ + - __imod__ + - __imul__ + - __ior__ + - __irshift__ + - __isub__ + - __ixor__ + - __lshift__ + - __matmul__ + - __mod__ + - __mul__ + - __nonzero__ + - __or__ + - __radd__ + - __rmul__ + - __rshift__ + - __sub__ + - __truediv__ + - __xor__ + - abs + - abs_ + - absolute + - absolute_ + - acos + - acos_ + - acosh + - acosh_ + - add + - add_ + - addbmm + - addbmm_ + - addcdiv + - addcdiv_ + - addcmul + - addcmul_ + - addmm + - addmm_ + - addmv + - addmv_ + - addr + - addr_ + - align_as + - align_to + - all + - allclose + - amax + - amin + - angle + - any + - arccos + - arccos_ + - arccosh + - arccosh_ + - arcsin + - arcsin_ + - arcsinh + - arcsinh_ + - arctan + - arctan_ + - arctanh + - arctanh_ + - argmax + - argmin + - argsort + - asin + - asin_ + - asinh + - asinh_ + - 
atan + - atan2 + - atan2_ + - atan_ + - atanh + - atanh_ + - baddbmm + - baddbmm_ + - bernoulli + - bernoulli_ + - bincount + - bitwise_and + - bitwise_and_ + - bitwise_not + - bitwise_not_ + - bitwise_or + - bitwise_or_ + - bitwise_xor + - bitwise_xor_ + - bmm + - broadcast_to + - cauchy_ + - ceil + - ceil_ + - cholesky + - chunk + - clamp + - cholesky_solve + - cholesky_inverse + - clamp_ + - clamp_max + - clamp_max_ + - clip + - clamp_min + - clamp_min_ + - clip_ + - copysign + - copysign_ + - cos + - cos_ + - cosh + - cosh_ + - count_nonzero + - cummax + - cummin + - cumprod + - cumprod_ + - cumsum + - cumsum_ + - deg2rad + - deg2rad_ + - det + - diag + - diag_embed + - diagflat + - diagonal + - diff + - dist + - digamma + - digamma_ + - div + - div_ + - divide + - divide_ + - dot + - eig + - eq + - eq_ + - erf + - equal + - erf_ + - erfc + - erfc_ + - erfinv + - erfinv_ + - exp + - exp2 + - exp2_ + - expm1 + - exp_ + - expm1_ + - exponential_ + - fill_ + - fix + - fill_diagonal_ + - fix_ + - flip + - fliplr + - flatten + - flipud + - float_power + - float_power_ + - floor + - floor_ + - floor_divide + - floor_divide_ + - fmax + - fmin + - fmod + - fmod_ + - frac + - frac_ + - gather + - gcd + - gcd_ + - ge + - ge_ + - geometric_ + - geqrf + - ger + - greater + - greater_ + - gt + - gt_ + - greater_equal + - greater_equal_ + - hardshrink + - heaviside + - heaviside_ + - histc + - hypot + - hypot_ + - igamma + - igamma_ + - igammac + - igammac_ + - index_add + - index_add_ + - inverse + - index_copy + - index_copy_ + - index_fill + - index_fill_ + - index_put + - index_put_ + - inner + - index_select + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - isreal + - kron + - kthvalue + - lcm + - lcm_ + - ldexp + - ldexp_ + - le + - le_ + - lerp + - lerp_ + - where + - less + - less_ + - less_equal + - less_equal_ + - lgamma + - lgamma_ + - log + - log10 + - log10_ + - log1p + - log1p_ + - log2 + - log2_ + - log_ + - log_normal_ + - log_softmax 
+ - logcumsumexp + - logdet + - logaddexp + - logaddexp2 + - logical_and + - logical_and_ + - logical_not + - logit + - logical_not_ + - logical_or + - logical_or_ + - logical_xor + - logical_xor_ + - logit_ + - logsumexp + - lstsq + - lt + - lt_ + - lu_solve + - map2_ + - map_ + - masked_fill + - matmul + - masked_fill_ + - masked_scatter + - masked_scatter_ + - masked_select + - matrix_exp + - max + - maximum + - mean + - matrix_power + - median + - min + - minimum + - mm + - mode + - msort + - mul + - mul_ + - multinomial + - multiply + - multiply_ + - mv + - mvlgamma + - mvlgamma_ + - nansum + - narrow + - narrow_copy + - ne + - ne_ + - neg + - neg_ + - negative + - negative_ + - nonzero + - normal_ + - not_equal + - not_equal_ + - permute + - pinverse + - polygamma + - pow + - pow_ + - polygamma_ + - prelu + - prod + - put_ + - rad2deg + - rad2deg_ + - ravel + - real + - reciprocal + - reciprocal_ + - relu + - relu_ + - remainder + - repeat_interleave + - reshape + - remainder_ + - renorm + - renorm_ + - repeat + - reshape_as + - resize_ + - resize_as_ + - roll + - rot90 + - round + - round_ + - rsqrt + - rsqrt_ + - scatter + - scatter_ + - scatter_add + - scatter_add_ + - select + - sgn + - sgn_ + - sigmoid + - sigmoid_ + - sign + - sign_ + - signbit + - sin + - sin_ + - sinc + - sinc_ + - sinh + - sinh_ + - slogdet + - smm + - softmax + - solve + - sort + - split_with_sizes + - sqrt + - sqrt_ + - square + - square_ + - squeeze + - squeeze_ + - sspaddmm + - std + - sub + - sub_ + - sum + - sum_to_size + - svd + - symeig + - t + - t_ + - take + - tan + - tan_ + - tanh + - tanh_ + - tensor_split + - tile + - topk + - transpose + - transpose_ + - triangular_solve + - tril + - tril_ + - triu + - true_divide + - triu_ + - true_divide_ + - trunc + - trunc_ + - type_as + - unbind + - unflatten + - unfold + - unsafe_chunk + - unsqueeze + - unsafe_split + - unsafe_split_with_sizes + - var + - vdot + - unsqueeze_ + - view_as + - xlogy + - xlogy_ + +torch: + - 
_adaptive_avg_pool2d + - _add_relu + - _add_relu_ + - _aminmax + - _batch_norm_impl_index + - _convolution + - abs + - abs_ + - absolute + - acos + - acos_ + - acosh + - acosh_ + - adaptive_avg_pool1d + - adaptive_max_pool1d + - add + - addbmm + - addcdiv + - addcmul + - addmm + - addmv + - addmv_ + - addr + - amax + - affine_grid_generator + - align_tensors + - all + - alpha_dropout + - amin + - alpha_dropout_ + - angle + - any + - arange + - arccos + - arccos_ + - arccosh + - arccosh_ + - arcsin + - arcsin_ + - arcsinh + - arcsinh_ + - arctan + - arctan_ + - arctanh + - arctanh_ + - argmax + - argmin + - argsort + - asin + - asin_ + - asinh + - asinh_ + - atan + - atan2 + - atan_ + - atanh + - atanh_ + - atleast_1d + - atleast_2d + - atleast_3d + - avg_pool1d + - baddbmm + - bartlett_window + - batch_norm_backward_elemt + - batch_norm_backward_reduce + - batch_norm_elemt + - batch_norm_gather_stats + - batch_norm_gather_stats_with_counts + - bernoulli + - batch_norm_stats + - batch_norm_update_stats + - bilinear + - bincount + - binomial + - binary_cross_entropy_with_logits + - bitwise_and + - bitwise_not + - bitwise_or + - bitwise_xor + - blackman_window + - block_diag + - bmm + - broadcast_tensors + - broadcast_to + - cartesian_prod + - cat + - cdist + - ceil + - ceil_ + - celu + - celu_ + - chain_matmul + - channel_shuffle + - cholesky + - cholesky_inverse + - cholesky_solve + - choose_qparams_optimized + - chunk + - clamp + - clamp_ + - clamp_max + - clamp_max_ + - clamp_min + - clamp_min_ + - clip + - clip_ + - clone + - column_stack + - combinations + - constant_pad_nd + - conv1d + - conv2d + - conv3d + - conv_tbc + - conv_transpose1d + - conv_transpose2d + - conv_transpose3d + - cos + - convolution + - copysign + - cos_ + - cosh + - cosh_ + - cosine_embedding_loss + - cosine_similarity + - count_nonzero + - cross + - ctc_loss + - cummax + - cummin + - cumprod + - cumsum + - deg2rad + - deg2rad_ + - det + - diag + - diag_embed + - diff + - diagflat + - 
diagonal + - digamma + - dist + - div + - divide + - dot + - dropout + - dropout_ + - dsmm + - dstack + - eig + - einsum + - embedding + - embedding_bag + - embedding_renorm_ + - eq + - equal + - erf + - erf_ + - erfc + - erfc_ + - erfinv + - exp + - exp2 + - exp2_ + - exp_ + - expm1 + - expm1_ + - eye + - feature_dropout + - feature_alpha_dropout + - feature_alpha_dropout_ + - feature_dropout_ + - fix + - fill_ + - fix_ + - flatten + - flip + - fliplr + - flipud + - float_power + - floor + - floor_ + - floor_divide + - fmax + - fmin + - fmod + - frac + - frac_ + - full + - frobenius_norm + - full_like + - gather + - gcd + - gcd_ + - ge + - geqrf + - ger + - greater + - greater_equal + - grid_sampler + - grid_sampler_2d + - group_norm + - grid_sampler_3d + - gru + - gru_cell + - gt + - hamming_window + - hann_window + - hardshrink + - heaviside + - hinge_embedding_loss + - histc + - hsmm + - hspmm + - hstack + - hypot + - igamma + - igammac + - index_add + - index_copy + - inner + - index_fill + - index_put + - index_put_ + - index_select + - instance_norm + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - istft + - kaiser_window + - kl_div + - kron + - kthvalue + - layer_norm + - lcm + - lcm_ + - ldexp + - ldexp_ + - le + - lerp + - less + - less_equal + - lgamma + - linspace + - log + - log10 + - log10_ + - log1p + - log1p_ + - log2 + - log2_ + - log_softmax + - log_ + - logaddexp + - logaddexp2 + - logcumsumexp + - logdet + - logical_and + - logical_not + - logical_or + - logical_xor + - logit + - logit_ + - logspace + - logsumexp + - lstm + - lstm_cell + - lstsq + - lt + - lu_solve + - masked_fill + - margin_ranking_loss + - masked_scatter + - masked_select + - matrix_exp + - matmul + - matrix_power + - matrix_rank + - max + - max_pool1d + - max_pool2d + - max_pool1d_with_indices + - max_pool3d + - maximum + - mean + - median + - min + - minimum + - mm + - mode + - moveaxis + - movedim + - msort + - mul + - multinomial + - multiply + - 
mv + - mvlgamma + - nan_to_num + - nan_to_num_ + - nanmedian + - nansum + - narrow + - native_batch_norm + - native_group_norm + - narrow_copy + - native_layer_norm + - native_norm + - ne + - neg + - negative + - neg_ + - negative_ + - nextafter + - nonzero + - norm_except_dim + - normal + - not_equal + - nuclear_norm + - ones_like + - pairwise_distance + - pdist + - pinverse + - pixel_shuffle + - pixel_unshuffle + - poisson + - poisson_nll_loss + - polar + - polygamma + - pow + - prelu + - prod + - rad2deg + - promote_types + - rad2deg_ + - range + - ravel + - real + - reciprocal + - relu + - reciprocal_ + - relu_ + - remainder + - renorm + - repeat_interleave + - reshape + - resize_as_ + - roll + - rot90 + - round + - round_ + - rrelu + - rrelu_ + - rsqrt + - row_stack + - rsqrt_ + - rsub + - saddmm + - scalar_tensor + - scatter + - select + - scatter_add + - searchsorted + - selu + - selu_ + - sgn + - sigmoid + - sigmoid_ + - sign + - signbit + - sin + - sin_ + - sinc + - sinc_ + - sinh + - sinh_ + - slogdet + - smm + - softmax + - solve + - sort + - sparse_coo_tensor + - square + - split_with_sizes + - spmm + - sqrt + - sqrt_ + - square_ + - squeeze + - sspaddmm + - stack + - std + - std_mean + - sub + - subtract + - sum + - svd + - swapaxes + - swapdims + - symeig + - t + - take + - tan + - tan_ + - tanh + - tanh_ + - tensordot + - tensor_split + - threshold + - threshold_ + - tile + - topk + - transpose + - trapz + - triangular_solve + - tril + - tril_indices + - triplet_margin_loss + - triu + - triu_indices + - true_divide + - trunc + - trunc_ + - unique_consecutive + - xlogy + - unbind + - unique_dim + - unsafe_chunk + - unsafe_split + - vander + - var + - vdot + - unsafe_split_with_sizes + - unsqueeze + - var_mean + - vstack + - where + - xlogy_ + +_VF: + - lstm + +torch_npu: + - one_ + - npu_sort_v2 + - npu_transpose + - npu_broadcast + - npu_dtype_cast + - empty_with_format + - npu_one_hot + - npu_stride_add + - npu_ps_roi_pooling + - npu_roi_align + - 
npu_nms_v4 + - npu_iou + - npu_nms_with_mask + - npu_pad + - npu_bounding_box_encode + - npu_bounding_box_decode + - npu_batch_nms + - npu_slice + - _npu_dropout + - npu_indexing + - npu_ifmr + - npu_max + - npu_scatter + - npu_layer_norm_eval + - npu_alloc_float_status + - npu_get_float_status + - npu_clear_float_status + - npu_confusion_transpose + - npu_bmmV2 + - fast_gelu + - npu_sub_sample + - npu_deformable_conv2d + - npu_mish + - npu_anchor_response_flags + - npu_yolo_boxes_encode + - npu_grid_assign_positive + - npu_normalize_batch + - npu_masked_fill_range + - npu_linear + - npu_bert_apply_adam + - npu_giou + - npu_ciou + - npu_ciou_backward + - npu_diou + - npu_diou_backward + - npu_sign_bits_pack + - npu_sign_bits_unpack + - npu_flash_attention \ No newline at end of file diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..c6e119a8bf319e12d13bb0e1f6e33ef47e1bf713 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_functional.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, print_info_log + +def remove_dropout(): + if torch.__version__ > "1.8": + print_info_log("For precision comparison, the probability p in the dropout method is set to 0.") + import torch.nn.functional as F + from torch import _VF + from torch.overrides import has_torch_function_unary, handle_torch_function + + def function_dropout(input: torch.Tensor, p: float = 0.5, training: bool = True, + inplace: bool = False) -> torch.Tensor: + if has_torch_function_unary(input): + return handle_torch_function(function_dropout, (input,), input, p=0., training=training, inplace=inplace) + if p < 0.0 or p > 1.0: + raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) + return _VF.dropout_(input, 0., training) if inplace else _VF.dropout(input, 0., training) + + + def function_dropout2d(input: torch.Tensor, p: float = 0.5, training: bool = True, + inplace: bool = False) -> torch.Tensor: + if has_torch_function_unary(input): + return handle_torch_function(function_dropout2d, (input,), input, p=0., training=training, inplace=inplace) + if p < 0.0 or p > 1.0: + raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) + return _VF.feature_dropout_(input, 0., training) if inplace else _VF.feature_dropout(input, 0., training) + + + def function_dropout3d(input: torch.Tensor, p: float = 0.5, training: bool = True, + inplace: bool = False) -> torch.Tensor: + if has_torch_function_unary(input): + return handle_torch_function(function_dropout3d, (input,), input, p=0., training=training, inplace=inplace) + if p < 0.0 or p > 1.0: + raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) + return _VF.feature_dropout_(input, 0., training) if inplace else _VF.feature_dropout(input, 0., training) + + F.dropout = function_dropout + F.dropout2d = 
function_dropout2d + F.dropout3d = function_dropout3d + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with open(yaml_path, 'r') as f: + WrapFunctionalOps = yaml.safe_load(f).get('functional') + +for f in dir(torch.nn.functional): + locals().update({f: getattr(torch.nn.functional, f)}) + + +def get_functional_ops(): + global WrapFunctionalOps + _all_functional_ops = dir(torch.nn.functional) + return set(WrapFunctionalOps) & set(_all_functional_ops) + + +class HOOKFunctionalOP(object): + pass + + +class FunctionalOPTemplate(HOOKModule): + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Functional_" + str(op_name) + "_" + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return eval(self.op_name_)(*args, **kwargs) + + +def wrap_functional_op(op_name, hook): + def functional_op_template(*args, **kwargs): + return FunctionalOPTemplate(op_name, hook)(*args, **kwargs) + + return functional_op_template + + +def wrap_functional_ops_and_bind(hook): + _functional_ops = get_functional_ops() + for op_name in _functional_ops: + setattr(HOOKFunctionalOP, "wrap_" + op_name, wrap_functional_op(op_name, hook)) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py new file mode 100644 index 0000000000000000000000000000000000000000..4e127c87b7b1ddb170b3d89e4c0843521f93c68b --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_npu_custom.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os +import torch +import torch_npu +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, torch_without_guard_version + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with open(yaml_path, 'r') as f: + WrapNpuOps = yaml.safe_load(f).get('torch_npu') + + +class HOOKNpuOP(object): + pass + + +class NpuOPTemplate(HOOKModule): + + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "NPU_" + str(op_name) + "_" + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + if torch_without_guard_version: + return getattr(torch.ops.npu, str(self.op_name_))(*args, **kwargs) + else: + return getattr(torch_npu._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) + +def wrap_npu_op(op_name, hook): + + def npu_op_template(*args, **kwargs): + return NpuOPTemplate(op_name, hook)(*args, **kwargs) + + return npu_op_template + + +def wrap_npu_ops_and_bind(hook): + _npu_ops = WrapNpuOps + for op_name in _npu_ops: + setattr(HOOKNpuOP, "wrap_" + str(op_name), wrap_npu_op(op_name, hook)) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..f87b400dc9f77c30101cd6286e3fb89cb1282cf9 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_tensor.py 
@@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, parameter_adapter + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with open(yaml_path, 'r') as f: + WrapTensorOps = yaml.safe_load(f).get('tensor') + + +def get_tensor_ops(): + global WrapTensorOps + _tensor_ops = dir(torch._C._TensorBase) + return set(WrapTensorOps) & set(_tensor_ops) + + +class HOOKTensor(object): + pass + + +class TensorOPTemplate(HOOKModule): + + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Tensor_" + str(op_name) + "_" + super().__init__(hook) + + @torch_device_guard + @parameter_adapter + def forward(self, *args, **kwargs): + return getattr(torch._C._TensorBase, str(self.op_name_))(*args, **kwargs) + + +def wrap_tensor_op(op_name, hook): + + def tensor_op_template(*args, **kwargs): + return TensorOPTemplate(op_name, hook)(*args, **kwargs) + + return tensor_op_template + + +def wrap_tensor_ops_and_bind(hook): + _tensor_ops = get_tensor_ops() + for op_name in _tensor_ops: + setattr(HOOKTensor, "wrap_" + str(op_name), wrap_tensor_op(op_name, hook)) diff --git 
a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..e69a89d9efdeece2e05485752bb9b01b4504aa0f --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_torch.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with open(yaml_path, 'r') as f: + WrapTorchOps = yaml.safe_load(f).get('torch') + + +def get_torch_ops(): + global WrapTorchOps + _torch_ops = dir(torch._C._VariableFunctionsClass) + return set(WrapTorchOps) & set(_torch_ops) + + +class HOOKTorchOP(object): + pass + + +class TorchOPTemplate(HOOKModule): + + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Torch_" + str(op_name) + "_" + super().__init__(hook) + + def input_param_need_adapt(self): + special_op_list = ["broadcast_tensors"] + for item in special_op_list: + if item in self.op_name_: + return True + return False + + def einsum_adapt(self, *args): + if len(args) < 2: + raise ValueError('einsum(): must specify the equation string and at least one operand, ' + 'or at least one operand and its subscripts list') + equation = None + operands = None + if isinstance(args[0], torch.Tensor): + def parse_subscript(n: int) -> str: + if n == Ellipsis: + return '...' 
+ if n >= 0 and n < 26: + return chr(ord('A') + n) + if n >= 26 and n < 52: + return chr(ord('a') + n - 26) + raise ValueError('einsum(): subscript in subscript list is not within the valid range [0, 52]') + equation = ','.join(''.join(parse_subscript(s) for s in l) for l in args[1::2]) + + if len(args) % 2 == 1: + equation += '->' + ''.join(parse_subscript(s) for s in args[-1]) + operands = args[:-1:2] + else: + operands = args[::2] + else: + equation = args[0] + operands = args[1:] + + if len(operands) == 1 and isinstance(operands[0], (list, tuple)): + _operands = operands[0] + return self.einsum_adapt(equation, *_operands) + return equation, operands + + @torch_device_guard + def forward(self, *args, **kwargs): + if self.input_param_need_adapt(): + return getattr(torch._C._VariableFunctionsClass, str(self.op_name_))(args, **kwargs) + else: + if self.op_name_ == 'einsum': + args = self.einsum_adapt(*args) + return getattr(torch._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) + + +def wrap_torch_op(op_name, hook): + + def torch_op_template(*args, **kwargs): + return TorchOPTemplate(op_name, hook)(*args, **kwargs) + + return torch_op_template + + +def wrap_torch_ops_and_bind(hook): + _torch_ops = get_torch_ops() + for op_name in _torch_ops: + setattr(HOOKTorchOP, "wrap_" + op_name, wrap_torch_op(op_name, hook)) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py new file mode 100644 index 0000000000000000000000000000000000000000..b01f28fea765e947c64f160ce5f1a1e1d69fb9c5 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/wrap_vf.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with open(yaml_path, 'r') as f: + WrapVfOps = yaml.safe_load(f).get('_VF') + + +def get_vf_ops(): + global WrapVfOps + # _all_functional_ops = dir(torch.nn.functional) + # assert set(WrapFunctionalOps) <= set(_all_functional_ops) + return WrapVfOps + + +class HOOKVfOP(object): + pass + + +class VfOPTemplate(HOOKModule): + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "VF_" + str(op_name) + "_" + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return getattr(torch._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) + + +def wrap_vf_op(op_name, hook): + def vf_op_template(*args, **kwargs): + return VfOPTemplate(op_name, hook)(*args, **kwargs) + + return vf_op_template + + +def wrap_vf_ops_and_bind(hook): + _vf_ops = get_vf_ops() + for op_name in _vf_ops: + setattr(HOOKVfOP, "wrap_" + op_name, wrap_vf_op(op_name, hook)) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/__init__.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..204f3de46098b63267f56c56c46a4aa28cebafb0 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/info_dump.py @@ -0,0 +1,246 @@ +import inspect +import fcntl +import json +import os +import torch +import threading + +import numpy as np + +from ..common.utils import print_error_log + + +special_torch_object = ["memory_format"] +lock = threading.Lock() + + +def write_npy(file_path, tensor): + saved_tensor = tensor.contiguous().cpu().detach() + if tensor.dtype == torch.bfloat16: + saved_numpy = saved_tensor.to(torch.float32).numpy() + else: + saved_numpy = saved_tensor.numpy() + if os.path.exists(file_path): + raise ValueError(f"File {file_path} already exists") + np.save(file_path, saved_numpy) + full_path = os.path.abspath(file_path) + return full_path + + +class APIInfo: + def __init__(self, api_name, is_forward, save_real_data=False): + self.rank = os.getpid() + self.api_name = api_name + self.save_real_data = save_real_data + self.torch_object_key = {'device': self.analyze_device_in_kwargs, 'dtype': self.analyze_dtype_in_kwargs} + self.is_forward = is_forward + self.args_num = 0 + + def analyze_element(self, element): + if isinstance(element, (list, tuple)): + out = [] + for item in element: + out.append(self.analyze_element(item)) + elif isinstance(element, dict): + out = {} + for key, value in element.items(): + if key in self.torch_object_key.keys(): + fun = self.torch_object_key[key] + out[key] = fun(value) + elif key in special_torch_object: + continue + else: + out[key] = self.analyze_element(value) + + elif isinstance(element, torch.Tensor): + out = self.analyze_tensor(element, 
self.save_real_data) + + elif self.is_builtin_class(element): + out = self.analyze_builtin(element) + else: + msg = f"Type {type(element)} is unsupported at analyze_element" + print_error_log(msg) + + raise NotImplementedError(msg) + return out + + def analyze_tensor(self, arg, save_real_data): + single_arg = {} + if not save_real_data: + single_arg.update({'type': 'torch.Tensor'}) + single_arg.update({'dtype': str(arg.dtype)}) + single_arg.update({'shape': arg.shape}) + single_arg.update({'Max': self.transfer_types(self.get_tensor_extremum(arg, 'max'), str(arg.dtype))}) + single_arg.update({'Min': self.transfer_types(self.get_tensor_extremum(arg, 'min'), str(arg.dtype))}) + single_arg.update({'requires_grad': arg.requires_grad}) + + else: + dump_path = "./" + api_args = self.api_name + '*' + str(self.args_num) + if self.is_forward: + forward_real_data_path = os.path.join(dump_path, 'forward_real_data') + if not os.path.exists(forward_real_data_path): + os.makedirs(forward_real_data_path, 0o755) + + file_path = os.path.join(forward_real_data_path, f'{api_args}.npy') + else: + backward_real_data_path = os.path.join(dump_path, 'backward_real_data') + if not os.path.exists(backward_real_data_path): + os.makedirs(backward_real_data_path, 0o755) + file_path = os.path.join(backward_real_data_path, f'{api_args}.npy') + self.args_num += 1 + npy_path = write_npy(file_path, arg) + single_arg.update({'type': 'torch.Tensor'}) + single_arg.update({'datapath': npy_path}) + single_arg.update({'requires_grad': arg.requires_grad}) + return single_arg + + def analyze_builtin(self, arg): + single_arg = {} + if isinstance(arg, slice): + single_arg.update({'type': "slice"}) + single_arg.update({'value': [arg.start, arg.stop, arg.step]}) + else: + single_arg.update({'type': self.get_type_name(str(type(arg)))}) + single_arg.update({'value': arg}) + return single_arg + + def transfer_types(self, data, dtype): + if 'int' in dtype or 'bool' in dtype: + return int(data) + else: + return 
float(data) + + def is_builtin_class(self, element): + if element is None or isinstance(element, (bool, int, float, str, slice)): + return True + return False + + def analyze_device_in_kwargs(self, element): + single_arg = {} + single_arg.update({'type': 'torch.device'}) + if not isinstance(element, str): + + if hasattr(element, "index"): + device_value = element.type + ":" + str(element.index) + single_arg.update({'value': device_value}) + else: + device_value = element.type + else: + single_arg.update({'value': element}) + return single_arg + + def analyze_dtype_in_kwargs(self, element): + single_arg = {} + single_arg.update({'type': 'torch.dtype'}) + single_arg.update({'value': str(element)}) + return single_arg + + def get_tensor_extremum(self, data, operator): + if data.dtype is torch.bool: + if operator == 'max': + return True in data + elif operator == 'min': + return False not in data + if operator == 'max': + return torch._C._VariableFunctionsClass.max(data).item() + else: + return torch._C._VariableFunctionsClass.min(data).item() + + def get_type_name(self, name): + + left = name.index("'") + right = name.rindex("'") + return name[left + 1: right] + + +class ForwardAPIInfo(APIInfo): + def __init__(self, name, save_real_data, args, kwargs): + super().__init__(name, is_forward=True, save_real_data=save_real_data) + self.analyze_api_input(args, kwargs) + self.analyze_api_call_stack() + + def analyze_api_input(self, args, kwargs): + args_info_list = self.analyze_element(args) + kwargs_info_dict = self.analyze_element(kwargs) + self.api_info_struct = {self.api_name: {"args": args_info_list, "kwargs": kwargs_info_dict}} + + def analyze_api_call_stack(self): + stack_str = [] + for (_, path, line, func, code, _) in inspect.stack()[3:]: + if not code: continue + stack_line = " ".join([ + "File", ", ".join([path, " ".join(["line", str(line)]), " ".join(["in", func]), + " ".join(["\n", code[0].strip()])])]) + stack_str.append(stack_line) + self.stack_info_struct = 
{self.api_name: stack_str} + + +class BackwardAPIInfo(APIInfo): + def __init__(self, name, grads): + super().__init__(name, is_forward=False) + self.analyze_api_input(grads) + + def analyze_api_input(self, grads): + grads_info_list = self.analyze_element(grads) + self.grad_info_struct = {self.api_name: grads_info_list} + + +def write_api_info_json(api_info): + dump_path = "./" + rank = api_info.rank + if isinstance(api_info, ForwardAPIInfo): + file_path = os.path.join(dump_path, f'forward_info_{rank}.json') + stack_file_path = os.path.join(dump_path, f'stack_info_{rank}.json') + write_json(file_path, api_info.api_info_struct) + write_json(stack_file_path, api_info.stack_info_struct, indent=4) + + elif isinstance(api_info, BackwardAPIInfo): + file_path = os.path.join(dump_path, f'backward_info_{rank}.json') + write_json(file_path, api_info.grad_info_struct) + else: + raise ValueError(f"Invalid api_info type {type(api_info)}") + + +def write_json(file_path, data, indent=None): + if not os.path.exists(file_path): + with open(file_path, 'w') as f: + f.write("{\n}") + lock.acquire() + with open(file_path, 'a+') as f: + fcntl.flock(f, fcntl.LOCK_EX) + try: + f.seek(0, os.SEEK_END) + f.seek(f.tell() - 1, os.SEEK_SET) + f.truncate() + if f.tell() > 3: + f.seek(f.tell() - 1, os.SEEK_SET) + f.truncate() + f.write(',\n') + f.write(json.dumps(data, indent=indent)[1:-1] + '\n}') + except Exception as e: + raise ValueError(f"Json save failed:{e}") + finally: + fcntl.flock(f, fcntl.LOCK_UN) + lock.release() + + +def initialize_output_json(): + dump_path = os.path.realpath("./") + files = ['forward_info.json', 'backward_info.json', 'stack_info.json'] + + forward_real_data_path = os.path.join(dump_path, 'forward_real_data') + if os.path.exists(forward_real_data_path): + raise ValueError(f"file {forward_real_data_path} already exists, please remove it first") + else: + os.mkdir(forward_real_data_path, mode=0o750) + + backward_real_data_path = os.path.join(dump_path, 
'backward_real_data') + if os.path.exists(backward_real_data_path): + raise ValueError(f"file {backward_real_data_path} already exists, please remove it first") + else: + os.mkdir(backward_real_data_path, mode=0o750) + for file in files: + file_path = os.path.join(dump_path, file) + if os.path.exists(file_path): + raise ValueError(f"file {file_path} already exists, please remove it first or use a new dump path") \ No newline at end of file diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py new file mode 100644 index 0000000000000000000000000000000000000000..60693298e22a5c8c9528bdedd3cbdd1b2dd86133 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/overflow_check.py @@ -0,0 +1,180 @@ +import os +import glob +import torch + +from ..common.utils import print_warn_log, get_time, print_info_log +from ..dump.dump import forward_init_status, forward_acl_dump +from .utils import OverFlowUtil, dump_overflow +from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist +from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo +from ..dump import dump + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +backward_init_status = False +api_overflow = [] +forward_api_info = {} +backward_api_info = {} +FORWARD_REAL_DATA_PATH = os.path.join('./', 'forward_real_data') +BACKWARD_REAL_DATA_PATH = os.path.join('./', 'backward_real_data') + + +def check_overflow_environment(pid): + if not OverFlowUtil.get_overflow_check_switch(): + return False + if pid != os.getpid(): + return False + if is_gpu: + print_warn_log("Overflow detection is not supported in the GPU environment.") + return False + global backward_init_status + if backward_init_status or forward_init_status: + return False + return True + + +def 
check_data_overflow(x): + if isinstance(x, (tuple, list)) and x: + for i, item in enumerate(x): + if True == check_data_overflow(item): + return True + return False + else: + if isinstance(x, torch.Tensor) and x.numel() != 0 and x.dtype != torch.bool: + if len(x.shape) == 0: + tensor_max = x.cpu().detach().float().numpy().tolist() + tensor_min = tensor_max + else: + tensor_max = torch._C._VariableFunctionsClass.max(x).cpu().detach().float().numpy().tolist() + tensor_min = torch._C._VariableFunctionsClass.min(x).cpu().detach().float().numpy().tolist() + # inf + if tensor_max == float('inf') or tensor_min == float('-inf'): + return True + if x.dtype in [torch.float16, torch.float32, torch.bfloat16] and \ + (tensor_max == torch.finfo(x.dtype).max or tensor_min == torch.finfo(x.dtype).min): + return True + # nan + elif tensor_max != tensor_max or tensor_min != tensor_min: + return True + else: + return False + elif isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): + if x == float('inf') or x == float('-inf') or x != x: + return True + else: + return False + else: + return False + + +def check_path(apis, path): + return any(api in path for api in apis) + + +def overflow_check(name, **kwargs): + overflow_nums = OverFlowUtil.overflow_nums + pid = kwargs.get('pid') + dump_mode = DumpUtil.dump_switch_mode + if not pid: + return RuntimeError("Not get the specified process pid.") + + def overflowcheck_hook(module, in_feat, out_feat): + if not check_overflow_environment(pid): + return + rank = get_tensor_rank(in_feat, out_feat) + if DumpUtil.target_rank is not None: + if rank != DumpUtil.target_rank: + return + dump_path = create_dirs_if_not_exist(rank, DumpUtil.dump_path) + dump_dir = os.path.split(dump_path)[0] + global api_overflow + global forward_api_info + global backward_api_info + + module_name = name + if hasattr(torch_npu._C, '_npu_is_support_inf_nan') and torch_npu._C._npu_is_support_inf_nan(): + # backward API endwith backward + if 
module_name.endswith(Const.BACKWARD): + check_feat = in_feat + else: + check_feat = out_feat + module.has_overflow = check_data_overflow(check_feat) + else: + module.has_overflow = torch_npu._C._check_overflow_npu() + if not module.has_overflow: + if hasattr(module, 'input_args'): + del module.input_args + if hasattr(module, 'input_kwargs'): + del module.input_kwargs + if module.has_overflow and OverFlowUtil.check_overflow_dump_times(overflow_nums): + need_replicate = overflow_type_judge(in_feat, out_feat, module_name) + if need_replicate: + if module_name.endswith(Const.FORWARD): + forward_api_info.update({name: ForwardAPIInfo(name, True, module.input_args, module.input_kwargs)}) + api_overflow.append(module_name) + else: + api_overflow.append(module_name.replace("backward", "forward")) + backward_api_info.update({name: BackwardAPIInfo(name, out_feat)}) + OverFlowUtil.inc_overflow_dump_times() + dump_file_name = os.path.join(dump_dir, + "Overflow_info_{}_{}.pkl".format(get_time(), OverFlowUtil.real_overflow_dump_times)) + dump_overflow(module_name, in_feat, out_feat, dump_file_name) + dump.pkl_name = dump_file_name + + print_warn_log("[overflow {} times]: module name :'{}' is overflow and dump file is saved in '{}'." + .format(OverFlowUtil.real_overflow_dump_times, module_name, + os.path.realpath(dump_file_name))) + if dump_mode == "acl": + acl_dump(module, module_name) + dump.write_to_disk() + dump.api_list.clear() + # clear overflow flag for the next check + torch_npu._C._clear_overflow_npu() + if not OverFlowUtil.check_overflow_dump_times(overflow_nums): + for key in forward_api_info: + write_api_info_json(forward_api_info[key]) + for key in backward_api_info: + write_api_info_json(backward_api_info[key]) + raise ValueError("[overflow {} times]: dump file is saved in '{}'." 
+ .format(OverFlowUtil.real_overflow_dump_times, os.path.realpath(dump_file_name))) + return + + def delete_forward_npy(api_overflow_list, api_info): + for path in glob.glob(FORWARD_REAL_DATA_PATH + "/*.npy"): + if not check_path(api_overflow_list, path): + os.remove(os.path.abspath(path)) + for key in list(api_info.keys()): + if key not in api_overflow: + del forward_api_info[key] + + def overflow_type_judge(in_feat, out_feat, module_name): + if module_name.endswith(Const.BACKWARD): + check_feat = out_feat + else: + check_feat = in_feat + if check_data_overflow(check_feat): + print_warn_log("module name :'{}' is overflow and its inputs already has an overflow, so you need " + "to go back to find where the overflow started.".format(module_name)) + return False + elif not check_data_overflow(in_feat) and not check_data_overflow(out_feat): + print_warn_log("module name :'{}' is overflow and its inputs and outputs do not overflow, " + "so this is a process overflow".format(module_name)) + return False + else: + print_warn_log("module name :'{}' is overflow. Its input is normal and its output " + "is overflow.".format(module_name)) + return True + + def acl_dump(module, module_name): + if "forward" in module_name: + forward_acl_dump(module, module_name) + if "backward" in module_name: + print_info_log("The overflow is caused by backward operator {}. 
" + "You can use reverse acl dump(mode='acl') to get operator dump data.".format(module_name)) + + return overflowcheck_hook diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cff9c07efae812ff9462326be8de2d01758a7790 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/overflow_check/utils.py @@ -0,0 +1,76 @@ +import json +import os +import stat +import torch + +import numpy as np + +from ..common.utils import Const, check_switch_valid +from ..dump.dump import dump_stack_info, get_scalar_data_info, dump_data, \ + get_not_float_tensor_info, get_float_tensor_info +from ..dump.utils import DumpUtil, make_dump_data_dir + + +class OverFlowUtil(object): + overflow_check_switch = None + overflow_filter_switch = None + real_overflow_dump_times = 0 + overflow_nums = 1 + + @staticmethod + def set_overflow_check_switch(switch, filter_switch): + OverFlowUtil.overflow_check_switch = switch + OverFlowUtil.overflow_filter_switch = filter_switch + + @staticmethod + def get_overflow_check_switch(): + if OverFlowUtil.overflow_check_switch is None: + return True + return OverFlowUtil.overflow_check_switch == "ON" + + @staticmethod + def inc_overflow_dump_times(): + OverFlowUtil.real_overflow_dump_times += 1 + + @staticmethod + def check_overflow_dump_times(need_dump_times): + return OverFlowUtil.real_overflow_dump_times < need_dump_times + + +def set_overflow_check_switch(switch, filter_switch=Const.ON): + check_switch_valid(switch) + check_switch_valid(filter_switch) + + OverFlowUtil.set_overflow_check_switch(switch, filter_switch) + + +def dump_overflow(module_name, in_feat, out_feat, dump_file): + name_template = f"{module_name}" + "_{}" + DumpUtil.dump_data_dir = make_dump_data_dir(dump_file) + dump_stack_info(name_template, dump_file) + if "forward" in 
name_template: + _dump_tensor_completely(in_feat, name_template.format("input"), dump_file) + _dump_tensor_completely(out_feat, name_template.format("output"), dump_file) + else: + _dump_tensor_completely(in_feat, name_template.format("output"), dump_file) + _dump_tensor_completely(out_feat, name_template.format("input"), dump_file) + + +def _dump_tensor_completely(x, prefix, dump_file_name): + dump_flag = Const.DUMP_RATIO_MAX + 1 + if isinstance(x, (tuple, list)) and x: + for i, item in enumerate(x): + _dump_tensor_completely(item, "{}.{}".format(prefix, i), dump_file_name) + elif isinstance(x, torch.Tensor): + if x.numel() == 0 or len(x.shape) == 0 or not x.is_floating_point(): + if OverFlowUtil.overflow_filter_switch == Const.OFF: + data_info = get_not_float_tensor_info(x) + dump_data(dump_file_name, dump_flag, prefix, data_info) + else: + data_info = get_float_tensor_info(x) + dump_data(dump_file_name, dump_flag, prefix, data_info) + + elif OverFlowUtil.overflow_filter_switch == Const.OFF: + if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): + data_info = get_scalar_data_info(x) + dump_data(dump_file_name, dump_flag, prefix, data_info) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse.py new file mode 100644 index 0000000000000000000000000000000000000000..9f7a395c2e25a9f35be4d4d07b75c70cddf39a42 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse.py @@ -0,0 +1,4 @@ +from .parse_tool import cli + +if __name__ == '__main__': + cli.parse() diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/__init__.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/cli.py 
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..a751c159eb90a6d7e58426449e2dc511f5160d18 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/parse_tool/cli.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +from .lib.interactive_cli import InteractiveCli + + +def _run_interactive_cli(cli=None): + print("Interactive command mode") + if not cli: + cli = InteractiveCli() + try: + cli.cmdloop(intro="Start Parsing........") + except KeyboardInterrupt: + print("Exit parsing.......") + + +def parse(): + _run_interactive_cli() diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..90311ae94e98c1d11e4f44e5ff4f8243a0b8fa7c --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import setuptools +from pathlib import Path +import stat +import os + +VERSION = '3.2' + +def generate_ptdbg_ascend_version(): + ptdbg_ascend_root = Path(__file__).parent + version_path = ptdbg_ascend_root / "ptdbg_ascend" / "common" / "version.py" + if version_path.exists(): + version_path.unlink() + flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL + modes = stat.S_IWUSR | stat.S_IRUSR + with os.fdopen(os.open(version_path, flags, modes), 'w') as f: + f.write("__version__ = '{version}'\n".format(version = VERSION)) + +generate_ptdbg_ascend_version() + +setuptools.setup(name='ptdbg_ascend', + version=VERSION, + description='This is a pytorch precision comparison tools', + long_description='This is a pytorch precision comparison tools, include overflow detect tool', + packages=setuptools.find_packages(), + install_requires = [ + "wheel", + "numpy", + "pandas >= 1.3.5", + "pyyaml" + ], + include_package_data=True, + ext_modules=[], + zip_safe=False) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/advisor.txt b/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/advisor.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c4825e28ebde12b43ad7e46bf05820929c88f8d --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/advisor.txt @@ -0,0 +1,3 @@ +Line: NA +Suspect Nodes: NA +Expert Advice: All data in comparison result meets the accuracy requirements. 
diff --git a/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/compare_result_20230703104808.csv b/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/compare_result_20230703104808.csv new file mode 100644 index 0000000000000000000000000000000000000000..a7742ff3fd0863fa157dbabebee252aea6b70888 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/compare_result_20230703104808.csv @@ -0,0 +1,9 @@ +NPU Name,Bench Name,NPU Tensor Dtype,Bench Tensor Dtype,NPU Tensor Shape,Bench Tensor Shape,Cosine,MaxAbsErr,NPU max,NPU min,NPU mean,Bench max,Bench min,Bench mean,Accuracy Reached or Not,Err_message +Functional_linear_0_forward_input.0,Functional_linear_0_forward_input.0,torch.float32,torch.float32,"[3, 2]","[3, 2]",1.0,0.000000,1.948258399963379,-1.0052297115325928,-0.2003595232963562,1.948258399963379,-1.0052297115325928,-0.2003595232963562,Yes, +Functional_linear_0_forward_input.1,Functional_linear_0_forward_input.1,torch.float32,torch.float32,"[3, 2]","[3, 2]",1.0,0.000000,0.28375449776649475,-0.6661239266395569,-0.2789986729621887,0.28375449776649475,-0.6661239266395569,-0.2789986729621887,Yes, +Functional_linear_0_forward_input.2,Functional_linear_0_forward_input.2,torch.float32,torch.float32,[3],[3],1.0,0.000000,0.2457989901304245,-0.6338542103767395,-0.14437106251716614,0.2457989901304245,-0.6338542103767395,-0.14437106251716614,Yes, +Functional_linear_0_forward_output,Functional_linear_0_forward_output,torch.float32,torch.float32,"[3, 3]","[3, 3]",1.0,0.000000,0.8278868794441223,-0.8729169964790344,0.16790540516376495,0.8278868794441223,-0.8729169964790344,0.16790540516376495,Yes, +Torch_relu_0_forward_input.0,Torch_relu_0_forward_input.0,torch.float32,torch.float32,"[3, 3]","[3, 3]",1.0,0.000000,0.8278868794441223,-0.8729169964790344,0.16790540516376495,0.8278868794441223,-0.8729169964790344,0.16790540516376495,Yes, +Torch_relu_0_forward_output,Torch_relu_0_forward_output,torch.float32,torch.float32,"[3, 3]","[3, 
3]",1.0,0.000000,0.8278868794441223,0.0,0.31367552280426025,0.8278868794441223,0.0,0.31367552280426025,Yes, +Functional_relu_0_forward_input.0,Functional_relu_0_forward_input.0,torch.float32,torch.float32,"[3, 3]","[3, 3]",1.0,0.000000,0.8278868794441223,-0.8729169964790344,0.16790540516376495,0.8278868794441223,-0.8729169964790344,0.16790540516376495,Yes, +Functional_relu_0_forward_output,Functional_relu_0_forward_output,torch.float32,torch.float32,"[3, 3]","[3, 3]",1.0,0.000000,0.8278868794441223,0.0,0.31367552280426025,0.8278868794441223,0.0,0.31367552280426025,Yes, diff --git a/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/compare_result_without_accuracy.csv b/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/compare_result_without_accuracy.csv new file mode 100644 index 0000000000000000000000000000000000000000..404af78ec03f497f91dc7fcfc7c6ab0e855e7e7b --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/compare_result_without_accuracy.csv @@ -0,0 +1,9 @@ +NPU Name,Bench Name,NPU Tensor Dtype,Bench Tensor Dtype,NPU Tensor Shape,Bench Tensor Shape,Cosine,MaxAbsErr,NPU max,NPU min,NPU mean,Bench max,Bench min,Bench mean,Accuracy Reached or Not,Err_message +,Functional_linear_0_forward_input.0,torch.float32,torch.float32,"[3, 2]","[3, 2]",1,0,1.9482584,-1.005229712,-0.200359523,1.9482584,-1.005229712,-0.200359523,, +,Functional_linear_0_forward_input.1,torch.float32,torch.float32,"[3, 2]","[3, 2]",1,0,0.283754498,-0.666123927,-0.278998673,0.283754498,-0.666123927,-0.278998673,, +,Functional_linear_0_forward_input.2,torch.float32,torch.float32,[3],[3],1,0,0.24579899,-0.63385421,-0.144371063,0.24579899,-0.63385421,-0.144371063,, +,Functional_linear_0_forward_output,torch.float32,torch.float32,"[3, 3]","[3, 3]",1,0,0.827886879,-0.872916996,0.167905405,0.827886879,-0.872916996,0.167905405,, +,Torch_relu_0_forward_input.0,torch.float32,torch.float32,"[3, 3]","[3, 
3]",1,0,0.827886879,-0.872916996,0.167905405,0.827886879,-0.872916996,0.167905405,, +,Torch_relu_0_forward_output,torch.float32,torch.float32,"[3, 3]","[3, 3]",1,0,0.827886879,0,0.313675523,0.827886879,0,0.313675523,, +,Functional_relu_0_forward_input.0,torch.float32,torch.float32,"[3, 3]","[3, 3]",1,0,0.827886879,-0.872916996,0.167905405,0.827886879,-0.872916996,0.167905405,, +,Functional_relu_0_forward_output,torch.float32,torch.float32,"[3, 3]","[3, 3]",1,0,0.827886879,0,0.313675523,0.827886879,0,0.313675523,, diff --git a/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/npu_test.pkl b/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/npu_test.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2e00b07b7c97e9cdb497bc63dd7eef8063388807 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/resources/compare/npu_test.pkl @@ -0,0 +1,8 @@ +["Functional_linear_0_forward_input.0", 1, [], "torch.float32", [3, 2], [1.948258399963379, -1.0052297115325928, -0.2003595232963562]] +["Functional_linear_0_forward_input.1", 1, [], "torch.float32", [3, 2], [0.28375449776649475, -0.6661239266395569, -0.2789986729621887]] +["Functional_linear_0_forward_input.2", 1, [], "torch.float32", [3], [0.2457989901304245, -0.6338542103767395, -0.14437106251716614]] +["Functional_linear_0_forward_output", 1, [], "torch.float32", [3, 3], [0.8278868794441223, -0.8729169964790344, 0.16790540516376495]] +["Torch_relu_0_forward_input.0", 1, [], "torch.float32", [3, 3], [0.8278868794441223, -0.8729169964790344, 0.16790540516376495]] +["Torch_relu_0_forward_output", 1, [], "torch.float32", [3, 3], [0.8278868794441223, 0.0, 0.31367552280426025]] +["Functional_relu_0_forward_input.0", 1, [], "torch.float32", [3, 3], [0.8278868794441223, -0.8729169964790344, 0.16790540516376495]] +["Functional_relu_0_forward_output", 1, [], "torch.float32", [3, 3], [0.8278868794441223, 0.0, 0.31367552280426025]] diff --git a/debug/accuracy_tools/ptdbg_ascend/test/run_test.sh 
b/debug/accuracy_tools/ptdbg_ascend/test/run_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..6da8c4da3d7cdaf37926d05164cb96944e200a61 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/run_test.sh @@ -0,0 +1,42 @@ +#!/bin/bash +CUR_DIR=$(dirname $(readlink -f $0)) +TOP_DIR=${CUR_DIR}/.. +TEST_DIR=${TOP_DIR}/"test" +SRC_DIR=${TOP_DIR}/"src"/"python" + +clean() { + cd ${TEST_DIR} + + if [ -e ${TEST_DIR}/"report" ]; then + rm -r ${TEST_DIR}/"report" + echo "remove last ut_report successfully." + fi + + if [ -e ${SRC_DIR}/"build" ]; then + cd ${SRC_DIR} + rm -r build + rm -r ptdbg_ascend.egg-info + echo "remove last build cache." + fi + + if [ -e ${SRC_DIR}/"ptdbg_ascend"/"common"/"version.py" ]; then + rm ${SRC_DIR}/"ptdbg_ascend"/"common"/"version.py" + echo "remove last generated 'version.py'." + fi +} + +run_ut() { + export PYTHONPATH=${SRC_DIR}:${PYTHONPATH} && python3 run_ut.py +} + +main() { + clean + if [ "$1"x == "clean"x ]; then + return 0 + fi + + cd ${SRC_DIR} && python3 setup.py build + cd ${TEST_DIR} && run_ut +} + +main $@ diff --git a/debug/accuracy_tools/ptdbg_ascend/test/run_ut.py b/debug/accuracy_tools/ptdbg_ascend/test/run_ut.py new file mode 100644 index 0000000000000000000000000000000000000000..5972e73d2762a2ac981fdcf9f870aa761727c1c4 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/run_ut.py @@ -0,0 +1,41 @@ +import os +import shutil +import subprocess +import sys + +def run_ut(): + cur_dir = os.path.realpath(os.path.dirname(__file__)) + top_dir = os.path.realpath(os.path.dirname(cur_dir)) + ut_path = os.path.join(cur_dir, "ut/") + src_dir = os.path.join(top_dir, "src/python") + report_dir = os.path.join(cur_dir, "report") + + if os.path.exists(report_dir): + shutil.rmtree(report_dir) + + os.makedirs(report_dir) + + cmd = ["python3", "-m", "pytest", ut_path, "--junitxml=" + report_dir + "/final.xml", + "--cov=" + src_dir, "--cov-branch", "--cov-report=xml:" + report_dir + "/coverage.xml"] + + 
result_ut = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + while result_ut.poll() is None: + line = result_ut.stdout.readline().strip() + if line: + print(line) + + ut_flag = False + if result_ut.returncode == 0: + ut_flag = True + print("run ut successfully.") + else: + print("run ut failed.") + + return ut_flag + +if __name__=="__main__": + if run_ut(): + sys.exit(0) + else: + sys.exit(1) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/overflow/test_overflow_check.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/overflow/test_overflow_check.py new file mode 100644 index 0000000000000000000000000000000000000000..ced2146ce1f04461c75067055ffc5faab25f2903 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/overflow/test_overflow_check.py @@ -0,0 +1,53 @@ +# coding=utf-8 +import os +import pytest +import unittest +from ptdbg_ascend.overflow_check import overflow_check +from ptdbg_ascend.overflow_check import utils +from ptdbg_ascend.overflow_check.utils import OverFlowUtil, dump_overflow + +ON = "ON" +OFF = "OFF" +ERROR_PID = 1 + + +class TestUtilsMethods(unittest.TestCase): + + def test_check_overflow_environment_1(self): + utils.set_overflow_check_switch(OFF, OFF) + OverFlowUtil.get_overflow_check_switch() + res = overflow_check.check_overflow_environment(ERROR_PID) + self.assertEqual(res, False) + + def test_check_overflow_environment_2(self): + utils.set_overflow_check_switch(ON, ON) + OverFlowUtil.get_overflow_check_switch() + res = overflow_check.check_overflow_environment(ERROR_PID) + self.assertEqual(res, False) + + def test_check_overflow_environment_3(self): + utils.set_overflow_check_switch(ON, ON) + OverFlowUtil.get_overflow_check_switch() + pid = os.getpid() + overflow_check.is_gpu = True + res = overflow_check.check_overflow_environment(pid) + self.assertEqual(res, False) + + def test_check_overflow_environment_4(self): + utils.set_overflow_check_switch(ON, ON) + 
OverFlowUtil.get_overflow_check_switch() + pid = os.getpid() + overflow_check.is_gpu = False + overflow_check.backward_init_status = True + res = overflow_check.check_overflow_environment(pid) + self.assertEqual(res, False) + + def test_check_overflow_environment_5(self): + utils.set_overflow_check_switch(ON, ON) + OverFlowUtil.get_overflow_check_switch() + pid = os.getpid() + overflow_check.is_gpu = False + overflow_check.backward_init_status = False + res = overflow_check.check_overflow_environment(pid) + self.assertEqual(res, True) + diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/overflow/test_overflow_utils.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/overflow/test_overflow_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f084ca211304c331a225b14119dc646b6cd09273 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/overflow/test_overflow_utils.py @@ -0,0 +1,58 @@ +# coding=utf-8 +import pytest +import unittest +from ptdbg_ascend.overflow_check import utils +from ptdbg_ascend.overflow_check.utils import OverFlowUtil, dump_overflow + +ON = "ON" +OFF = "OFF" + + +class TestUtilsMethods(unittest.TestCase): + + def test_set_overflow_check_switch_error1(self): + with pytest.raises(Exception) as error: + res = OverFlowUtil.set_overflow_check_switch("abc") + self.assertEqual(error.type, TypeError) + + def test_set_overflow_check_switch_error2(self): + with pytest.raises(Exception) as error: + res = utils.set_overflow_check_switch("abc") + self.assertEqual(error.type, AssertionError) + + def test_set_overflow_check_switch_error3(self): + with pytest.raises(Exception) as error: + res = utils.set_overflow_check_switch(ON, "abc") + self.assertEqual(error.type, AssertionError) + + def test_OverFlowUtil_set_overflow_check_switch(self): + OverFlowUtil.set_overflow_check_switch(ON, OFF) + self.assertEqual(OverFlowUtil.overflow_check_switch, ON) + self.assertEqual(OverFlowUtil.overflow_filter_switch, OFF) + + def 
test_get_overflow_check_switch(self): + res = OverFlowUtil.get_overflow_check_switch() + self.assertEqual(res, True) + + def test_inc_overflow_dump_times(self): + OverFlowUtil.inc_overflow_dump_times() + self.assertEqual(OverFlowUtil.real_overflow_dump_times, 1) + + def test_check_overflow_dump_times(self): + res = OverFlowUtil.check_overflow_dump_times(100) + self.assertEqual(res, True) + + def test_set_overflow_check_switch_success1(self): + utils.set_overflow_check_switch(OFF, OFF) + self.assertEqual(OverFlowUtil.overflow_check_switch, OFF) + self.assertEqual(OverFlowUtil.overflow_filter_switch, OFF) + + def test_set_overflow_check_switch_success2(self): + utils.set_overflow_check_switch(ON) + self.assertEqual(OverFlowUtil.overflow_check_switch, ON) + self.assertEqual(OverFlowUtil.overflow_filter_switch, ON) + + def test_set_overflow_check_switch_success3(self): + utils.set_overflow_check_switch(ON, ON) + self.assertEqual(OverFlowUtil.overflow_check_switch, ON) + self.assertEqual(OverFlowUtil.overflow_filter_switch, ON) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_acc_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..7c28922a740cd320e4d05b9cf44f1b647ae4eda0 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_acc_compare.py @@ -0,0 +1,135 @@ +# coding=utf-8 +import unittest +import numpy as np +import os +from ptdbg_ascend.compare import acc_compare as compare +from ptdbg_ascend.common.utils import CompareConst + + +npu_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'],\ + 'input_struct': [('torch.float32', [1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), ('torch.float32', [16])],\ + 'output_struct': [('torch.float32', [1, 16, 28, 28])], 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], \ + 
[0.19919930398464203, -0.19974489510059357, 0.006269412115216255], [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} +bench_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'],\ + 'input_struct': [('torch.float32', [1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), ('torch.float32', [16])],\ + 'output_struct': [('torch.float32', [1, 16, 28, 28])], 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], \ + [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} +tensor_list = [['Functional_conv2d_0_forward_input.0', 1, [], 'torch.float32', [1, 1, 28, 28], [3.029174327850342, -2.926689624786377, -0.06619918346405029]],\ + ['Functional_conv2d_0_forward_input.1', 1, [], 'torch.float32', [16, 1, 5, 5], [0.19919930398464203, -0.19974489510059357, 0.006269412115216255]], \ + ['Functional_conv2d_0_forward_input.2', 1, [], 'torch.float32', [16], [0.19734230637550354, -0.18177609145641327, 0.007903944700956345]],\ + ['Functional_conv2d_0_forward_output', 1, [], 'torch.float32', [1, 16, 28, 28], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]]] +result_op_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'], \ +'input_struct': [('torch.float32', [1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), ('torch.float32', [16])], \ +'output_struct': [('torch.float32', [1, 16, 28, 28])], 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], \ +[0.19734230637550354, 
-0.18177609145641327, 0.007903944700956345], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} + +o_result = [['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.0', 'torch.float32', 'torch.float32', [1, 1, 28, 28], [1, 1, 28, 28], ' ', ' ', ' ', 3.029174327850342, -2.926689624786377, -0.06619918346405029, 3.029174327850342, -2.926689624786377, -0.06619918346405029, 'Yes', ''], ['Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.1', 'torch.float32', 'torch.float32', [16, 1, 5, 5], [16, 1, 5, 5], ' ', ' ', ' ', 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 'Yes', ''], ['Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_input.2', 'torch.float32', 'torch.float32', [16], [16], ' ', ' ', ' ', 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 'Yes', ''], ['Functional_conv2d_0_forward_output', 'Functional_conv2d_0_forward_output', 'torch.float32', 'torch.float32', [1, 16, 28, 28], [1, 16, 28, 28], ' ', ' ', ' ', 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 'Yes', '']] + +class TestUtilsMethods(unittest.TestCase): + def test_correct_data(self): + input_1 = 'NAN' + result_1 = compare.correct_data(input_1) + self.assertEqual(result_1, 'NAN') + input_2 = '0.99999' + result_2 = compare.correct_data(input_2) + self.assertEqual(result_2, '0.99999') + input_3 = '0.999991' + result_3 = compare.correct_data(input_3) + self.assertEqual(result_3, '1.0') + + def test_cosine_similarity_when_all_result_less_than_epsilon(self): + n_value = np.array([0, 0, 0]) + b_value = np.array([0, 0, 0]) + result, message = compare.cosine_similarity(n_value, b_value) + self.assertEqual(result, '1.0') + self.assertEqual(message, '') + + def 
test_cosine_similarity_when_only_npu_result_less_than_epsilon(self): + n_value = np.array([0, 0, 0]) + b_value = np.array([1, 2, 3]) + result, message = compare.cosine_similarity(n_value, b_value) + self.assertEqual(result, CompareConst.NAN) + self.assertEqual(message, 'Cannot compare by Cosine Similarity, All the data is Zero in npu dump data.') + + def test_cosine_similarity_when_only_bench_result_less_than_epsilon(self): + n_value = np.array([1, 2, 3]) + b_value = np.array([0, 0, 0]) + result, message = compare.cosine_similarity(n_value, b_value) + self.assertEqual(result, CompareConst.NAN) + self.assertEqual(message, 'Cannot compare by Cosine Similarity, All the data is Zero in Bench dump data.') + + def test_cosine_similarity_when_all_result_greater_than_epsilon_with_no_nan(self): + n_value = np.array([1, 2, 3]) + b_value = np.array([1, 2, 3]) + result, message = compare.cosine_similarity(n_value, b_value) + + self.assertEqual(result, '1.0') + self.assertEqual(message, '') + + def test_cosine_similarity_when_all_result_greater_than_epsilon_with_nan(self): + n_value = np.array([1, 2, np.nan]) + b_value = np.array([1, 2, 3]) + result, message = compare.cosine_similarity(n_value, b_value) + self.assertEqual(result, CompareConst.NAN) + self.assertEqual(message, 'Cannot compare by Cosine Similarity, the dump data has NaN.') + + def test_get_rmse_when_rmse_is_nan(self): + n_value = np.array([1, 2, np.nan]) + b_value = np.array([1, 2, 3]) + rmse, message = compare.get_rmse(n_value, b_value) + self.assertEqual(rmse, CompareConst.NAN) + self.assertEqual(message, "") + + def test_get_mape_when_mape_is_nan(self): + n_value = np.array([1, 2, np.nan]) + b_value = np.array([1, 2, 3]) + mape, message = compare.get_mape(n_value, b_value) + self.assertEqual(mape, CompareConst.NAN) + self.assertEqual(message, "") + + def test_get_max_relative_err_when_max_relative_is_nan(self): + n_value = np.array([1, 2, np.nan]) + b_value = np.array([1, 2, 3]) + max_relative_err, message = 
compare.get_max_relative_err(n_value, b_value) + self.assertEqual(max_relative_err, CompareConst.NAN) + self.assertEqual(message, 'Cannot compare by MaxRelativeError, the data contains nan in dump data.') + + def test_get_max_relative_err_when_max_relative_is_not_nan(self): + n_value = np.array([1, 2, 3]) + b_value = np.array([1, 2, 3]) + max_relative_err, message = compare.get_max_relative_err(n_value, b_value) + self.assertEqual(max_relative_err, "0.000000") + self.assertEqual(message, "") + + def test_check_op(self): + fuzzy_match = False + result = compare.check_op(npu_dict, bench_dict, fuzzy_match) + self.assertEqual(result, True) + + def test_merge_tensor(self): + op_dict = compare.merge_tensor(tensor_list) + self.assertEqual(op_dict, result_op_dict) + + def test_read_op(self): + base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + pkl_dir = os.path.join(base_dir, "resources/compare/npu_test.pkl") + + npu_ops_queue = [] + npu_pkl_handle = open(pkl_dir, "r") + stack_mode = False + result = compare.read_op(npu_ops_queue, npu_pkl_handle, stack_mode) + self.assertEqual(result, True) + + + def test_match_op(self): + fuzzy_match = False + a, b = compare.match_op([npu_dict], [bench_dict], fuzzy_match) + self.assertEqual(a, 0) + self.assertEqual(b, 0) + + def test_get_accuracy(self): + result = [] + compare.get_accuracy(result, npu_dict, bench_dict) + + self.assertEqual(result, o_result) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py new file mode 100644 index 0000000000000000000000000000000000000000..0bee5d4afbe0782e58d12ef04b8b243a7a5084bf --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
+import os +import shutil +import unittest +from ptdbg_ascend.advisor.advisor import Advisor +from ptdbg_ascend.common.utils import CompareException + + +class TestAdvisor(unittest.TestCase): + def setUp(self) -> None: + os.makedirs("test_result/output", exist_ok=True) + self.output_path = os.path.abspath("test_result/output") + + def tearDown(self) -> None: + shutil.rmtree("test_result/", ignore_errors=True) + + def test_analysis_when_csv_path_is_not_exist(self): + advisor = Advisor("resources/compare/test.pkl", self.output_path) + self.assertRaises(CompareException, advisor.analysis) + + def test_analysis_when_csv_path_is_invalid(self): + advisor = Advisor("resources/compare/npu_test_1.pkl", self.output_path) + self.assertRaises(CompareException, advisor.analysis) + + def test_analysis_when_csv_is_valid(self): + advisor = Advisor("resources/compare/compare_result_20230703104808.csv", self.output_path) + advisor.analysis() + filenames = os.listdir(self.output_path) + self.assertEqual(len(filenames), 1) + + def test_analysis_when_accuracy_and_npu_name_not_in_csv(self): + advisor = Advisor("resources/compare/compare_result_without_accuracy.csv", self.output_path) + self.assertRaises(AttributeError, advisor.analysis) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor_result.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor_result.py new file mode 100644 index 0000000000000000000000000000000000000000..1c840391591526541784a2456ee1d7b3249d3324 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_advisor_result.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
+import difflib +import os +import shutil +import unittest +from ptdbg_ascend.advisor.advisor import Advisor + + +class TestAdvisor(unittest.TestCase): + def setUp(self) -> None: + os.makedirs("test_result/output", exist_ok=True) + self.output_path = os.path.abspath("test_result/output") + self.has_error = False + + def tearDown(self) -> None: + shutil.rmtree("test_result/", ignore_errors=True) + + def test_advisor_summary_file(self): + advisor = Advisor("resources/compare/compare_result_20230703104808.csv", self.output_path) + advisor.analysis() + filenames = os.listdir(self.output_path) + for filename in filenames: + filename = os.path.join(self.output_path, filename) + self.result_check("resources/compare/advisor.txt", filename) + self.assertFalse(self.has_error) + + def result_check(self, standard_file, output_file): + with open(standard_file, 'r', encoding='utf-8') as st_file: + standard_content = st_file.read().splitlines() + with open(output_file, 'r', encoding='utf-8') as out_file: + output_content = out_file.read().splitlines() + result = list(difflib.unified_diff(standard_content, output_content, n=0)) + if result: + print('\n\n-------------------------------------------------------------------------', flush=True) + print(f'[ERROR] {output_file.replace(self.output_path, "")} advisor summary are inconsistent.', + flush=True) + print('\n'.join(result), flush=True) + print('-------------------------------------------------------------------------', flush=True) + self.has_error = True diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_common_util.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_common_util.py new file mode 100644 index 0000000000000000000000000000000000000000..5fc5ae51ea7828a2dd683bf9baf4304026b33ee0 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_common_util.py @@ -0,0 +1,87 @@ +# coding=utf-8 +import unittest +import time +from datetime import datetime, timezone +from ptdbg_ascend.common import utils as common 
+#from ptdbg_ascend.common import CompareException + +class TestCommonUtilsMethods(unittest.TestCase): + + def test_VersionCheck(self): + V0_1 = "0.1" + V1_8 = "1.8" + V1_11 = "1.11" + V2_0 = "2.0" + V2_1 = "2.1" + version_check = common.VersionCheck + self.assertFalse(version_check.check_torch_version(V0_1)) + self.assertTrue(version_check.check_torch_version(V1_8) or version_check.check_torch_version(V1_11) or version_check.check_torch_version(V2_0) or version_check.check_torch_version(V2_1)) + + def test_check_mode_valid(self): + mode_check = common.check_mode_valid + self.assertEqual(mode_check("all"), None) + self.assertEqual(mode_check("list",scope=["Tensor_permute_1_forward", "Tensor_transpose_2_forward", "Torch_relu_3_backward"]), None) + self.assertEqual(mode_check("range", scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]), None) + self.assertEqual(mode_check("stack",scope=["Tensor_abs_1_forward", "Tensor_transpose_3_forward"]), None) + self.assertEqual(mode_check("acl",scope=["Tensor_permute_1_forward"]), None) + self.assertEqual(mode_check("api_list",api_list=["relu"]), None) + self.assertEqual(mode_check("api_stack"), None) + self.assertRaises(common.CompareException, mode_check, "api_stack_123") + + def test_parse_arg_value(self): + data = [[1, 2, 4, 8]] + self.assertEqual(common.parse_arg_value("1,2,4,8"), data) + + def test_parse_value_by_comma(self): + data = [1, 2, 4, 8] + self.assertEqual(common.parse_value_by_comma("1,2,4,8"), data) + + def test_get_data_len_by_shape(self): + getshape = common.get_data_len_by_shape + data = [1, 2, 4, 8] + self.assertEqual(getshape(data), 64) + data = [-1, 2, 4, 8] + self.assertEqual(getshape(data), -1) + + def test_add_time_as_suffix(self): + name = "op_cmp" + csv_name = '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + self.assertEqual(common.add_time_as_suffix(name), csv_name) + + def test_get_time(self): + time = 
datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + self.assertEqual(common.get_time(), time) + + def test_format_value(self): + value = 12345.6789 + format_value = '{:.6f}'.format(value) + self.assertEqual(common.format_value(value), format_value) + + def test_modify_dump_path(self): + dump_path = "/usr/dump" + mode = "api_stack" + self.assertEqual(common.modify_dump_path(dump_path, mode), "/usr/api_stack_dump") + + def test_create_directory(self): + pass + + def test_execute_command(self): + pass + + def test_save_numpy_data(self): + pass + + def test_torch_device_guard(self): + pass + + def test_seed_all(self): + pass + + def test_get_process_rank(self): + pass + + def test_check_file_size(self): + pass + + def test_get_dump_data_path(self): + pass \ No newline at end of file diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_hooks.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..7874d3c2fa947dedcdd28c5f463ab119e3fb712c --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_hooks.py @@ -0,0 +1,71 @@ +# coding=utf-8 +import os +import unittest +from ptdbg_ascend.dump import utils as hooks + + +class TestUtilsMethods(unittest.TestCase): + + def test_set_dump_switch_only_set_switch_as_on(self): + dump_count = hooks.dump_count + dump_util = hooks.DumpUtil + switch_on = "ON" + mode_all = "all" + hooks.set_dump_switch(switch_on) + self.assertEqual(dump_util.dump_switch, switch_on) + self.assertEqual(dump_util.dump_switch_mode, mode_all) + self.assertTrue(dump_util.dump_init_enable) + self.assertEqual(dump_util.dump_switch_scope, []) + self.assertEqual(dump_util.dump_api_list, []) + self.assertEqual(dump_util.dump_filter_switch, switch_on) + self.assertEqual(dump_count, 0) + + def test_set_dump_switch_mode_is_list(self): + scope_list = ["Tensor_permute_1_forward", "Tensor_transpose_2_forward"] + dump_util = hooks.DumpUtil + hooks.set_dump_switch("ON", 
mode="list", scope=scope_list) + self.assertEqual(dump_util.dump_switch_mode, "list") + self.assertEqual(dump_util.dump_switch_scope, scope_list) + + def test_set_dump_switch_mode_is_range(self): + scope_list = ["Tensor_permute_1_forward", "Tensor_transpose_3_forward"] + dump_util = hooks.DumpUtil + hooks.set_dump_switch("ON", mode="range", scope=scope_list) + self.assertEqual(dump_util.dump_switch_mode, "range") + self.assertEqual(dump_util.dump_switch_scope, scope_list) + + def test_set_dump_switch_mode_is_stack(self): + scope_list = ["Tensor_abs_1_forward", "Tensor_transpose_3_forward"] + dump_util = hooks.DumpUtil + hooks.set_dump_switch("ON", mode="stack", scope=scope_list) + self.assertEqual(dump_util.dump_switch_mode, "stack") + self.assertEqual(dump_util.dump_switch_scope, scope_list) + + def test_set_dump_switch_mode_is_api_list(self): + api_list = ["Transpose", "Relu", "triu"] + lower_api_list = ["transpose", "relu", "triu"] + dump_util = hooks.DumpUtil + hooks.set_dump_switch("ON", mode="api_list", api_list=api_list) + self.assertEqual(dump_util.dump_switch_mode, "api_list") + self.assertEqual(dump_util.dump_api_list, lower_api_list) + + def test_set_dump_switch_mode_is_acl(self): + scope_list = ["Tensor_transpose_3_backward"] + replace_scope = ["Tensor_transpose_3_forward"] + dump_util = hooks.DumpUtil + hooks.set_dump_switch("ON", mode="acl", scope=scope_list) + self.assertEqual(dump_util.dump_switch_mode, "acl") + self.assertEqual(dump_util.dump_switch_scope, replace_scope) + + def test_set_dump_filter_switch_off(self): + dump_util = hooks.DumpUtil + hooks.DumpUtil.dump_path='/home/dump_path/ptdbg_dump_v3.2/rank0' + hooks.set_dump_switch("ON", filter_switch="OFF") + self.assertEqual(dump_util.dump_filter_switch, "OFF") + + def test_set_dump_path(self): + dump_util = hooks.DumpUtil + hooks.set_dump_path("resources", dump_tag="dump_data") + output_path = os.path.abspath("resources") + self.assertEqual(dump_util.dump_path, output_path) + 
self.assertEqual(dump_util.dump_dir_tag, "dump_data") diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ae1f3321189bd14898ca10760dfa3670d36b2a36 --- /dev/null +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py @@ -0,0 +1,43 @@ +import unittest +import pytest +import ptdbg_ascend.common.utils as utils + +from ptdbg_ascend.common.utils import CompareException + + +class TestUtilsMethods(unittest.TestCase): + def test_get_api_name_from_matcher(self): + normal_name = "Functional_relu__1_output" + unusual_name = "Functional_norm_layer_1_output" + error_name = "Tensor_onnx::connect_1_input" + api_name_1 = utils.get_api_name_from_matcher(normal_name) + api_name_2 = utils.get_api_name_from_matcher(unusual_name) + api_name_3 = utils.get_api_name_from_matcher(error_name) + self.assertEqual(api_name_1, "relu") + self.assertEqual(api_name_2, "norm_layer") + self.assertEqual(api_name_3, "") + + def test_check_file_or_directory_path_1(self): + file = "list" + with pytest.raises(CompareException) as error: + utils.check_file_or_directory_path(file) + self.assertEqual(error.value.code, CompareException.INVALID_PATH_ERROR) + + def test_check_file_or_directory_path_2(self): + file = "/list/dir" + with pytest.raises(CompareException) as error: + utils.check_file_or_directory_path(file) + self.assertEqual(error.value.code, CompareException.INVALID_PATH_ERROR) + + def test_check_file_size_1(self): + file = "/list/dir" + with pytest.raises(CompareException) as error: + utils.check_file_size(file, 100) + self.assertEqual(error.value.code, CompareException.INVALID_FILE_ERROR) + + def test_check_file_size_2(self): + file = "../run_ut.py" + with pytest.raises(CompareException) as error: + utils.check_file_size(file, 0) + self.assertEqual(error.value.code, CompareException.INVALID_FILE_ERROR) + diff --git 
a/debug/accuracy_tools/ptdbg_ascend/tools/.keep b/debug/accuracy_tools/ptdbg_ascend/tools/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391