diff --git a/debug/accuracy_tools/MANIFEST.in b/debug/accuracy_tools/MANIFEST.in index 21075694b9b36d38ea5d5fea29e00bfbd6d9e538..a0aeb46bbc6b35a3ecb0015e137fadf595a65c4d 100644 --- a/debug/accuracy_tools/MANIFEST.in +++ b/debug/accuracy_tools/MANIFEST.in @@ -1,8 +1,5 @@ recursive-include ptdbg_ascend/src/python/ptdbg_ascend/ *.py recursive-include ptdbg_ascend/src/python/ptdbg_ascend/ *.yaml recursive-include ptdbg_ascend/src/python/ptdbg_ascend/ *.template -recursive-include api_accuracy_checker/ *.py -recursive-include api_accuracy_checker/ *.yaml -recursive-include api_accuracy_checker/ *.json recursive-include atat/ * recursive-exclude api_accuracy_checker/test * diff --git a/debug/accuracy_tools/atat/README.md b/debug/accuracy_tools/atat/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c2ec06e99565d028da6e713c111aa5cb61efece6 --- /dev/null +++ b/debug/accuracy_tools/atat/README.md @@ -0,0 +1,123 @@ +# MindStudio精度调试工具 + +MindStudio精度调试工具(ascend_training_accuracy_tools),简称atat,是ATT工具链下精度调试部分的工具包。主要包括精度预检和精度比对等子工具,当前适配场景包括PyTorch和MindSpore。 + +## 工具安装 + +精度工具合一软件包名称:`ascend_training_accuracy_tools-{version}-py3-none-any.whl` + +1. whl包获取。 + + 请通过下表链接下载工具whl包。 + + | 版本 | 发布日期 | 支持PyTorch版本 | 下载链接 | 校验码 | + | ----- | ---------- | ------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | + | 0.0.2 | 2024-05-23 | 1.11.0/2.0/2.1/2.2 | [ascend_training_accuracy_tools-0.0.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/att/0.0/ascend_training_accuracy_tools-0.0.2-py3-none-any.whl) | 2e35809bde559e9c4d2f16a02ccde779ed9e436bb65fded0b7ebaf6ac2c88d93 | + | 0.0.1 | 2024-03-15 | 1.11.0/2.0/2.1 | [ascend_training_accuracy_tools-0.0.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/att/0.0/ascend_training_accuracy_tools-0.0.1-py3-none-any.whl) | 5801510d4e827e4859bc9a5aca021e4d30c2ea42d60a4c8ad0c2baab1b7782c9 | + +2. whl包校验。 + + 1. 
根据以上下载链接下载whl包到Linux安装环境。 + + 2. 进入whl包所在目录,执行如下命令。 + + ```bash + sha256sum {name}.whl + ``` + + {name}为whl包名称。 + + 若回显呈现对应版本whl包一致的**校验码**,则表示下载了正确的ptdbg_ascend精度工具whl安装包。示例如下: + + ```bash + sha256sum ascend_training_accuracy_tools-0.0.1-py3-none-any.whl + 5801510d4e827e4859bc9a5aca021e4d30c2ea42d60a4c8ad0c2baab1b7782c9 *ascend_training_accuracy_tools-0.0.1-py3-none-any.whl + ``` + +3. 执行如下命令进行安装。 + + ```bash + pip3 install ./ascend_training_accuracy_tools-{version}-py3-none-any.whl + ``` + + 若为覆盖安装,请在命令行末尾增加“--force-reinstall”参数强制安装,例如: + + ```bash + pip3 install ./ascend_training_accuracy_tools-{version}-py3-none-any.whl --force-reinstall + ``` + + 提示如下信息则表示安装成功。 + + ```bash + Successfully installed ascend_training_accuracy_tools-{version} + ``` + + +## 工具使用 + +安装atat工具后,可以按照如下思路选择合适的子工具进行精度调试: + +1. 判断框架场景。 + + 当前支持PyTorch和MindSpore场景。 + +2. 执行数据采集。 + + 工具通过在训练脚本中添加PrecisionDebugger接口的方式对API执行精度数据dump操作。 + + PyTorch场景:详见[PyTorch_精度数据采集](./pytorch/doc/dump.md)。 + + MindSpore场景:详见[MindSpore_精度数据采集](./mindspore/doc/dump.md)。 + +3. 执行精度预检。 + + 在昇腾NPU上扫描用户训练模型中所有API,进行API复现,给出精度情况的诊断和分析。 + + PyTorch场景:详见[PyTorch_精度预检工具](./pytorch/doc/api_accuracy_checker.md)。 + + MindSpore场景:暂不支持。 + +4. 执行精度比对。 + + 进行PyTorch整网API粒度的数据dump、精度比对和溢出检测,从而定位训练场景下的精度问题。 + + PyTorch场景:详见[PyTorch_精度比对工具](./pytorch/doc/ptdbg_ascend_overview.md)。 + + MindSpore场景:暂不支持。 + +5. 执行溢出解析。 + + 溢出解析是在执行精度数据dump时,配置了溢出检测dump,那么对于输入正常但输出存在溢出的API,可以判断是否为正常溢出。 + + PyTorch场景:详见[PyTorch_溢出解析工具](./pytorch/doc/run_overflow_check.md)。(暂不支持) + + MindSpore场景:暂不支持。 + +6. 
执行数据解析。 + + 用于比对前后两次NPU ACL层级dump数据的一致性。 + + PyTorch场景:详见[PyTorch_数据解析工具](./pytorch/doc/parse_tool.md)。 + + MindSpore场景:暂不支持。 + +上述流程中的工具均为atat工具的子工具,使用相同的命令行,格式如下: + +```bash +atat [-h] -f parse run_ut multi_run_ut api_precision_compare run_overflow_check +``` + +| 参数 | 说明 | +| ---- | ---------------------------------------- | +| -f | 框架,当前支持配置为pytorch和mindspore。 | +| -h | 帮助信息。 | + +其他参数在上述对应的工具手册中详细介绍。 + +## 贡献 + +push代码前,请务必保证已经完成了基础功能测试和网络测试。 + +## Release Notes + +Release Notes请参见[RELEASE](RELEASE.md)。 \ No newline at end of file diff --git a/debug/accuracy_tools/atat/atat.py b/debug/accuracy_tools/atat/atat.py index 4f69afd2349f211d1c6e17ab9386de3c8fcd6909..799200ae41c76ac41be8e467910c19a772f9db74 100644 --- a/debug/accuracy_tools/atat/atat.py +++ b/debug/accuracy_tools/atat/atat.py @@ -15,11 +15,11 @@ import argparse import sys -from api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command +from atat.pytorch.api_accuracy_checker.run_ut.run_ut import _run_ut_parser, run_ut_command from ptdbg_ascend.src.python.ptdbg_ascend.parse_tool.cli import parse as cli_parse -from api_accuracy_checker.run_ut.multi_run_ut import prepare_config, run_parallel_ut -from api_accuracy_checker.compare.api_precision_compare import _api_precision_compare_parser, _api_precision_compare_command -from api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, _run_overflow_check_command +from atat.pytorch.api_accuracy_checker.run_ut.multi_run_ut import prepare_config, run_parallel_ut +from atat.pytorch.api_accuracy_checker.compare.api_precision_compare import _api_precision_compare_parser, _api_precision_compare_command +from atat.pytorch.api_accuracy_checker.run_ut.run_overflow_check import _run_overflow_check_parser, _run_overflow_check_command def main(): @@ -30,6 +30,8 @@ def main(): f"For any issue, refer README.md first", ) parser.set_defaults(print_help=parser.print_help) + parser.add_argument('-f', '--framework', required=True, 
choices=['pytorch'], + help='Deep learning framework.') subparsers = parser.add_subparsers() subparsers.add_parser('parse') run_ut_cmd_parser = subparsers.add_parser('run_ut') @@ -46,16 +48,16 @@ def main(): parser.print_help() sys.exit(0) args = parser.parse_args(sys.argv[1:]) - if sys.argv[1] == "run_ut": + if sys.argv[3] == "run_ut": run_ut_command(args) - elif sys.argv[1] == "parse": + elif sys.argv[3] == "parse": cli_parse() - elif sys.argv[1] == "multi_run_ut": + elif sys.argv[3] == "multi_run_ut": config = prepare_config(args) run_parallel_ut(config) - elif sys.argv[1] == "api_precision_compare": + elif sys.argv[3] == "api_precision_compare": _api_precision_compare_command(args) - elif sys.argv[1] == "run_overflow_check": + elif sys.argv[3] == "run_overflow_check": _run_overflow_check_command(args) diff --git a/debug/accuracy_tools/atat/config/README.md b/debug/accuracy_tools/atat/config/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b1ce8886b58d715b8fde56cccf08497237b8310f --- /dev/null +++ b/debug/accuracy_tools/atat/config/README.md @@ -0,0 +1,276 @@ +# 配置文件说明 + +当前配置文件主要为PrecisionDebugger接口执行dump或无标杆比对操作时,调用的配置,当PrecisionDebugger接口未指定该配置文件时,使用该文件的默认配置。配置文件详见[config.json](./config.json)。 + +## 参数说明 + +### **通用配置参数** + +| 参数名 | 说明 | 是否必选 | +| ---------------- | ------------------------------------------------------------ | -------- | +| task | dump的任务类型,str类型。可取值"free_benchmark"(无标杆比对)、"statistics"(仅dump API统计信息,默认值)、"tensor"(dump API统计信息和完全复刻整网的API运行情况的真实数据)、"overflow_check"(溢出检测)。配置示例:"task": "tensor"。根据task参数取值的不同,可以配置不同场景参数,详见:“**task配置为free_benchmark**”,“**task配置为statistics**”,“**task配置为tensor**”,“**task配置为overflow_check**”。 | 否 | +| dump_path | 设置dump数据目录路径,str类型。配置示例:"dump_path": "./dump_path"。 | 是 | +| rank | 指定对某张卡上的数据进行dump,list[int]类型,默认未配置(表示dump所有卡的数据),须根据实际卡的Rank ID配置。应配置为大于等于0的整数,且须根据实际卡的Rank ID配置,若所配置的值大于实际训练所运行的卡的Rank ID,则dump数据为空,比如当前环境Rank ID为0到7,实际训练运行0到3卡,此时若配置Rank ID为4或不存在的10等其他值,此时dump数据为空。配置示例:"rank": 
[1]。 | 否 | +| step | 指定dump某个step的数据,list[int]类型,默认未配置,表示dump所有step数据。dump特定step时,须指定为训练脚本中存在的step。step为list格式,可配置逐个step,例如:"step": [0,1,2]。 | 否 | +| level | dump级别,str类型,根据不同级别dump不同数据。可取值"L0"(dump module模块级精度数据,仅PyTorch场景支持,使用背景详见“**模块级精度数据dump说明**”)、"L1"(dump API级精度数据,默认值)、"L2"(dump kernel级精度数据,仅MindSpore场景支持)、"mix"(dump module模块级和API级精度数据)。配置示例:"level": "L1"。 | 否 | +| seed | 随机种子数,int类型,默认值为:1234。通过固定随机数保证模型的输入或输出一致,可固定的随机数详见“**固定随机数范围**”。配置示例:"seed": 1234。 | 否 | +| is_deterministic | 确定性计算模式,bool类型。可取值true(开启)或false(关闭),默认关闭。配置示例:"is_deterministic": true。
即使在相同的硬件和输入下,API多次执行的结果也可能不同,开启确定性计算是为了保证在相同的硬件和输入下,API多次执行的结果相同。
确定性计算会导致API执行性能降低,建议在发现模型多次执行结果不同的情况下开启。
rnn类算子、ReduceSum、ReduceMean等算子可能与确定性计算存在冲突,若开启确定性计算后多次执行的结果不相同,则考虑存在这些算子。 | 否 | + +### task配置为free_benchmark + +task配置为free_benchmark时,开启**无标杆比对**,在NPU环境下通过对当前模型API的输入添加扰动因子,二次执行,将得到的输出与未添加扰动因子前的输出进行比对,从而得出该模型中可能因迁移等变化导致精度降低的API。 + +无标杆比对优势在于省去了从GPU环境获取dump数据并执行的步骤,也省去了在NPU环境执行dump的操作,降低了精度比对的操作难度。 + +建议配置白名单(配置scope或list)控制少量API进行无标杆比对,一次对过多API执行无标杆比对可能导致显存溢出或性能膨胀。 + +| 参数名 | 说明 | 是否必选 | +| ------------ | ------------------------------------------------------------ | -------- | +| scope | PyTorch场景dump范围,list[str]类型,默认未配置。需要在[]内配置两个模块名或API名,用于锁定区间,dump该范围内的数据。配置示例:"scope": ["MyModuleOP1", "MyModuleOP2"]。与level参数取值相关,level为L0和mix级别时,可配置模块名;level为L1级别时,可配置API名。与list参数不能同时配置。 | 否 | +| list | 自定义dump范围,list[str]类型,默认未配置。包含如下配置方法:
PyTorch场景配置具体的API全称,dump该API数据。配置示例:"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]。
PyTorch场景指定某一类API,dump某一类的API级别输入输出数据。配置示例:"list": ["relu"]。
PyTorch场景配置kernel_api,dump前向和反向API的kernel_api级别数据,其中dump反向API时需要配置**backward_input**参数。前向API配置示例:"list": ["Tensor.permute.1.forward"];反向API配置示例:"list": ["Tensor.permute.1.forward"], "backward_input": "./npu_dump/step0/rank0/Functional.conv2d.1.backward.input.0.pt"。
配置kernel_name,MindSpore为指定算子名dump,PyTorch需要指定API名称以及level为L2(PyTorch场景暂不支持)。
与scope参数不能同时配置。 | 否 | +| fuzz_device | 标杆设备,str类型。可取值:
"npu":无标杆,通过添加扰动因子进行比对,默认值。
"cpu":以CPU为标杆,pert_mode须配置为"to_cpu"。
配置示例:"fuzz_device": "cpu"。 | 否 | +| pert_mode | 无标杆扰动因子,str类型。可取值:
"improve_precision":对输入做升精度,默认值。
"add_noise":对输入增加噪声。
"no_change":不加扰动直接二次执行。
"bit_noise":输入的末位比特翻转。
"change_value":输入的张量首尾值调换。
"to_cpu":在CPU等价执行。
配置示例:"pert_mode": "to_cpu"。 | 否 | +| handler_type | 处理类型,可取值:"check"(进行无标杆比对检查,默认值)、"fix"(将扰动后的API输出结果覆盖原始API输出结果,尝试将Loss曲线恢复正常,该模式下不支持预热if_preheat)。配置示例:"handler_type": "fix"。 | 否 | +| fuzz_level | 无标杆数据dump级别,即选择比对结果文件应输出的表头属性,当前仅支持取值为:"L1"。输出结果详见“**无标杆比对数据存盘格式**”。 | 否 | +| fuzz_stage | 前反向,选择对API前向或反向进行无标杆比对,可取值:"forward"(前向,默认值)、"backward"(反向)。配置示例:"fuzz_stage": "backward"。 | 否 | +| if_preheat | 预热功能,开启功能后工具可以根据每次迭代的输出调整精度算法的阈值,从而更准确找出存在精度问题的API,bool类型。可取值true(开启)或false(关闭),默认关闭。配置示例:"if_preheat": "true"。"handler_type": "fix"不支持预热。 | 否 | +| preheat_step | 开启预热的迭代数量,int类型,默认值为15。须配置"if_preheat": "true"。 | 否 | +| max_sample | 每个算子预热的采样次数的最大阈值,int类型,默认值为20。须配置"if_preheat": "true"。 | 否 | + +#### 无标杆比对数据存盘格式 + +| 字段 | 说明 | +| ------------ | ------------------------------------------------------------ | +| rank | Rank ID,int类型。 | +| type | 前向或反向,string类型。 | +| step | 迭代数,int类型。 | +| max_rel | 输出对比最大相对误差,float类型。 | +| dtype | 输入的dtype,string类型。 | +| shape | 输入的shape,tuple类型。 | +| Output_index | 如果输出为列表或元组,其中一个元素检测不一致,则会有该元素的index,否则为空,int类型。 | + +### task配置为statistics + +| 参数名 | 说明 | 是否必选 | +| ------------ | ------------------------------------------------------------ | -------- | +| scope | PyTorch场景dump范围,list[str]类型,默认未配置。需要在[]内配置两个模块名或API名,用于锁定区间,dump该范围内的数据。配置示例:"scope": ["MyModuleOP1", "MyModuleOP2"]。与level参数取值相关,level为L0和mix级别时,可配置模块名;level为L1级别时,可配置API名。 | 否 | +| list | 自定义dump范围,list[str]类型,默认未配置。包含如下配置方法:
PyTorch场景配置具体的API全称,dump该API数据。配置示例:"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]。
PyTorch场景指定某一类API,dump某一类的API级别输入输出数据。配置示例:"list": ["relu"]。
MindSpore场景配置kernel_name,指定算子名dump。 | 否 | +| data_mode | dump数据过滤,str类型。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的dump文件。配置示例"data_mode": ["backward"]或"data_mode": ["forward", "backward"]。默认为["all"],即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。 | 否 | +| summary_mode | 控制dump文件输出的模式,str类型,可取值md5(dump仅输出包含md5值的dump.json文件,用于验证数据的完整性)、statistics(dump仅输出包含API统计信息的dump.json文件,默认值)。配置示例:"summary_mode": "md5"。 | 否 | + +### task配置为tensor + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| scope | PyTorch场景dump范围,list[str]类型,默认未配置。需要在[]内配置两个模块名或API名,用于锁定区间,dump该范围内的数据。配置示例:"scope": ["MyModuleOP1", "MyModuleOP2"]。与level参数取值相关,level为L0和mix级别时,可配置模块名;level为L1级别时,可配置API名。 | 否 | +| list | 自定义dump范围,list[str]类型,默认未配置。包含如下配置方法:
PyTorch场景配置具体的API全称,dump该API数据。配置示例:"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]。
PyTorch场景指定某一类API,dump某一类的API级别输入输出数据。配置示例:"list": ["relu"]。
PyTorch场景配置kernel_api,dump前向和反向API的kernel_api级别数据,其中dump反向API时需要配置**backward_input**参数。前向API配置示例:"list": ["Tensor.permute.1.forward"];反向API配置示例:"list": ["Tensor.permute.1.forward"], "backward_input": "./npu_dump/step0/rank0/Functional.conv2d.1.backward.input.0.pt"。
配置kernel_name,MindSpore为指定算子名dump,PyTorch需要指定API名称以及level为L2(PyTorch场景暂不支持)。 | 否 | +| backward_input | 该输入文件为首次运行训练dump得到反向API输入的dump文件,str类型,默认未配置。例如若需要dump Functional.conv2d.1 API的反向过程的输入输出,则需要在dump目录下查找命名包含Functional.conv2d.1、backward和input字段的dump文件。配置示例:"backward_input": "./npu_dump/step0/rank0/Functional.conv2d.1.backward.input.0.pt"] | 否 | +| data_mode | dump数据过滤,str类型。可取值"all"、"forward"、"backward"、"input"和"output",表示仅保存dump的数据中文件名包含"forward"、"backward"、"input"和"output"的前向、反向、输入或输出的dump文件。配置示例"data_mode": ["backward"]或"data_mode": ["forward", "backward"]。默认为["all"],即保存所有dump的数据。除了all参数只能单独配置外,其他参数可以自由组合。 | 否 | +| file_format | MindSpore场景真实tensor数据的保存格式,str类型,可取值"bin"(dump的tensor文件为二进制格式)、"npy"(dump的tensor文件后缀为.npy,默认值)。 | 否 | + +### task配置为overflow_check + +| 参数名 | 说明 | 是否必选 | +| ------------- | ------------------------------------------------------------ | -------- | +| overflow_nums | 暂未支持。控制溢出次数,int类型,表示第N次溢出时,停止训练,过程中检测到溢出API对应kernel数据均dump。配置示例:"overflow_nums": 3。默认为1,即检测到1次溢出,训练停止,配置为-1时,表示持续检测溢出直到训练结束。 | 否 | +| check_mode | MindSpore场景kernel级别的溢出检测,str类型,可取值"aicore"(开启AI Core的溢出检测)、"atomic"(开启Atomic的溢出检测)、"all"(开启AI Core和Atomic的溢出检测,默认值)。配置示例"check_mode": "aicore"。 | 否 | + +## 配置示例 + +以下示例包含当前支持的所有场景可配置的完整参数。 + +### PyTorch场景task配置为free_benchmark + +```json +{ + "task": "free_benchmark", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "free_benchmark": { + "scope": [], + "list": ["conv2d"], + "fuzz_device": "npu", + "pert_mode": "improve_precision", + "handler_type": "check", + "fuzz_level": "L1", + "fuzz_stage": "forward", + "if_preheat": false, + "preheat_step": 15, + "max_sample": 20 + } +} +``` + +### PyTorch场景task配置为statistics + +```json +{ + "task": "statistics", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "statistics": { + "scope": [], + "list": [], + "data_mode": ["all"], + 
"summary_mode": "statistics" + } +} +``` + +### PyTorch场景task配置为tensor + +```json +{ + "task": "tensor", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "tensor": { + "scope": [], + "list":[], + "data_mode": ["all"], + "backward_input": "" + } +} +``` + +### PyTorch场景task配置为overflow_check + +```json +{ + "task": "overflow_check", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "overflow_check": { + "overflow_nums": 1 + } +} +``` + +### MindSpore场景task配置为statistics + +```json +{ + "task": "statistics", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "statistics": { + "list": [], + "data_mode": ["all"], + "summary_mode": "statistics" + } +} +``` + +### MindSpore场景task配置为tensor + +```json +{ + "task": "tensor", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "tensor": { + "list":[], + "data_mode": ["all"], + "backward_input": "" + } +} +``` + +### MindSpore场景task配置为overflow_check + +```json +{ + "task": "overflow_check", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "overflow_check": { + "overflow_nums": 1, + "check_mode": "all" + } +} +``` + +## 附录 + +### 模块级精度数据dump说明 + +大模型场景下,通常不是简单的利用自动迁移能力实现GPU到NPU的训练脚本迁移,而是会对NPU网络进行一系列针对性的适配,因此,常常会造成迁移后的NPU模型存在部分子结构不能与GPU原始模型完全对应。模型结构不一致导致API调用类型及数量不一致,若直接按照API粒度进行精度数据dump和比对,则无法完全比对所有的API。 + +本节介绍的功能是对模型中的大粒度模块进行数据dump,使其比对时,对于无法以API粒度比对的模块可以直接以模块粒度进行比对。 + +模块指的是继承自nn.Module类模块,通常情况下这类模块就是一个小模型,可以被视为一个整体,dump数据时以模块为粒度进行dump。 + +### 固定随机数范围 + +seed_all函数可固定随机数的范围如下表。 + +| API | 固定随机数 | +| ---------------------------------------- | --------------------------- | +| os.environ['PYTHONHASHSEED'] = str(seed) | 禁止Python中的hash随机化 | +| 
random.seed(seed) | 设置random随机生成器的种子 | +| np.random.seed(seed) | 设置numpy中随机生成器的种子 | +| torch.manual_seed(seed) | 设置当前CPU的随机种子 | +| torch.cuda.manual_seed(seed) | 设置当前GPU的随机种子 | +| torch.cuda.manual_seed_all(seed) | 设置所有GPU的随机种子 | +| torch_npu.npu.manual_seed(seed) | 设置当前NPU的随机种子 | +| torch_npu.npu.manual_seed_all(seed) | 设置所有NPU的随机种子 | +| torch.backends.cudnn.enable=False | 关闭cuDNN | +| torch.backends.cudnn.benchmark=False | cuDNN确定性地选择算法 | +| torch.backends.cudnn.deterministic=True | cuDNN仅使用确定性的卷积算法 | + +需要保证CPU或GPU以及NPU的模型输入完全一致,dump数据的比对才有意义,seed_all并不能保证模型输入完全一致,如下表所示场景需要保证输入的一致性。 + +| 场景 | 固定方法 | +| --------------- | ------------- | +| 数据集的shuffle | 关闭shuffle。 | +| dropout | 关闭dropout。 | + +关闭shuffle示例: + +```Python +train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size = batch_size, + shuffle = False, + num_workers = num_workers +) +``` + +关闭dropout: + +在使用from ptdbg import *后,工具会自动将torch.nn.functional.dropout、torch.nn.functional.dropout2d、torch.nn.functional.dropout3d、torch.nn.Dropout、torch.nn.Dropout2d、torch.nn.Dropout3d的接口参数p置为0。 diff --git a/debug/accuracy_tools/atat/config/config.json b/debug/accuracy_tools/atat/config/config.json new file mode 100644 index 0000000000000000000000000000000000000000..cf04ee46ffe33de24f46a54a630c4555ac743560 --- /dev/null +++ b/debug/accuracy_tools/atat/config/config.json @@ -0,0 +1,26 @@ +{ + "task": "statistics", + "dump_path": "", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + "tensor": { + "scope": [], + "list":[], + "data_mode": ["all"], + "backward_input": "", + "file_format": "npy" + }, + "statistics": { + "scope": [], + "list":[], + "data_mode": ["all"], + "summary_mode": "statistics" + }, + "overflow_check": { + "overflow_nums": 1, + "check_mode":"all" + } +} \ No newline at end of file diff --git a/debug/accuracy_tools/atat/core/common_config.py b/debug/accuracy_tools/atat/core/common_config.py new file mode 100644 index 
0000000000000000000000000000000000000000..740119b63cea42356187e2fa9d1ba9a58b705ee7
--- /dev/null
+++ b/debug/accuracy_tools/atat/core/common_config.py
@@ -0,0 +1,50 @@
+from .utils import Const
+
+
+# Common configuration shared by every task type (validated on construction).
+class CommonConfig:
+    def __init__(self, json_config):
+        self.task = json_config.get('task')
+        self.dump_path = json_config.get('dump_path')
+        self.rank = json_config.get('rank')
+        self.step = json_config.get('step')
+        self.level = json_config.get('level')
+        self.seed = json_config.get('seed')
+        self.is_deterministic = json_config.get('is_deterministic')
+        self._check_config()
+
+    def _check_config(self):
+        if self.task and self.task not in Const.TASK_LIST:
+            raise Exception("task is invalid")
+        if self.rank is not None and not isinstance(self.rank, list):
+            raise Exception("rank is invalid")
+        if self.step is not None and not isinstance(self.step, list):
+            raise Exception("step is invalid")
+        if self.level and self.level not in Const.LEVEL_LIST:
+            raise Exception("level is invalid")
+        if self.seed is not None and not isinstance(self.seed, int):
+            raise Exception("seed is invalid")
+        if self.is_deterministic is not None and not isinstance(self.is_deterministic, bool):
+            raise Exception("is_deterministic is invalid")
+
+
+# Base class for task-specific configuration sections.
+class BaseConfig:
+    def __init__(self, json_config):
+        self.scope = json_config.get('scope')
+        self.list = json_config.get('list')
+        self.data_mode = json_config.get('data_mode')
+        self.backward_input = json_config.get("backward_input")
+        self.file_format = json_config.get("file_format")
+        self.summary_mode = json_config.get("summary_mode")
+        self.overflow_num = json_config.get("overflow_nums")  # key in config.json/README is "overflow_nums", not "overflow_num"
+        self.check_mode = json_config.get("check_mode")
+
+    def check_config(self):
+        if self.scope is not None and not isinstance(self.scope, list):
+            raise Exception("scope is invalid")
+        if self.list is not None and not isinstance(self.list, list):
+            raise Exception("list is invalid")
+        if self.data_mode is not None and not isinstance(self.data_mode, list):
+            raise Exception("data_mode is invalid")
+    
\ No newline at end of file
diff --git a/debug/accuracy_tools/atat/core/file_check_util.py b/debug/accuracy_tools/atat/core/file_check_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..b10cdd61049ad9a87e91d910e89b121557a58a7f
--- /dev/null
+++ b/debug/accuracy_tools/atat/core/file_check_util.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+# Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" +import os +import re + +from .log import print_warn_log, print_error_log + + +class FileCheckConst: + """ + Class for file check const + """ + READ_ABLE = "read" + WRITE_ABLE = "write" + READ_WRITE_ABLE = "read and write" + DIRECTORY_LENGTH = 4096 + FILE_NAME_LENGTH = 255 + FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$" + PKL_SUFFIX = ".pkl" + NUMPY_SUFFIX = ".npy" + JSON_SUFFIX = ".json" + PT_SUFFIX = ".pt" + CSV_SUFFIX = ".csv" + YAML_SUFFIX = ".yaml" + MAX_PKL_SIZE = 1 * 1024 * 1024 * 1024 + MAX_NUMPY_SIZE = 10 * 1024 * 1024 * 1024 + MAX_JSON_SIZE = 1 * 1024 * 1024 * 1024 + MAX_PT_SIZE = 10 * 1024 * 1024 * 1024 + MAX_CSV_SIZE = 1 * 1024 * 1024 * 1024 + MAX_YAML_SIZE = 10 * 1024 * 1024 + DIR = "dir" + FILE = "file" + DATA_DIR_AUTHORITY = 0o750 + DATA_FILE_AUTHORITY = 0o640 + FILE_SIZE_DICT = { + PKL_SUFFIX: MAX_PKL_SIZE, + NUMPY_SUFFIX: MAX_NUMPY_SIZE, + JSON_SUFFIX: MAX_JSON_SIZE, + PT_SUFFIX: MAX_PT_SIZE, + CSV_SUFFIX: MAX_CSV_SIZE, + YAML_SUFFIX: MAX_YAML_SIZE + } + + +class FileCheckException(Exception): + """ + Class for File Check Exception + """ + NONE_ERROR = 0 + INVALID_PATH_ERROR = 1 + INVALID_FILE_TYPE_ERROR = 2 + INVALID_PARAM_ERROR = 3 + INVALID_PERMISSION_ERROR = 3 + + def __init__(self, code, error_info: str = ""): + super(FileCheckException, self).__init__() + self.code = code + self.error_info = error_info + + def __str__(self): + return self.error_info + + +class FileChecker: + """ + The class for check file. + + Attributes: + file_path: The file or dictionary path to be verified. 
+ path_type: file or dictionary + ability(str): FileCheckConst.WRITE_ABLE or FileCheckConst.READ_ABLE to set file has writability or readability + file_type(str): The correct file type for file + """ + def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True): + self.file_path = file_path + self.path_type = self._check_path_type(path_type) + self.ability = ability + self.file_type = file_type + self.is_script = is_script + + @staticmethod + def _check_path_type(path_type): + if path_type not in [FileCheckConst.DIR, FileCheckConst.FILE]: + print_error_log(f'The path_type must be {FileCheckConst.DIR} or {FileCheckConst.FILE}.') + raise FileCheckException(FileCheckException.INVALID_PARAM_ERROR) + return path_type + + def common_check(self): + """ + 功能:用户校验基本文件权限:软连接、文件长度、是否存在、读写权限、文件属组、文件特殊字符 + 注意:文件后缀的合法性,非通用操作,可使用其他独立接口实现 + """ + check_path_exists(self.file_path) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + check_path_type(self.file_path, self.path_type) + self.check_path_ability() + if self.is_script: + check_path_owner_consistent(self.file_path) + check_path_pattern_vaild(self.file_path) + check_common_file_size(self.file_path) + check_file_suffix(self.file_path, self.file_type) + return self.file_path + + def check_path_ability(self): + if self.ability == FileCheckConst.WRITE_ABLE: + check_path_writability(self.file_path) + if self.ability == FileCheckConst.READ_ABLE: + check_path_readability(self.file_path) + if self.ability == FileCheckConst.READ_WRITE_ABLE: + check_path_readability(self.file_path) + check_path_writability(self.file_path) + + +class FileOpen: + """ + The class for open file by a safe way. + + Attributes: + file_path: The file or dictionary path to be opened. 
+ mode(str): The file open mode + """ + SUPPORT_READ_MODE = ["r", "rb"] + SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"] + SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"] + + def __init__(self, file_path, mode, encoding='utf-8'): + self.file_path = file_path + self.mode = mode + self.encoding = encoding + self._handle = None + + def __enter__(self): + self.check_file_path() + binary_mode = "b" + if binary_mode not in self.mode: + self._handle = open(self.file_path, self.mode, encoding=self.encoding) + else: + self._handle = open(self.file_path, self.mode) + return self._handle + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._handle: + self._handle.close() + + def check_file_path(self): + support_mode = self.SUPPORT_READ_MODE + self.SUPPORT_WRITE_MODE + self.SUPPORT_READ_WRITE_MODE + if self.mode not in support_mode: + print_error_log("File open not support %s mode" % self.mode) + raise FileCheckException(FileCheckException.INVALID_PARAM_ERROR) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + self.check_ability_and_owner() + check_path_pattern_vaild(self.file_path) + if os.path.exists(self.file_path): + check_common_file_size(self.file_path) + + def check_ability_and_owner(self): + if self.mode in self.SUPPORT_READ_MODE: + check_path_exists(self.file_path) + check_path_readability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_WRITE_MODE and os.path.exists(self.file_path): + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_READ_WRITE_MODE and os.path.exists(self.file_path): + check_path_readability(self.file_path) + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + + +def check_link(path): + abs_path = os.path.abspath(path) + if os.path.islink(abs_path): + print_error_log('The file path {} is a soft link.'.format(path)) + 
raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) + + +def check_path_length(path, name_length=None): + file_max_name_length = name_length if name_length else FileCheckConst.FILE_NAME_LENGTH + if len(path) > FileCheckConst.DIRECTORY_LENGTH or \ + len(os.path.basename(path)) > file_max_name_length: + print_error_log('The file path length exceeds limit.') + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) + + +def check_path_exists(path): + if not os.path.exists(path): + print_error_log('The file path %s does not exist.' % path) + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) + + +def check_path_readability(path): + if not os.access(path, os.R_OK): + print_error_log('The file path %s is not readable.' % path) + raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) + + +def check_path_writability(path): + if not os.access(path, os.W_OK): + print_error_log('The file path %s is not writable.' % path) + raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) + + +def check_path_executable(path): + if not os.access(path, os.X_OK): + print_error_log('The file path %s is not executable.' % path) + raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) + + +def check_other_user_writable(path): + st = os.stat(path) + if st.st_mode & 0o002: + _user_interactive_confirm( + 'The file path %s may be insecure because other users have write permissions. ' + 'Do you want to continue?' 
% path) + + +def _user_interactive_confirm(message): + while True: + check_message = input(message + " Enter 'c' to continue or enter 'e' to exit: ") + if check_message == "c": + break + elif check_message == "e": + print_warn_log("User canceled.") + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) + else: + print("Input is error, please enter 'c' or 'e'.") + + +def check_path_owner_consistent(path): + file_owner = os.stat(path).st_uid + if file_owner != os.getuid(): + print_error_log('The file path %s may be insecure because is does not belong to you.' % path) + raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) + + +def check_path_pattern_vaild(path): + if not re.match(FileCheckConst.FILE_VALID_PATTERN, path): + print_error_log('The file path {} contains special characters.'.format(path)) + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) + + +def check_file_size(file_path, max_size): + file_size = os.path.getsize(file_path) + if file_size >= max_size: + _user_interactive_confirm(f'The size of file path {file_path} exceeds {max_size} bytes.' 
+ f'Do you want to continue?') + + +def check_common_file_size(file_path): + if os.path.isfile(file_path): + for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items(): + if file_path.endswith(suffix): + check_file_size(file_path, max_size) + break + + +def check_file_suffix(file_path, file_suffix): + if file_suffix: + if not file_path.endswith(file_suffix): + print_error_log(f"The {file_path} should be a {file_suffix} file!") + raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR) + + +def check_path_type(file_path, file_type): + if file_type == FileCheckConst.FILE: + if not os.path.isfile(file_path): + print_error_log(f"The {file_path} should be a file!") + raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR) + if file_type == FileCheckConst.DIR: + if not os.path.isdir(file_path): + print_error_log(f"The {file_path} should be a dictionary!") + raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR) + + +def create_directory(dir_path): + """ + Function Description: + creating a directory with specified permissions + Parameter: + dir_path: directory path + Exception Description: + when invalid data throw exception + """ + dir_path = os.path.realpath(dir_path) + try: + os.makedirs(dir_path, mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + except OSError as ex: + print_error_log( + 'Failed to create {}.Please check the path permission or disk space .{}'.format(dir_path, str(ex))) + raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) from ex + + +def change_mode(path, mode): + if not os.path.exists(path) or os.path.islink(path): + return + try: + os.chmod(path, mode) + except PermissionError as ex: + print_error_log('Failed to change {} authority. 
{}'.format(path, str(ex)))
+        raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) from ex
+
diff --git a/debug/accuracy_tools/atat/core/log.py b/debug/accuracy_tools/atat/core/log.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9ac8f5edfb18286aff317b5440bb99a92dd2486
--- /dev/null
+++ b/debug/accuracy_tools/atat/core/log.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+# Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+import os
+import time
+import sys
+
+
+def _print_log(level, msg, end='\n'):
+    current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
+    pid = os.getpid()  # process id; os.getgid() returned the *group* id and mislabeled the log prefix
+    print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg, end=end)
+    sys.stdout.flush()
+
+
+def print_info_log(info_msg, end='\n'):
+    """
+    Function Description:
+        print info log.
+    Parameter:
+        info_msg: the info message.
+    """
+    _print_log("INFO", info_msg, end=end)
+
+
+def print_error_log(error_msg):
+    """
+    Function Description:
+        print error log.
+    Parameter:
+        error_msg: the error message.
+    """
+    _print_log("ERROR", error_msg)
+
+
+def print_warn_log(warn_msg):
+    """
+    Function Description:
+        print warn log.
+    Parameter:
+        warn_msg: the warning message.
+ """ + _print_log("WARNING", warn_msg) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/core/utils.py b/debug/accuracy_tools/atat/core/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9e94b374b73fbcc9207d5bef09a5cf8a3a8c1ec4 --- /dev/null +++ b/debug/accuracy_tools/atat/core/utils.py @@ -0,0 +1,689 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import collections +import os +import re +import shutil +import stat +import subprocess +import sys +import time +import json +from json.decoder import JSONDecodeError +from datetime import datetime, timezone +from pathlib import Path +import numpy as np + +from .file_check_util import FileOpen, FileChecker, FileCheckConst + + +device = collections.namedtuple('device', ['type', 'index']) +prefixes = ['api_stack', 'list', 'range', 'acl'] + + +class Const: + """ + Class for const + """ + MODEL_TYPE = ['.onnx', '.pb', '.om'] + DIM_PATTERN = r"^(-?[0-9]+)(,-?[0-9]+)*" + SEMICOLON = ";" + COLON = ":" + EQUAL = "=" + COMMA = "," + DOT = "." 
+ DUMP_RATIO_MAX = 100 + SUMMERY_DATA_NUMS = 256 + FLOAT_EPSILON = np.finfo(float).eps + SUPPORT_DUMP_MODE = ['api', 'acl'] + ON = 'ON' + OFF = 'OFF' + BACKWARD = 'backward' + FORWARD = 'forward' + PRE_FORWARD = "pre_forward" + + # dump mode + ALL = "all" + LIST = "list" + RANGE = "range" + STACK = "stack" + ACL = "acl" + API_LIST = "api_list" + API_STACK = "api_stack" + DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] + AUTO = "auto" + ONLINE_DUMP_MODE = [ALL, LIST, AUTO, OFF] + SUMMARY = "summary" + MD5 = "md5" + SUMMARY_MODE = [ALL, SUMMARY, MD5] + + WRITE_FLAGS = os.O_WRONLY | os.O_CREAT + WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR + + PKL_SUFFIX = ".pkl" + NUMPY_SUFFIX = ".npy" + ONE_GB = 1 * 1024 * 1024 * 1024 + TEN_GB = 10 * 1024 * 1024 * 1024 + FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$' + FILE_NAME_LENGTH = 255 + DIRECTORY_LENGTH = 4096 + DISTRIBUTED_PREFIX_LENGTH = 60 + SUMMARY_COLUMN_NUM = 6 + STACK_COLUMN_NUM = 2 + # env dump path + ASCEND_WORK_PATH = "ASCEND_WORK_PATH" + DUMP_DIR = "dump_data" + + ENV_ENABLE = "1" + ENV_DISABLE = "0" + + MAX_SEED_VALUE = 2**32 - 1 + + INPLACE_LIST = ["broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter", + "_reduce_scatter_base", "_all_gather_base"] + + TASK_LIST = ["tensor", "statistics", "overflow_check", "free_benchmark"] + LEVEL_LIST = ["L0", "L1", "L2", "mix"] + STATISTICS = "statistics" + TENSOR = "tensor" + OVERFLOW_CHECK = "overflow_check" + FREE_BENCHMARK = "free_benchmark" + +class CompareConst: + """ + Class for compare module const + """ + # compare result column name + NPU_NAME = "NPU Name" + BENCH_NAME = "Bench Name" + NPU_DTYPE = "NPU Tensor Dtype" + BENCH_DTYPE = "Bench Tensor Dtype" + NPU_SHAPE = "NPU Tensor Shape" + BENCH_SHAPE = "Bench Tensor Shape" + NPU_MAX = "NPU max" + NPU_MIN = "NPU min" + NPU_MEAN = "NPU mean" + NPU_NORM = "NPU l2norm" + BENCH_MAX = "Bench max" + BENCH_MIN = "Bench min" + BENCH_MEAN = "Bench mean" + BENCH_NORM = "Bench l2norm" + 
MAX_DIFF = "Max diff" + MIN_DIFF = "Min diff" + MEAN_DIFF = "Mean diff" + NORM_DIFF = "L2norm diff" + COSINE = "Cosine" + MAX_ABS_ERR = "MaxAbsErr" + MAX_RELATIVE_ERR = "MaxRelativeErr" + ACCURACY = "Accuracy Reached or Not" + STACK = "NPU_Stack_Info" + ERROR_MESSAGE = "Err_message" + ONE_THOUSANDTH_ERR_RATIO = "One Thousandth Err Ratio" + FIVE_THOUSANDTHS_ERR_RATIO = "Five Thousandths Err Ratio" + NPU_MD5 = "NPU MD5" + BENCH_MD5 = "BENCH MD5" + RESULT = "Result" + + COMPARE_RESULT_HEADER = [ + NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, COSINE, MAX_ABS_ERR, MAX_RELATIVE_ERR, + ONE_THOUSANDTH_ERR_RATIO, FIVE_THOUSANDTHS_ERR_RATIO, + NPU_MAX, NPU_MIN, NPU_MEAN, NPU_NORM, BENCH_MAX, BENCH_MIN, BENCH_MEAN, BENCH_NORM, ACCURACY, ERROR_MESSAGE + ] + + SUMMARY_COMPARE_RESULT_HEADER = [ + NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, MAX_DIFF, MIN_DIFF, MEAN_DIFF, NORM_DIFF, + NPU_MAX, NPU_MIN, NPU_MEAN, NPU_NORM, BENCH_MAX, BENCH_MIN, BENCH_MEAN, BENCH_NORM, RESULT, ERROR_MESSAGE + ] + + MD5_COMPARE_RESULT_HEADER = [ + NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, NPU_MD5, BENCH_MD5, RESULT + ] + + # compare result data + NAN = 'Nan' + SHAPE_UNMATCH = 'shape unmatched' + DTYPE_UNMATCH = 'dtype unmatched' + PASS = 'Pass' + WARNING = 'Warning' + DIFF = 'Different' + + # accuracy standards + COS_THRESHOLD = 0.99 + MAX_ABS_ERR_THRESHOLD = 0.001 + COS_MAX_THRESHOLD = 0.9 + MAX_ABS_ERR_MAX_THRESHOLD = 1 + ACCURACY_CHECK_YES = "Yes" + ACCURACY_CHECK_NO = "No" + ACCURACY_CHECK_UNMATCH = "Unmatched" + + # error message + NO_BENCH = "No bench data matched." 
+ + # compare const + FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble] + + +class CompareException(Exception): + """ + Class for Accuracy Compare Exception + """ + NONE_ERROR = 0 + INVALID_PATH_ERROR = 1 + OPEN_FILE_ERROR = 2 + CLOSE_FILE_ERROR = 3 + READ_FILE_ERROR = 4 + WRITE_FILE_ERROR = 5 + INVALID_FILE_ERROR = 6 + PERMISSION_ERROR = 7 + INDEX_OUT_OF_BOUNDS_ERROR = 8 + NO_DUMP_FILE_ERROR = 9 + INVALID_DATA_ERROR = 10 + INVALID_PARAM_ERROR = 11 + INVALID_DUMP_RATIO = 12 + INVALID_DUMP_FILE = 13 + UNKNOWN_ERROR = 14 + INVALID_DUMP_MODE = 15 + PARSE_FILE_ERROR = 16 + INVALID_COMPARE_MODE = 17 + OVER_SIZE_FILE_ERROR = 18 + INVALID_SUMMARY_MODE = 19 + + def __init__(self, code, error_info: str = ""): + super(CompareException, self).__init__() + self.code = code + self.error_info = error_info + + def __str__(self): + return self.error_info + + +class DumpException(CompareException): + pass + + +class OverflowConst: + """ + Class for Overflow + """ + OVERFLOW_DEBUG_MODE_ENABLE = "OVERFLOW_DEBUG_MODE_ENABLE" + OVERFLOW_ORIGINAL_MODE = 0 + OVERFLOW_DEBUG_MODE = 1 + + +def make_dump_path_if_not_exists(dump_path): + if not os.path.exists(dump_path): + try: + Path(dump_path).mkdir(mode=0o750, exist_ok=True, parents=True) + except OSError as ex: + print_error_log( + 'Failed to create {}.Please check the path permission or disk space .{}'.format(dump_path, str(ex))) + raise CompareException(CompareException.INVALID_PATH_ERROR) from ex + else: + if not os.path.isdir(dump_path): + print_error_log('{} already exists and is not a directory.'.format(dump_path)) + + +def _print_log(level, msg, end='\n'): + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) + pid = os.getpid() + print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg, end=end) + sys.stdout.flush() + + +def print_info_log(info_msg, end='\n'): + """ + Function Description: + print info log. + Parameter: + info_msg: the info message.
+ """ + _print_log("INFO", info_msg, end=end) + + +def print_error_log(error_msg): + """ + Function Description: + print error log. + Parameter: + error_msg: the error message. + """ + _print_log("ERROR", error_msg) + + +def print_warn_log(warn_msg): + """ + Function Description: + print warn log. + Parameter: + warn_msg: the warning message. + """ + _print_log("WARNING", warn_msg) + + +def check_mode_valid(mode, scope=None, api_list=None): + if scope is None: + scope = [] + if api_list is None: + api_list = [] + if not isinstance(scope, list): + raise ValueError("scope param set invalid, it must be a list.") + if not isinstance(api_list, list): + raise ValueError("api_list param set invalid, it must be a list.") + mode_check = { + Const.ALL: lambda: None, + Const.RANGE: lambda: ValueError("set_dump_switch, scope param set invalid, it must be [start, end].") if len(scope) != 2 else None, + Const.LIST: lambda: ValueError("set_dump_switch, scope param set invalid, it should not be an empty list.") if len(scope) == 0 else None, + Const.STACK: lambda: ValueError("set_dump_switch, scope param set invalid, it must be [start, end] or [].") if len(scope) > 2 else None, + Const.ACL: lambda: ValueError("set_dump_switch, scope param set invalid, only one api name is supported in acl mode.") if len(scope) != 1 else None, + Const.API_LIST: lambda: ValueError("Current dump mode is 'api_list', but the content of api_list parameter is empty or invalid.") if len(api_list) < 1 else None, + Const.API_STACK: lambda: None, + } + if mode not in Const.DUMP_MODE: + msg = "Current mode '%s' is not supported.
Please use the field in %s" % \ + (mode, Const.DUMP_MODE) + raise CompareException(CompareException.INVALID_DUMP_MODE, msg) + + if mode_check.get(mode)() is not None: + raise mode_check.get(mode)() + + +def check_switch_valid(switch): + if switch not in ["ON", "OFF"]: + print_error_log("Please set switch with 'ON' or 'OFF'.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + + +def check_dump_mode_valid(dump_mode): + if not isinstance(dump_mode, list): + print_warn_log("Please set dump_mode as a list.") + dump_mode = [dump_mode] + if not all(mode in ["all", "forward", "backward", "input", "output"] for mode in dump_mode): + raise ValueError("Please set dump_mode as a list containing one or more of the following: 'all', 'forward', 'backward', 'input', 'output'.") + if 'input' not in dump_mode and 'output' not in dump_mode: + dump_mode.extend(['input', 'output']) + if 'forward' not in dump_mode and 'backward' not in dump_mode: + dump_mode.extend(['forward', 'backward']) + if 'all' in dump_mode or set(["forward", "backward", "input", "output"]).issubset(set(dump_mode)): + return ["forward", "backward", "input", "output"] + return dump_mode + + +def check_summary_mode_valid(summary_mode): + if summary_mode not in Const.SUMMARY_MODE: + msg = "The summary_mode is not valid" + raise CompareException(CompareException.INVALID_SUMMARY_MODE, msg) + + +def check_summary_only_valid(summary_only): + if not isinstance(summary_only, bool): + print_error_log("Params summary_only only support True or False.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return summary_only + + +def check_compare_param(input_parma, output_path, stack_mode=False, summary_compare=False): # 添加默认值来让不传参时能通过参数检查 + if not (isinstance(input_parma, dict) and isinstance(output_path, str)): + print_error_log("Invalid input parameters") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + check_file_or_directory_path(input_parma.get("npu_pkl_path"), False) + 
check_file_or_directory_path(input_parma.get("bench_pkl_path"), False) + if not summary_compare: + check_file_or_directory_path(input_parma.get("npu_dump_data_dir"), True) + check_file_or_directory_path(input_parma.get("bench_dump_data_dir"), True) + check_file_or_directory_path(output_path, True) + with FileOpen(input_parma.get("npu_pkl_path"), "r") as npu_pkl, \ + FileOpen(input_parma.get("bench_pkl_path"), "r") as bench_pkl: + check_pkl_file(input_parma, npu_pkl, bench_pkl, stack_mode) + + +def is_summary_compare(input_param): + npu_pkl_path = input_param.get("npu_pkl_path", None) + bench_pkl_path = input_param.get("bench_pkl_path", None) + npu_dump_data_dir = input_param.get("npu_dump_data_dir", None) + bench_dump_data_dir = input_param.get("bench_dump_data_dir", None) + if not npu_pkl_path or not bench_pkl_path: + print_error_log(f"Please check the pkl path is valid.") + raise CompareException(CompareException.INVALID_PATH_ERROR) + if not (npu_dump_data_dir and bench_dump_data_dir): + return True + if npu_dump_data_dir and bench_dump_data_dir: + return False + print_error_log(f"Please check the dump data dir is valid.") + raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def is_md5_compare(input_parma): + with FileOpen(input_parma.get("npu_pkl_path"), "r") as npu_pkl: + pkl_lines = npu_pkl.readline() + try: + line = json.loads(pkl_lines) + except JSONDecodeError as err: + raise CompareException(CompareException.INVALID_FILE_ERROR) from err + if len(line) < 3: + return False + if line[2]: + return True + return False + + +def check_configuration_param(stack_mode=False, auto_analyze=True, fuzzy_match=False): + if not (isinstance(stack_mode, bool) and isinstance(auto_analyze, bool) and isinstance(fuzzy_match, bool)): + print_error_log("Invalid input parameters which should be only bool type.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + + +def check_file_or_directory_path(path, isdir=False): + """ + Function Description: + 
check whether the path is valid + Parameter: + path: the path to check + isdir: the path is dir or file + Exception Description: + when invalid data throw exception + """ + if isdir: + path_checker = FileChecker(path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + else: + path_checker = FileChecker(path, FileCheckConst.FILE, FileCheckConst.READ_ABLE) + path_checker.common_check() + + +def _check_pkl(pkl_file_handle, file_name): + tensor_line = pkl_file_handle.readline() + if len(tensor_line) == 0: + print_error_log("dump file {} have empty line!".format(file_name)) + raise CompareException(CompareException.INVALID_DUMP_FILE) + pkl_file_handle.seek(0, 0) + + +def is_starts_with(string, prefix_list): + return any(string.startswith(prefix) for prefix in prefix_list) + + +def check_stack_mode(pkl_fp): + api_prefix = "" + api_pattern = r'\[\"([0-9a-zA-Z_.]+_(for|back)ward)_(in|out)put(\.[0-9]+)?' + is_stack_mode = False + for index, line in enumerate(pkl_fp): + if index == 0: + api_match = re.search(api_pattern, line) + api_prefix = api_match.group(1) + elif api_prefix and line.startswith(f'["{api_prefix}'): + if line.startswith(f'["{api_prefix}_stack_info'): + is_stack_mode = True + break + else: + break + pkl_fp.seek(0, 0) + return is_stack_mode + + +def check_pkl_file(input_param, npu_pkl, bench_pkl, stack_mode): + _check_pkl(npu_pkl, input_param.get("npu_pkl_path")) + _check_pkl(bench_pkl, input_param.get("bench_pkl_path")) + + npu_pkl_stack_mode = check_stack_mode(npu_pkl) + bench_pkl_stack_mode = check_stack_mode(bench_pkl) + + if not npu_pkl_stack_mode and not bench_pkl_stack_mode: + if stack_mode: + print_error_log("The current file does not contain stack information, please turn off the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + elif npu_pkl_stack_mode and bench_pkl_stack_mode: + if not stack_mode: + print_error_log("The current file contains stack information, please turn on the stack_mode") + raise 
CompareException(CompareException.INVALID_COMPARE_MODE) + else: + print_error_log("The dump mode of the two files is not same, please check the dump files") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + + +def check_file_size(input_file, max_size): + try: + file_size = os.path.getsize(input_file) + except OSError as os_error: + print_error_log('Failed to open "%s". %s' % (input_file, str(os_error))) + raise CompareException(CompareException.INVALID_FILE_ERROR) from os_error + if file_size > max_size: + print_error_log('The size (%d) of %s exceeds (%d) bytes, tools not support.' + % (file_size, input_file, max_size)) + raise CompareException(CompareException.INVALID_FILE_ERROR) + + +def check_file_not_exists(file_path): + if os.path.exists(file_path) or os.path.islink(file_path): + remove_path(file_path) + + +def remove_path(path): + if not os.path.exists(path): + return + try: + if os.path.islink(path) or os.path.isfile(path): + os.remove(path) + else: + shutil.rmtree(path) + except PermissionError as err: + print_error_log("Failed to delete {}. 
Please check the permission.".format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) from err + + +def get_dump_data_path(dump_dir): + """ + Function Description: + traverse directories and obtain the absolute path of dump data + Parameter: + dump_dir: dump data directory + Return Value: + dump data path,file is exist or file is not exist + """ + dump_data_path = None + file_is_exist = False + + check_file_or_directory_path(dump_dir, True) + for dir_path, sub_paths, files in os.walk(dump_dir): + if len(files) != 0: + dump_data_path = dir_path + file_is_exist = True + break + dump_data_path = dir_path + return dump_data_path, file_is_exist + + +def modify_dump_path(dump_path, mode): + if mode == Const.ALL: + return dump_path + file_name = os.path.split(dump_path) + mode_file_name = mode + "_" + file_name[-1] + return os.path.join(file_name[0], mode_file_name) + + +def create_directory(dir_path): + """ + Function Description: + creating a directory with specified permissions + Parameter: + dir_path: directory path + Exception Description: + when invalid data throw exception + """ + if not os.path.exists(dir_path): + try: + os.makedirs(dir_path, mode=0o700) + except OSError as ex: + print_error_log( + 'Failed to create {}.Please check the path permission or disk space .{}'.format(dir_path, str(ex))) + raise CompareException(CompareException.INVALID_PATH_ERROR) from ex + + +def execute_command(cmd): + """ + Function Description: + run the following command + Parameter: + cmd: command + Exception Description: + when invalid command throw exception + """ + print_info_log('Execute command:%s' % cmd) + process = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + while process.poll() is None: + line = process.stdout.readline() + line = line.strip() + if line: + print(line) + if process.returncode != 0: + print_error_log('Failed to execute command:%s' % " ".join(cmd)) + raise 
CompareException(CompareException.INVALID_DATA_ERROR) + + +def save_numpy_data(file_path, data): + """ + save_numpy_data + """ + if not os.path.exists(os.path.dirname(file_path)): + os.makedirs(os.path.dirname(file_path)) + np.save(file_path, data) + + +def parse_value_by_comma(value): + """ + parse value by comma, like '1,2,4,8' + """ + value_list = [] + value_str_list = value.split(Const.COMMA) + for value_str in value_str_list: + value_str = value_str.strip() + if value_str.isdigit() or value_str == '-1': + value_list.append(int(value_str)) + else: + print_error_log("please check your input shape.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return value_list + + +def get_data_len_by_shape(shape): + data_len = 1 + for item in shape: + if item == -1: + print_error_log("please check your input shape, one dim in shape is -1.") + return -1 + data_len = data_len * item + return data_len + + +def add_time_as_suffix(name): + return '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + + +def get_time(): + return datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + + +def format_value(value): + return '{:.12f}'.format(value) + + +def check_seed_all(seed, mode): + if isinstance(seed, int): + if seed < 0 or seed > Const.MAX_SEED_VALUE: + print_error_log(f"Seed must be between 0 and {Const.MAX_SEED_VALUE}.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + else: + print_error_log(f"Seed must be integer.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + if not isinstance(mode, bool): + print_error_log(f"seed_all mode must be bool.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + + +def get_process_rank(model): + print_info_log("Rank id is not provided. Trying to get the rank id of the model.") + try: + local_device = next(model.parameters()).device + except StopIteration: + print_warn_log('There is no parameter in the model. 
Fail to get rank id.') + return 0, False + if local_device.type == 'cpu': + print_warn_log("Warning: the debugger is unable to get the rank id. " + "This may cause the dumpped data to be corrupted in the " + "case of distributed training. (You may ignore this if you are using only one card.) " + "Transfer the model to npu or gpu before register_hook() to avoid this warning.") + return 0, False + else: + return local_device.index, True + + +def generate_compare_script(dump_path, pkl_file_path, dump_switch_mode): + template_path = os.path.join(os.path.dirname(__file__), "compare_script.template") + pkl_dir = os.path.dirname(pkl_file_path) + compare_script_path = os.path.join(pkl_dir, "compare_data.py") + is_api_stack = "True" if dump_switch_mode == Const.API_STACK else "False" + + try: + with FileOpen(template_path, 'r') as ftemp, \ + os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout: + code_temp = ftemp.read() + fout.write(code_temp % (pkl_file_path, dump_path, is_api_stack)) + except OSError: + print_error_log(f"Failed to open file. 
Please check file {template_path} or path {pkl_dir}.") + + print_info_log(f"Generate compare script successfully which is {compare_script_path}.") + + +def check_file_valid(file_path): + if os.path.islink(file_path): + print_error_log('The file path {} is a soft link.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if len(os.path.realpath(file_path)) > Const.DIRECTORY_LENGTH or len(os.path.basename(file_path)) > \ + Const.FILE_NAME_LENGTH: + print_error_log('The file path length exceeds limit.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not re.match(Const.FILE_PATTERN, os.path.realpath(file_path)): + print_error_log('The file path {} contains special characters.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if os.path.isfile(file_path): + file_size = os.path.getsize(file_path) + if file_path.endswith(Const.PKL_SUFFIX) and file_size > Const.ONE_GB: + print_error_log('The file {} size is greater than 1GB.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + if file_path.endswith(Const.NUMPY_SUFFIX) and file_size > Const.TEN_GB: + print_error_log('The file {} size is greater than 10GB.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def check_path_before_create(path): + if len(os.path.realpath(path)) > Const.DIRECTORY_LENGTH or len(os.path.basename(path)) > \ + Const.FILE_NAME_LENGTH: + print_error_log('The file path length exceeds limit.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not re.match(Const.FILE_PATTERN, os.path.realpath(path)): + print_error_log('The file path {} contains special characters.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def check_inplace_op(prefix): + if len(prefix) > Const.DISTRIBUTED_PREFIX_LENGTH: + return False + match_op = re.findall(r"Distributed_(.+?)_\d", prefix) + op_name = match_op[0] if match_op 
else None + return op_name in Const.INPLACE_LIST diff --git a/debug/accuracy_tools/atat/mindspore/__init__.py b/debug/accuracy_tools/atat/mindspore/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bb3f93567542e93ff913edf3daabcd3aedb91ee3 --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/__init__.py @@ -0,0 +1 @@ +from atat.mindspore.debugger.precision_debugger import PrecisionDebugger diff --git a/debug/accuracy_tools/atat/mindspore/debugger/__init__.py b/debug/accuracy_tools/atat/mindspore/debugger/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py b/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py new file mode 100644 index 0000000000000000000000000000000000000000..56a4b9bf758197d77ef04874f2865e2136d6f67c --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/debugger/debugger_config.py @@ -0,0 +1,51 @@ +import os + + +class DebuggerConfig: + convert_map = { + "L0": "cell", + "L1": "api", + "L2": 'kernel' + } + + def __init__(self, common_config, task_config): + self.dump_path = common_config.dump_path + self.task = common_config.task + self.rank = [] if not common_config.rank else common_config.rank + self.step = [] if not common_config.step else common_config.step + if not common_config.level: + common_config.level = "L1" + self.level = DebuggerConfig.convert_map[common_config.level] + self.list = [] if not task_config.list else task_config.list + self.data_mode = [] if not task_config.data_mode else task_config.data_mode + self.file_format = task_config.file_format + self.check_mode = task_config.check_mode + + self.check() + + def check(self): + if not self.dump_path: + raise Exception("Dump path is empty.") + if not os.path.isabs(self.dump_path): + raise Exception("Dump path must be absolute path.") + if not self.task: + self.task = "statistics" + if not 
self.level: + raise Exception("level must be L0, L1 or L2") + if not self.file_format: + self.file_format = "npy" + if not self.check_mode: + self.check_mode = "all" + self._check_rank() + self._check_step() + return True + + def _check_rank(self): + for rank_id in self.rank: + if not isinstance(rank_id, int) or rank_id < 0: + raise ValueError(f"rank {self.rank} must be a positive integer.") + + def _check_step(self): + for s in self.step: + if not isinstance(s, int): + raise ValueError(f"step element {s} should be int") diff --git a/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py b/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py new file mode 100644 index 0000000000000000000000000000000000000000..0099074762f0746c1bd8341047f37b3e5fe08855 --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/debugger/precision_debugger.py @@ -0,0 +1,32 @@ +import os +from atat.mindspore.ms_config import parse_json_config +from atat.mindspore.debugger.debugger_config import DebuggerConfig +from atat.mindspore.task_handler_factory import TaskHandlerFactory + + +class PrecisionDebugger: + _instance = None + + def __new__(cls, config_path=None): + if not cls._instance: + cls._instance = super().__new__(cls) + cls._instance.initialized = False + cls._instance.config = None + return cls._instance + + def __init__(self, config_path=None): + if self.initialized: + return + if not config_path: + config_path = os.path.join(os.path.dirname(__file__), "../../config/config.json") + common_config, task_config = parse_json_config(config_path) + self.config = DebuggerConfig(common_config, task_config) + self.initialized = True + + @classmethod + def start(cls, target=None): + instance = cls._instance + if not instance: + raise Exception("No instance of PrecisionDebugger found.") + handler = TaskHandlerFactory.create(instance.config) + handler.handle() diff --git a/debug/accuracy_tools/atat/mindspore/doc/dump.md 
b/debug/accuracy_tools/atat/mindspore/doc/dump.md new file mode 100644 index 0000000000000000000000000000000000000000..34529f580a7b2cb4961a2c992949cab89c15115e --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/doc/dump.md @@ -0,0 +1,65 @@ +# **精度数据采集** + +atat工具主要通过在训练脚本内添加dump接口并启动训练的方式来采集精度数据。 + +执行dump操作需要安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +## dump接口介绍 + +### PrecisionDebugger + +**功能说明** + +通过加载dump配置文件的方式来确定dump操作的详细配置。 + +可以在from atat.mindspore import PrecisionDebugger和模型初始化之间的任意位置添加该接口。 + +**原型** + +```Python +PrecisionDebugger(config_path=None) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ----------- | ------------------------------------------------------------ | -------- | +| config_path | 指定dump配置文件路径,String类型。参数示例:"./config.json"。未配置该路径时,默认使用../../config目录下的config.json文件的默认配置。config.json文件可以配置更多参数,若需要进行更多场景的精度数据dump,建议配置[config.json](../../config/config.json)文件。 | 否 | + +### start函数 + +**功能说明** + +启动函数。 + +**原型** + +```Python +debugger.start() +``` + +该函数为类函数,可以使用debugger.start()也可以使用PrecisionDebugger.start()。 + +## 示例代码 + +```Python +from atat.mindspore import PrecisionDebugger +debugger = PrecisionDebugger(config_path="./config.json") +# 请勿将以上初始化流程插入到循环代码中 +# 下面代码也可以用PrecisionDebugger.start() +debugger.start() +... 
+``` + +## dump结果文件介绍 + +训练结束后,工具将dump的数据保存在dump_path参数指定的目录下。 + +- level为L1时 + + dump结果目录请参见MindSpore官网中的《[同步Dump数据对象目录](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.3.0rc2/debug/dump.html#%E5%90%8C%E6%AD%A5dump%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95)》。 + +- level为L2时 + + dump结果目录请参见MindSpore官网中的《[异步Dump数据对象目录](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.3.0rc2/debug/dump.html#%E5%BC%82%E6%AD%A5dump%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95)》。 + diff --git a/debug/accuracy_tools/atat/mindspore/dump/__init__.py b/debug/accuracy_tools/atat/mindspore/dump/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py b/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..b0f80f40e553a8b136144f515015d0f94c635f5d --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/dump/api_kbk_dump.py @@ -0,0 +1,55 @@ +import os +import json +from atat.core.utils import make_dump_path_if_not_exists +from atat.mindspore.debugger.debugger_config import DebuggerConfig +from atat.core.log import print_info_log +from atat.core.file_check_util import FileOpen + + +class ApiKbkDump: + def __init__(self, config: DebuggerConfig): + self.dump_json = dict() + self.dump_json["common_dump_settings"] = dict() + self.dump_json["common_dump_settings"]["dump_mode"] = 0 + self.dump_json["common_dump_settings"]["path"] = "" + self.dump_json["common_dump_settings"]["net_name"] = "Net" + self.dump_json["common_dump_settings"]["iteration"] = "all" + self.dump_json["common_dump_settings"]["saved_data"] = "statistic" + self.dump_json["common_dump_settings"]["input_output"] = 0 + self.dump_json["common_dump_settings"]["kernels"] = [] + self.dump_json["common_dump_settings"]["support_device"] = [0,1,2,3,4,5,6,7] + self.dump_json["e2e_dump_settings"] = dict() + 
self.dump_json["e2e_dump_settings"]["enable"] = True + self.dump_json["e2e_dump_settings"]["trans_flag"] = True + + + if len(config.list) > 0: + self.dump_json["common_dump_settings"]["dump_mode"] = 1 + self.dump_json["common_dump_settings"]["kernels"] = config.list + self.dump_json["common_dump_settings"]["path"] = config.dump_path + if len(config.step) > 0: + step_str = "" + for s in config.step: + step_str += (str(s) + '|') + self.dump_json["common_dump_settings"]["iteration"] = step_str[:-1] + if len(config.rank) > 0: + self.dump_json["common_dump_settings"]["support_device"] = config.rank + if config.task == "tensor": + self.dump_json["common_dump_settings"]["saved_data"] = "tensor" + if len(config.data_mode) == 1: + if config.data_mode[0] == "input": + self.dump_json["common_dump_settings"]["input_output"] = 1 + if config.data_mode[0] == "output": + self.dump_json["common_dump_settings"]["input_output"] = 2 + + def handle(self): + json_path = self.dump_json["common_dump_settings"]["path"] + make_dump_path_if_not_exists(json_path) + json_path = os.path.join(json_path, "api_kbk_dump.json") + with FileOpen(json_path, 'w') as f: + json.dump(self.dump_json, f) + print_info_log(json_path + " has been created.") + os.environ["GRAPH_OP_RUN"] = "1" + os.environ["MINDSPORE_DUMP_CONFIG"] = json_path + if "MS_ACL_DUMP_CFG_PATH" in os.environ: + del os.environ["MS_ACL_DUMP_CFG_PATH"] diff --git a/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py b/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..ab534edc243dfd5f44688358fe4ca8edb6a8a12d --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/dump/dump_tool_factory.py @@ -0,0 +1,38 @@ +from atat.mindspore.debugger.debugger_config import DebuggerConfig +from atat.mindspore.dump.api_kbk_dump import ApiKbkDump +from atat.mindspore.dump.kernel_graph_dump import KernelGraphDump + + +class DumpToolFactory: + tools = { + "cell": { + 
"kbk": None, + "graph": None, + "pynative": None + }, + "api": { + "kbk": ApiKbkDump, + "graph": None, + "pynative": None + }, + "kernel": { + "kbk": None, + "graph": KernelGraphDump, + "pynative": None + } + } + + @staticmethod + def create(config: DebuggerConfig): + tool = DumpToolFactory.tools.get(config.level) + if not tool: + raise Exception("valid level is needed.") + if config.level == "api": + tool = tool.get("kbk") + elif config.level == "kernel": + tool = tool.get("graph") + elif config.level == "cell": + raise Exception("Cell dump in not supported now.") + if not tool: + raise Exception("Data dump in not supported in this mode.") + return tool(config) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/mindspore/dump/kernel_graph_dump.py b/debug/accuracy_tools/atat/mindspore/dump/kernel_graph_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..f8a10ec1b1f690931871895a47014d44594ac80a --- /dev/null +++ b/debug/accuracy_tools/atat/mindspore/dump/kernel_graph_dump.py @@ -0,0 +1,60 @@ +import os +import json +from atat.core.utils import make_dump_path_if_not_exists +from atat.mindspore.debugger.debugger_config import DebuggerConfig +from atat.core.log import print_info_log +from atat.core.file_check_util import FileOpen + + +class KernelGraphDump: + def __init__(self, config: DebuggerConfig): + self.dump_json = dict() + self.dump_json["common_dump_settings"] = dict() + self.dump_json["common_dump_settings"]["dump_mode"] = 0 + self.dump_json["common_dump_settings"]["path"] = "" + self.dump_json["common_dump_settings"]["net_name"] = "Net" + self.dump_json["common_dump_settings"]["iteration"] = "all" + self.dump_json["common_dump_settings"]["saved_data"] = "statistic" + self.dump_json["common_dump_settings"]["input_output"] = 0 + self.dump_json["common_dump_settings"]["kernels"] = [] + self.dump_json["common_dump_settings"]["support_device"] = [0, 1, 2, 3, 4, 5, 6, 7] + 
class KernelGraphDump:
    """Build and install a MindSpore graph-mode kernel dump configuration,
    driven by the fields of a DebuggerConfig."""

    def __init__(self, config: "DebuggerConfig"):
        # Defaults: full-network statistic dump on all 8 devices, every step,
        # normal op-debug mode, npy output files.
        common = {
            "dump_mode": 0,
            "path": "",
            "net_name": "Net",
            "iteration": "all",
            "saved_data": "statistic",
            "input_output": 0,
            "kernels": [],
            "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
            "op_debug_mode": 0,
            "file_format": "npy",
        }
        self.dump_json = {"common_dump_settings": common}

        if len(config.list) > 0:
            # A non-empty kernel list switches to targeted dump mode.
            common["dump_mode"] = 1
            common["kernels"] = config.list
        common["path"] = config.dump_path
        if len(config.step) > 0:
            # MindSpore expects steps as a '|'-separated string, e.g. "0|2".
            common["iteration"] = "|".join(str(step) for step in config.step)
        if len(config.rank) > 0:
            common["support_device"] = config.rank
        if config.task == "tensor":
            common["saved_data"] = "tensor"
            common["file_format"] = config.file_format
        if len(config.data_mode) == 1:
            # input_output: 0 = both, 1 = inputs only, 2 = outputs only.
            if config.data_mode[0] == "input":
                common["input_output"] = 1
            if config.data_mode[0] == "output":
                common["input_output"] = 2

    def handle(self):
        """Write the dump JSON into the dump path and export the environment
        variables for graph-mode dump. Refuses to run in kbk mode."""
        if os.getenv("GRAPH_OP_RUN") == "1":
            raise Exception("Must run in graph mode, not kbk mode")
        common = self.dump_json["common_dump_settings"]
        make_dump_path_if_not_exists(common["path"])
        json_path = os.path.join(common["path"], "kernel_graph_dump.json")
        with FileOpen(json_path, 'w') as f:
            json.dump(self.dump_json, f)
        print_info_log(json_path + " has been created.")
        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
        if common["dump_mode"] == 0:
            # Partial-iteration or kernel-less full dump also needs the ACL
            # dump config path exported.
            if common["iteration"] != "all" or len(common["kernels"]) == 0:
                os.environ["MS_ACL_DUMP_CFG_PATH"] = json_path
        else:
            if "MS_ACL_DUMP_CFG_PATH" in os.environ:
                del os.environ["MS_ACL_DUMP_CFG_PATH"]
def _check_data_mode(data_mode):
    """Shared validation for the data_mode field: when present and non-empty
    it must contain exactly one of "all", "input" or "output".

    Previously this identical check was duplicated in three config classes.
    """
    if data_mode is not None and len(data_mode) > 0:
        if len(data_mode) > 1 or data_mode[0] not in ["all", "input", "output"]:
            raise Exception("data_mode must be all, input or output")


class TensorConfig(BaseConfig):
    """Task config for full-tensor dump; adds file_format on top of BaseConfig."""

    def __init__(self, json_config):
        super().__init__(json_config)
        self.check_mode = None  # not applicable to the tensor task
        self.file_format = json_config.get("file_format")
        self.check_config()
        self._check_config()

    def _check_config(self):
        # Validate fields specific to (or shared by) the tensor task.
        _check_data_mode(self.data_mode)
        if self.file_format and self.file_format not in ["npy", "bin"]:
            raise Exception("file_format is invalid")


class StatisticsConfig(BaseConfig):
    """Task config for statistic dump; uses neither file_format nor check_mode."""

    def __init__(self, json_config):
        super().__init__(json_config)
        self.file_format = None
        self.check_mode = None
        self.check_config()
        self._check_config()

    def _check_config(self):
        _check_data_mode(self.data_mode)


class OverflowCheck(BaseConfig):
    """Task config for overflow check; adds check_mode.

    NOTE(review): unlike TensorConfig/StatisticsConfig this does not call
    self.check_config() before _check_config() — confirm whether that is
    intentional.
    """

    def __init__(self, json_config):
        super().__init__(json_config)
        self.file_format = None
        self.check_mode = json_config.get("check_mode")
        self._check_config()

    def _check_config(self):
        _check_data_mode(self.data_mode)
        if self.check_mode and self.check_mode not in ["all", "aicore", "atomic"]:
            raise Exception("check_mode is invalid")


def parse_common_config(json_config):
    """Build the CommonConfig (task/level/step/... fields) from the raw json dict."""
    return CommonConfig(json_config)
def parse_task_config(task, json_config):
    """Return the task-specific config object for *task*.

    Uses json_config.get(task) so a missing task section falls back to an
    empty dict — the original indexed json_config[task] directly, which
    raised KeyError when e.g. the defaulted "statistics" key was absent
    from the json file.

    Raises Exception for an unknown task name.
    """
    task_map = json_config.get(task)
    if not task_map:
        task_map = dict()
    if task == "tensor":
        return TensorConfig(task_map)
    if task == "statistics":
        return StatisticsConfig(task_map)
    if task == "overflow_check":
        return OverflowCheck(task_map)
    raise Exception("task is invalid.")


def parse_json_config(json_file_path):
    """Load the debugger json file and split it into (common, task) configs.

    Raises Exception when the path is falsy; defaults the task to
    "statistics" when the config file does not specify one.
    """
    if not json_file_path:
        raise Exception("json file path is None")
    with FileOpen(json_file_path, 'r') as file:
        json_config = json.load(file)
    common_config = parse_common_config(json_config)
    if not common_config.task:
        common_config.task = "statistics"
    task_config = parse_task_config(common_config.task, json_config)
    return common_config, task_config
class KernelGraphOverflowCheck:
    """Build and install a MindSpore graph-mode overflow-check dump
    configuration, driven by the fields of a DebuggerConfig."""

    def __init__(self, config: "DebuggerConfig"):
        # Defaults: full data saved for overflowing ops on all 8 devices;
        # op_debug_mode 3 checks both AI Core and atomic overflow.
        common = {
            "dump_mode": 0,
            "path": "",
            "net_name": "Net",
            "iteration": "all",
            "saved_data": "full",
            "input_output": 0,
            "kernels": [],
            "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
            "op_debug_mode": 3,
            "file_format": "npy",
        }
        self.dump_json = {"common_dump_settings": common}

        common["path"] = config.dump_path
        if len(config.step) > 0:
            # Overflow check always monitors every step; a step list is ignored.
            print_warn_log("Step would change to all in this task.")
        if len(config.rank) > 0:
            common["support_device"] = config.rank
        if config.check_mode == "aicore":
            common["op_debug_mode"] = 1
        elif config.check_mode == "atomic":
            common["op_debug_mode"] = 2

    def handle(self):
        """Write the overflow-check JSON into the dump path and export the
        environment variables for graph mode. Refuses to run in kbk mode."""
        if os.getenv("GRAPH_OP_RUN") == "1":
            raise Exception("Must run in graph mode, not kbk mode")
        dump_dir = self.dump_json["common_dump_settings"]["path"]
        make_dump_path_if_not_exists(dump_dir)
        json_path = os.path.join(dump_dir, "kernel_graph_overflow_check.json")
        with FileOpen(json_path, 'w') as f:
            json.dump(self.dump_json, f)
        print_info_log(json_path + " has been created.")
        os.environ["MINDSPORE_DUMP_CONFIG"] = json_path
        if "MS_ACL_DUMP_CFG_PATH" in os.environ:
            del os.environ["MS_ACL_DUMP_CFG_PATH"]
class OverflowCheckToolFactory:
    """Maps a dump level to the overflow-check implementation.

    Only kernel-level overflow check in graph mode is implemented; every
    other combination raises.
    """

    tools = {
        "cell": {"kbk": None, "graph": None, "pynative": None},
        "api": {"kbk": None, "graph": None, "pynative": None},
        "kernel": {"kbk": None, "graph": KernelGraphOverflowCheck, "pynative": None},
    }

    @staticmethod
    def create(config: "DebuggerConfig"):
        """Instantiate the overflow-check tool for config.level; raise if unsupported."""
        mode_map = OverflowCheckToolFactory.tools.get(config.level)
        if not mode_map:
            raise Exception("valid level is needed.")
        # Overflow check only exists in graph mode.
        tool = mode_map.get("graph")
        if not tool:
            # Fixed grammar of the error message ("in not" -> "is not").
            raise Exception("Overflow check is not supported in this mode.")
        return tool(config)


class TaskHandlerFactory:
    """Dispatches a DebuggerConfig to the factory that handles its task."""

    tasks = {
        "tensor": DumpToolFactory,
        "statistics": DumpToolFactory,
        "overflow_check": OverflowCheckToolFactory,
    }

    @staticmethod
    def create(config: "DebuggerConfig"):
        """Create the task handler for config.task; raise for unknown tasks
        or when the delegated factory yields nothing."""
        factory = TaskHandlerFactory.tasks.get(config.task)
        if not factory:
            raise Exception("valid task is needed.")
        handler = factory.create(config)
        if not handler:
            raise Exception("Can not find task handler")
        return handler
class Advisor:
    """
    Analyzes a comparison-result csv and produces expert advice for the
    first API whose accuracy check did not pass.
    """

    def __init__(self, input_file, out_path=""):
        self.input_file = os.path.realpath(input_file)
        self.out_path = os.path.realpath(out_path)

    @staticmethod
    def deterministic_advisor(message, node_name):
        # APIs with known non-deterministic kernels get the determinism hint
        # instead of the generic suggestion.
        for api_name in AdvisorConst.NEED_DETERMINISTIC_API:
            if api_name in node_name:
                return AdvisorConst.DETERMINISTIC_SUGGEST
        return message

    @staticmethod
    def batch_norm_advisor(message, node_name):
        # NOTE(review): FORWARD_INPUT_1 is "forward_input.1", yet this helper
        # is only invoked on the non-forward branch of gen_advisor_message
        # (which matches "forward" in the name) — confirm reachability.
        if AdvisorConst.FUNC_BATCH_NORM in node_name and AdvisorConst.FORWARD_INPUT_1 in node_name:
            message = AdvisorConst.BATCH_NORM_SUGGEST
        return message

    def _parse_input_file(self):
        """Read the csv, classify it (ALL/MD5/SUMMARY) and align indices to
        csv line numbers. Raises CompareException on unreadable/unknown files."""
        try:
            result_df = pd.read_csv(self.input_file, on_bad_lines='skip')
        except OSError as os_err:
            print_error_log('Failed to parse the input file %s. %s'
                            % (self.input_file, str(os_err)))
            raise CompareException(CompareException.PARSE_FILE_ERROR) from os_err
        columns = result_df.columns.values
        if {CompareConst.ACCURACY, CompareConst.NPU_NAME}.issubset(columns):
            self.file_type = Const.ALL
        elif {CompareConst.RESULT, CompareConst.NPU_MD5}.issubset(columns):
            self.file_type = Const.MD5
        elif {CompareConst.MAX_DIFF, CompareConst.RESULT}.issubset(columns):
            self.file_type = Const.SUMMARY
        else:
            print_error_log('Compare result file does not meet the required conditions.')
            raise CompareException(CompareException.INVALID_FILE_ERROR)
        result_df.reset_index(inplace=True)
        # Shift the index so it matches csv line numbers (data starts at line 2).
        result_df.iloc[:, 0] += 2
        return result_df

    def _check_path_vaild(self):
        # (method name kept as-is for compatibility; "vaild" is a legacy typo)
        input_file_checker = FileChecker(self.input_file, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
                                         FileCheckConst.CSV_SUFFIX)
        input_file_checker.common_check()
        out_path_checker = FileChecker(self.out_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE)
        out_path_checker.common_check()

    def gen_advisor_message(self, node_name):
        """Pick the advice text for a failing node based on forward/backward
        and input/output markers embedded in its name."""
        is_input = AdvisorConst.INPUT in node_name
        if AdvisorConst.FORWARD in node_name:
            message = AdvisorConst.FORWARD_INPUT_SUGGEST if is_input else AdvisorConst.FORWARD_OUTPUT_SUGGEST
            message = self.deterministic_advisor(message, node_name)
        else:
            message = AdvisorConst.BACKWARD_INPUT_SUGGEST if is_input else AdvisorConst.BACKWARD_OUTPUT_SUGGEST
            message = self.deterministic_advisor(message, node_name)
            message = self.batch_norm_advisor(message, node_name)
        return message

    def analyze_unmatched(self, analyze_data):
        """Warn for rows whose name matched but whose shape/dtype did not."""
        if self.file_type == Const.ALL:
            unmatched = analyze_data[analyze_data[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_UNMATCH]
        else:
            unmatched = analyze_data[(analyze_data[CompareConst.NPU_SHAPE] == CompareConst.NAN) |
                                     (analyze_data[CompareConst.BENCH_SHAPE] == CompareConst.NAN)]
        for _, row in unmatched.iterrows():
            print_warn_log("The tensor name matches but the shape or dtype does not match: {}"
                           .format(row[CompareConst.NPU_NAME]))

    def gen_advisor_result(self, pd_data):
        """Build an AdvisorResult from the first failing row of pd_data."""
        first_failure = pd_data.iloc[0]
        node_name = first_failure[CompareConst.NPU_NAME]
        row_index = first_failure['index']
        message = self.gen_advisor_message(node_name)
        print_warn_log("Find %s accuracy not reached, the line is %s" % (node_name, row_index))
        return AdvisorResult(node_name, row_index, message)

    def analysis(self):
        """Entry point: validate paths, parse the csv, report unmatched rows,
        then print and persist advice for the first failure (or an all-clear)."""
        self._check_path_vaild()
        analyze_data = self._parse_input_file()
        print_info_log("Start analyzing the comparison result: %s" % self.input_file)
        self.analyze_unmatched(analyze_data)
        if self.file_type == Const.ALL:
            failing_data = analyze_data[analyze_data[CompareConst.ACCURACY] == CompareConst.ACCURACY_CHECK_NO]
        elif self.file_type == Const.MD5:
            failing_data = analyze_data[analyze_data[CompareConst.RESULT] == CompareConst.DIFF]
        elif self.file_type == Const.SUMMARY:
            failing_data = analyze_data[analyze_data[CompareConst.RESULT] == CompareConst.WARNING]
        if failing_data.empty:
            print_info_log("All data from api input/output accuracy reached")
            result = AdvisorResult(AdvisorConst.NO_ERROR_API, AdvisorConst.NO_ERROR_API, AdvisorConst.NO_ERR_SUGGEST)
        else:
            result = self.gen_advisor_result(failing_data)
        message_list = result.print_advisor_log()
        result.gen_summary_file(self.out_path, message_list)
class AdvisorConst:
    """String constants used when composing and formatting expert advice."""

    # Text fragments used to assemble log lines.
    NEW_LINE = "\n"
    COLON = ": "

    # Keys of the advisor summary.
    SUSPECT_NODES = "Suspect Nodes"
    LINE = "Line"
    ADVISOR_SUGGEST = "Expert Advice"

    # Placeholder used when no failing API was found.
    NO_ERROR_API = "NA"

    # Advice messages.
    NO_ERR_SUGGEST = "All data in comparison result meets the accuracy requirements."
    FORWARD_INPUT_SUGGEST = "1. Analyze the model to view the input source.\n" \
                            "2. Check whether an inplace API causes the output result to overwrite the input result. That is, the fault is actually caused by a computation error.\n" \
                            "3. The fault may be caused by memory corruption and further analysis is required."
    FORWARD_OUTPUT_SUGGEST = "This is a forward API computation error. Check the computation implementation."
    BACKWARD_INPUT_SUGGEST = "Check whether the forward computation result is affected."
    BACKWARD_OUTPUT_SUGGEST = "This is a backward API computation error. Check the computation implementation."
    BATCH_NORM_SUGGEST = "Torch API batch_norm input not fixed, the following suggestions may fix it:\n" \
                         "1. If use torch.nn.functional.batch_norm, you can set parameter training=False.\n" \
                         "2. If use torch.nn.BatchNormXXX, you can set parameter affine=False.\n" \
                         "3. Use seed_all(mode=True) to enable deterministic computing."
    DETERMINISTIC_SUGGEST = "This torch api may be uncertainty in the calculation, " \
                            "can seed_all(mode=True) to enable deterministic computing."

    # Name fragments used for pattern matching on dumped node names.
    FUNC_BATCH_NORM = "Functional_batch_norm"
    FORWARD_INPUT_1 = "forward_input.1"
    NEED_DETERMINISTIC_API = ["conv2d", "conv3d", "matmul", "nll_loss", "layer_norm", "lstm"]
    BATCH_NORM = "batch_norm"

    # Keywords recognized inside node names.
    INPUT = "input"
    OUTPUT = "output"
    FORWARD = "forward"
    BACKWARD = "backward"
class AdvisorResult:
    """Holds one piece of advice (suspect node, csv line, message) and knows
    how to print it and persist it to a timestamped summary file."""

    def __init__(self, node, line, message):
        self.suspect_node = node
        self.line = line
        self.advisor_message = message

    @staticmethod
    def gen_summary_file(out_path, message_list):
        """Write the advice lines to advisor_<timestamp>.txt under out_path,
        restricting the file's permissions. Failures are logged, not raised."""
        timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
        result_file = os.path.join(out_path, 'advisor_{}.txt'.format(timestamp))
        try:
            # os.open with explicit flags/mode avoids following pre-existing
            # symlinks and creates the file with restricted permissions.
            with os.fdopen(os.open(result_file, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as output_file:
                output_file.truncate(0)
                output_file.writelines([message + AdvisorConst.NEW_LINE for message in message_list])
            change_mode(result_file, FileCheckConst.DATA_FILE_AUTHORITY)
        except IOError as io_error:
            print_error_log("Failed to save %s, the reason is %s." % (result_file, io_error))
        else:
            print_info_log("The advisor summary is saved in: %s" % result_file)

    def print_advisor_log(self):
        """Log the advice and return the formatted lines for persistence."""
        print_info_log("The summary of the expert advice is as follows: ")
        messages = [
            AdvisorConst.LINE + AdvisorConst.COLON + str(self.line),
            AdvisorConst.SUSPECT_NODES + AdvisorConst.COLON + self.suspect_node,
            AdvisorConst.ADVISOR_SUGGEST + AdvisorConst.COLON + self.advisor_message,
        ]
        for message in messages:
            print_info_log(message)
        return messages
在NPU和GPU环境下分别安装预检工具。详见“**工具安装**”。 +2. 在NPU环境下dump预检数据(使用msCheckerConfig.update_config开启真实数据模式)。详见“**dump预检数据**”。 +3. 将NPU环境下dump的预检数据拷贝至GPU环境。 +4. 在NPU和GPU环境下分别执行run_ut。详见“**run_ut预检操作**”。 +5. 将NPU和GPU执行run_ut生成的`accuracy_checking_details_{timestamp}.csv`结果文件拷贝至同一环境下。 +6. 运行api_precision_compare.py。详见“**预检结果比对**”。 + +## 工具安装 + +1. 将att仓代码下载到本地,并配置环境变量。假设下载后att仓路径为 $ATT_HOME,环境变量应配置为: + + ```bash + export PYTHONPATH=$PYTHONPATH:$ATT_HOME/debug/accuracy_tools/ + ``` + +2. 安装依赖tqdm、rich、pyyaml、pandas + + ```bash + pip3 install tqdm rich pyyaml pandas + ``` + +## 预检操作 + +### dump预检数据 + +#### dump操作 + +在训练脚本(如main.py)中加入以下代码导入工具dump模块,启动训练即可自动抓取网络所有API信息。 + +- 若训练脚本中的代码不是通过torch.utils.data.dataloader来加载数据或在部分流水并行、张量并行场景下,工具的开关无法在每张卡上自动打开,导致多卡训练dump结果只有一组json,那么需要在训练代码中添加打开工具开关的调用。 + + 在训练代码中添加数据dump操作如下: + + ```Python + import api_accuracy_checker.dump as DP + + # 需要先修改enable_dataloader参数值为False + # 关闭torch.utils.data.dataloader加载数据时,下列代码须在训练step代码内添加 + DP.dump.start() # 开启工具dump模块 + + ... + + DP.dump.stop() # 控制dump结束 + DP.dump.step() # 在DP.dump.stop()后加入DP.dump.step()即可指定需要dump的step + ``` + + 上述代码要添加在迭代内,如对于[ModelLink](https://gitee.com/ascend/ModelLink)的LLAMA2-7B可以添加在training.py中train函数的iteration循环内。之后工具会适配这个场景开关的自动打开。 + +- 如果训练脚本是通过torch.utils.data.dataloader方式加载数据。 + + 首先,需要开启torch.utils.data.dataloader加载数据,操作如下: + + ```bash + cd att/debug/accuracy_tools/api_accuracy_checker + vi config.yaml + # 修改enable_dataloader参数值为True + ``` + + 其次,在训练脚本中加入以下代码导入工具dump模块,启动训练即可自动抓取网络所有API信息。 + + ```python + import api_accuracy_checker.dump + ``` + + 工具默认抓取训练的**第二个迭代**并且在第二个迭代后会报错退出训练进程,可通过target_iter参数配置。 + + **报错信息如下,这个报错仅用于停止训练,属于正常现象**: + + ```bash + Exception: Model pretest: exit after iteration 1. 
+ ``` + + 若报错信息不一致,可能是由于服务器的其他错误信息覆盖导致,可以尝试查找报错信息中的Exception。 + +dump信息默认会存盘到“./step1”路径下(相对于启动训练的路径),包括: + +- forward_info_{pid}.json:前向API信息文件。 +- backward_info_{pid}.json:反向API信息文件。 +- stack_info_{pid}.json:调用栈信息文件。 + +forward_info与stack_info中的key值一一对应,用户可根据forward_info中API的key在stack_info中查询到其调用栈及代码行位置。 + +若有需要,用户可以通过msCheckerConfig.update_config来配置dump路径以及开启**真实数据模式**、指定dump某个step或配置**API dump白名单**,详见“**msCheckerConfig.update_config**”。 + +#### 真实数据模式 + +预检工具默认为随机数据模式,如果想要完全复刻整网的API运行情况,可以使用真实数据模式,添加以下代码即可: + +```python +from api_accuracy_checker.dump import msCheckerConfig +msCheckerConfig.update_config(real_data=True) +``` + +#### API dump白名单 + +精度预检工具可以对指定API进行预检操作,可以在dump时的训练脚本中直接添加白名单参数,只dump指定的API数据,示例代码如下: + +```python +from api_accuracy_checker.dump import msCheckerConfig +msCheckerConfig.update_config(white_list=["conv1d", "conv2d"]) +``` + +配置的API名称须存在于[support_wrap_ops.yaml](./hook_module/support_wrap_ops.yaml)文件下。 + +#### 工具支持的API列表 + +预检工具维护固定的API支持列表,若需要删除或增加dump的API,可以在[support_wrap_ops.yaml](./hook_module/support_wrap_ops.yaml)文件内手动修改,如下示例: + +```bash +functional: # functional为算子类别,找到对应的类别,在该类别下按照下列格式删除或添加API + - conv1d + - conv2d + - conv3d +``` + +#### msCheckerConfig.update_config + +**功能说明** + +配置精度预检dump时的属性。 + +可选配置。 + +**函数原型** + +```python +msCheckerConfig.update_config(dump_path="./", real_data=False, target_iter=[1], white_list=[], enable_dataloader=False) +``` + +**参数说明** + +| 参数名称 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| dump_path | 设置dump路径,默认为当前目录。若指定目录不存在,则自动创建。 | 否 | +| real_data | 真实数据模式,可取值True或False,默认为False,表示随机数据模式,配置为True后开启真实数据模式,dump信息增加forward_real_data和backward_real_data目录,目录下保存每个API输入的具体数值。 | 否 | +| target_iter | 指定dump某个step的数据,默认为[1],须指定为训练脚本中存在的step。target_iter为list格式,可配置逐个step,例如:target_iter=[0,1,2];也可以配置step范围,例如:target_iter=list(range(0,9)),表示dump第0到第8个step。 | 否 | +| white_list | API 
dump白名单,指定dump具体API数据,也可以直接配置预检的API白名单,详细请参见“**API预检白名单**”。参数示例:white_list=["conv1d", "conv2d"]。默认未配置白名单,即dump全量API数据。 | 否 | +| enable_dataloader | 自动dump数据开关,可取值True(开启)、False(关闭),默认关闭。 | 否 | + +### run_ut预检操作 + +完成“dump预检数据”后,仅仅获取了API的输入数据,为了得到NPU vs CPU高精度(标杆)的预检比对结果和GPU vs CPU高精度(标杆)的预检比对结果,还需要进行run_ut操作。 + +run_ut预检操作包括如下场景: + +- 使用run_ut.py执行预检:run_ut.py适用于数据量较小的单卡场景。 +- 使用multi_run_ut.py执行多线程预检:multi_run_ut.py适用于数据量较大的大模型场景。 + +#### 使用run_ut.py执行预检 + +1. 将API信息输入给run_ut模块运行精度检测并比对,运行如下命令: + + ```bash + cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut + python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json + ``` + + 某些场景下(如推理),可以不指定backward_info_0.json,不影响预检功能。 + + | 参数名称 | 说明 | 是否必选 | + | -------------------------------- | ------------------------------------------------------------ | ---------------------------------- | + | -forward或--forward_input_file | 指定前向API信息文件forward_info_{pid}.json。 | 是 | + | -backward或--backward_input_file | 指定反向API信息文件backward_info_{pid}.json。 | 否 | + | -save_error_data | 保存精度未达标的API输入输出数据。 | 否 | + | -o或--out_path | 指定run_ut执行结果存盘路径,默认“./”(相对于run_ut的路径)。 | 否 | + | -j或--jit_compile | 开启jit编译。 | 否 | + | -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0。 | 否 | + | -csv_path或--result_csv_path | 指定本次运行中断时生成的`accuracy_checking_result_{timestamp}.csv`文件路径,执行run_ut中断时,若想从中断处继续执行,配置此参数即可。需要指定为上次中断的`accuracy_checking_result_{timestamp}.csv`文件。详见“**断点续检**”。 | run_ut操作中断后继续执行场景下必选 | + | -real_data_path | 指定run_ut操作的真实数据路径。真实数据dump模式通过**msCheckerConfig.update_config**接口的real_data参数开启。指定绝对路径为forward_real_data和backward_real_data目录的父目录。 | dump的数据为真实数据下必选 | + | -f或--filter_api | 过滤模型中除最大值和最小值以外其他参数和结构相同的API。适用于模型较大且重复API较多的场景。 | 否 | + + run_ut执行结果包括`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`两个文件。`accuracy_checking_result_{timestamp}.csv`是API粒度的,标明每个API是否通过测试。建议用户先查看`accuracy_checking_result_{timestamp}.csv`文件,对于其中没有通过测试的或者特定感兴趣的API,根据其API 
name字段在`accuracy_checking_details_{timestamp}.csv`中查询其各个输出的达标情况以及比较指标。详细介绍请参见“**预检结果**”。 + +2. (可选)如果需要保存比对不达标的输入和输出数据,可以在run_ut执行命令结尾添加-save_error_data,例如: + + ```bash + python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -save_error_data + ``` + + 数据默认会存盘到'./ut_error_data{timestamp}'路径下(相对于启动run_ut的路径),有需要的话,用户可以通过修改att/debug/accuracy_tools/api_accuracy_checker目录下,config.yaml文件的error_data_path参数来配置保存路径,详见“config.yaml文件说明”。。 + +3. (可选)如果dump的数据为真实数据,那么需要指定真实数据路径,例如: + + ```bash + python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -real_data_path /home/xxx/ut/real_data + ``` + +#### 使用multi_run_ut.py执行多线程预检 + +multi_run_ut.py脚本,可以并行执行多个run_ut操作,从而降低预检耗时。 + +命令示例如下: + +```bash +cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut +python multi_run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -n 32 -d 0 1 2 3 +``` + +某些场景下(如推理),可以不指定backward_info_0.json,不影响预检功能。 + +| 参数名称 | 说明 | 是否必选 | +| -------------------------------- | ------------------------------------------------------------ | ---------------------------------- | +| -forward或--forward_input_file | 指定前向API信息文件forward_info_{pid}.json。 | 是 | +| -backward或--backward_input_file | 指定反向API信息文件backward_info_{pid}.json。 | 否 | +| -save_error_data | 保存精度未达标的API输入输出数据。 | 否 | +| -o或--out_path | 指定run_ut执行结果存盘路径,默认“./”(相对于run_ut的路径)。 | 否 | +| -j或--jit_compile | 开启jit编译。 | 否 | +| -n | 同时执行run_ut线程的数量,默认为8,最大支持64,但每个Device最大支持8个线程,当指定多个线程和多个Device时,则线程数在每张卡上均分。 | 否 | +| -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0,支持同时指定0~7,共8个Device。 | 否 | +| -csv_path或--result_csv_path | 指定本次运行中断时生成的`accuracy_checking_result_{timestamp}.csv`文件路径,执行run_ut中断时,若想从中断处继续执行,配置此参数即可。需要指定为上次中断的`accuracy_checking_result_{timestamp}.csv`文件。详见“**断点续检**”。 | run_ut操作中断后继续执行场景下必选 | +| -real_data_path | 指定run_ut操作的真实数据路径。真实数据dump模式通过**msCheckerConfig.update_config**接口的real_data参数开启。指定绝对路径为forward_real_data和backward_real_data目录的父目录。 | dump的数据为真实数据下必选 | +| 
-f或--filter_api | 过滤模型中除最大值和最小值以外其他参数和结构相同的API。适用于模型较大且重复API较多的场景。 | 否 | + +#### 断点续检 + +精度预检run_ut过程中,若因环境、数据量过大等原因导致预检进程中断,那么当用户解决这些问题后,重新执行run_ut操作,可以通过断点续检操作继续前面未完成的预检,会在-csv_path指定的`accuracy_checking_result_{timestamp}.csv`文件以及对应的`accuracy_checking_details_{timestamp}.csv`文件中继续写入后续的结果,不会重新创建结果文件。 + +须指定为上次预检中断的`accuracy_checking_result_{timestamp}.csv`文件。请勿修改`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`文件名,包括时间戳,否则断点续检会因无法识别到文件名而失败。 + +断点续检操作通过如下命令执行: + +```bash +python run_ut.py -forward ./forward_info_0.json -backward ./backward_info_0.json -csv_path /home/xxx/ut/accuracy_checking_result_{timestamp}.csv +``` + +#### API预检白名单 + +run_ut过程同样支持API预检白名单,操作方式如下: + +修改att/debug/accuracy_tools/api_accuracy_checker目录下config.yaml文件的white_list参数,配置需要预检的API名称,详见“config.yaml文件说明”。 + +### config.yaml文件说明 + +config.yaml文件可以通过配置参数来控制dump和run_ut操作的真实数据模式以及白名单等功能。 + +文件路径为:att/debug/accuracy_tools/api_accuracy_checker/config.yaml + +| 参数名称 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| dump_path | 设置dump路径,默认为当前目录。若指定目录不存在,则自动创建。 | 否 | +| real_data | 真实数据模式,可取值True或False,默认为False,表示随机数据模式,配置为True后开启真实数据模式,dump信息增加forward_real_data和backward_real_data目录,目录下保存每个API输入的具体数值。 | 否 | +| enable_dataloader | 自动dump数据开关,可取值True(开启)、False(关闭),默认关闭。 | 否 | +| target_iter | 指定dump某个step的数据,默认为[1],须指定为训练脚本中存在的step。target_iter为list格式,可配置逐个step,例如:target_iter=[0,1,2];也可以配置step范围,例如:target_iter=list(range(0,9)),表示dump第0到第8个step。 | 否 | +| white_list | API dump白名单,指定dump具体API数据,也可以直接配置预检的API白名单,详细请参见“**API预检白名单**”。参数示例:white_list=["conv1d", "conv2d"]。默认未配置白名单,即dump全量API数据。 | 否 | +| error_data_path | 配置保存精度未达标的API输入输出数据路径。 | 否 | +| jit_compile | 开启jit编译。 | 否 | +| precision | 浮点数表示位数,默认取小数点后14位。 | 否 | + +## 预检结果 + +精度预检生成的`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`文件示例如下: + +可以通过先查看`accuracy_checking_result_{timestamp}.csv`文件的Forward Test 
Success和Backward Test Success,判断是否存在未通过测试的API,再查看`accuracy_checking_details_{timestamp}.csv`文件的API详细达标情况,API达标情况介绍请参见“**API预检指标**”。 + +`accuracy_checking_result_{timestamp}.csv` + +![891a3bd8_12631423](img/accuracy_checking_result.png) + +| 字段 | 含义 | +| --------------------- | ------------------------------------------------------------ | +| API name | API名称。 | +| Forward Test Success | 前向API是否通过测试,pass为通过,warning为待观察,error为错误。 | +| Backward Test Success | 反向API是否通过测试,pass为通过,warning为待观察,error为错误,如果是空白的话代表该API没有反向输出。 | +| Message | 提示信息。 | + +Forward Test Success和Backward Test Success是否通过测试是由`accuracy_checking_details_{timestamp}.csv`中的余弦相似度、最大绝对误差、双百双千双万指标判定结果决定的。 + +需要注意的是`accuracy_checking_details_{timestamp}.csv`中可能存在一个API的前向(反向)有多个输出,那么每个输出记录一行,而在`accuracy_checking_result_{timestamp}.csv`中的结果需要该API的所有结果均为pass才能标记为TRUE,否则标记FALSE或WARING。 + +`accuracy_checking_details_{timestamp}.csv` + +![f07237b1_12631423](img/accuracy_checking_details.png) + +| 字段 | 含义 | +| ---------------- | ------------------------------------------------------------ | +| API name | NPU或GPU下的API名称。 | +| Bench Dtype | 标杆数据的API数据类型。 | +| Device Dtype | NPU或GPU数据的API数据类型。 | +| Shape | API的Shape信息。 | +| 余弦相似度 | NPU或GPU数据与标杆数据的余弦相似度。 | +| 最大绝对误差 | NPU或GPU数据与标杆数据的最大绝对误差。 | +| 双百指标 | 双百精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于百分之一的个数占总元素个数的比例。测试通过标准为相对误差大于百分之一的个数占总元素个数的比例小于百分之一。 | +| 双千指标 | 双千精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于千分之一的个数占总元素个数的比例。测试通过标准为相对误差大于千分之一的个数占总元素个数的比例小于千分之一。 | +| 双万指标 | 双万精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于万分之一的个数占总元素个数的比例。测试通过标准为相对误差大于万分之一的个数占总元素个数的比例小于万分之一。 | +| 二进制一致错误率 | NPU或GPU数据中每个Tensor精度不一致的数值的数量与Tensor中数值数量的比值。只有数据是builtin类型(bool、int、float、str)、torch.bool和torch的int类型才会展示。 | +| 误差均衡性 | NPU或GPU数据与标杆数据精度差的上下浮动情况。 | +| 均方根误差 | NPU或GPU数据与标杆数据的均方根误差。 | +| 小值域错误占比 | NPU或GPU Tensor中与标杆的绝对误差大于错误阈值的小值在小值域(小值的总数量)中的占比。判断为小值以及绝对误差的错误阈值见“**小值域阈值**”。 | +| 相对误差最大值 | NPU或GPU数据与标杆数据相对误差的最大值。 | +| 相对误差平均值 | NPU或GPU数据与标杆数据相对误差的平均值。 | +| inf/nan错误率 | 
NPU与标杆inf/nan计算不一致的元素个数占总元素的个数比例。 | +| 相对误差错误率 | NPU与标杆的正常值计算相对误差,其大于错误阈值的元素个数占正常值元素个数的比例。 | +| 绝对误差错误率 | NPU与标杆的小值计算绝对误差,其大于错误阈值的元素个数占小值元素个数的比例。 | +| Status | API预检通过状态,pass表示通过测试,error表示未通过,warning表示测试未通过双千或双万精度指标,SKIP表示该API的某个参数的反向不要计算梯度,所以没有任何计算过程,其他信息均为空。 | +| message | 提示信息。 | + +### 小值域阈值 + +判定为小值的阈值为: + +- torch.float32:e-6 +- torch.float16:e-3 +- torch.bfloat16:e-3 + +小值域的绝对误差阈值为: + +- torch.float32:e-9 +- torch.float16:e-5 +- torch.bfloat16:e-5 + +### API预检指标 + +API预检指标是通过对`accuracy_checking_details_{timestamp}.csv`中的余弦相似度、最大绝对误差双百、双千、双万精度指标的数值进行判断,得出该API是否符合精度标准的参考指标。 + +API预检通过测试,则在`accuracy_checking_details_{timestamp}.csv`文件中的“Status”列标记“pass”,否则标记“error”或“warning”,详细规则如下: + +1. 余弦相似度 > 0.99:≤ 0.99为不达标,标记“error”,> 0.99达标,进行下一步; +2. 最大绝对误差 < 0.001:< 0.001达标,标记“pass”,≥ 0.001为不达标,进行下一步; +3. 双百、双千、双万精度指标: + - 对于float16和bfloat16数据:双百指标不通过,标记“error”;双百指标通过,双千指标不通过,标记“warning”;双百、双千指标均通过,标记“pass”。 + - 对于float32和float64数据:双千指标不通过,标记“error”;双千指标通过,双万指标不通过,标记“warning”;双千、双万指标均通过,标记“pass”。 + +4. 
在`accuracy_checking_result_{timestamp}.csv`中以“Forward Test Success”和“Backward Test Success”字段统计该算子前向反向输出的测试结果,对于标记“pass”的算子,则在`accuracy_checking_result_{timestamp}.csv`中标记“TRUE”表示测试通过,对于标记“error”或“warning”的算子,则在`accuracy_checking_result_{timestamp}.csv`中标记“FALSE”表示测试不通过。由于一个算子可能有多个前向或反向的输入或输出,那么该类算子的输入或输出中必须全为“pass”,才能在`accuracy_checking_result_{timestamp}.csv`中标记“TRUE”,只要有一个输入或输出标记“error”或“warning”,那么在`accuracy_checking_result_{timestamp}.csv`中标记“FALSE”。 + +## 预检结果比对 + +该步骤仅新精度标准比对法需要执行,需要同时获取NPU和GPU环境下run_ut操作的预检结果`accuracy_checking_details_{timestamp}.csv`文件。执行如下命令进行NPU和GPU预检结果的比对: + +```bash +cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/compare +python api_precision_compare.py -npu /home/xxx/npu/accuracy_checking_details_{timestamp}.csv -gpu /home/xxx/gpu/accuracy_checking_details_{timestamp}.csv -o /home/xxx/ +``` + +| 参数名称 | 说明 | 是否必选 | +| -------------------- | ------------------------------------------------------------ | -------- | +| -npu或--npu_csv_path | NPU预检结果`accuracy_checking_details_{timestamp}.csv`文件路径。默认从当前目录下识别该文件。 | 否 | +| -gpu或--gpu_csv_path | GPU预检结果`accuracy_checking_details_{timestamp}.csv`文件路径。默认从当前目录下识别该文件。 | 否 | +| -o或--out_path | 指定api_precision_compare.py执行结果存盘路径,默认为当前目录。 | 否 | + +执行完成后输出`api_precision_compare_result_{timestamp}.csv`和`api_precision_compare_details_{timestamp}.csv`文件。文件示例如下: + +可以通过先查看`api_precision_compare_result_{timestamp}.csv`文件的Forward Test Success和Backward Test Success,判断是否存在未通过测试的API,再查看`api_precision_compare_details_{timestamp}.csv`文件的API详细达标情况。 + +`api_precision_compare_result_{timestamp}.csv` + +![api_precision_compare_result](img/api_precision_compare_result.png) + +| 字段 | 含义 | +| --------------------- | ------------------------------------------------------------ | +| API name | API名称。 | +| Forward Test Success | 前向API是否通过测试,pass为通过,warning为待观察,error为错误,skip表示该API的数据类型不支持使用新精度标准进行比对,如float64。 | +| Backward Test Success | 
反向API是否通过测试,pass为通过,warning为待观察,error为错误,如果是空白的话代表该API没有反向输出,skip表示该API的数据类型不支持使用新精度标准进行比对,如float64。 | +| Message | 提示信息。 | + +Forward Test Success和Backward Test Success是否通过测试是由`api_precision_compare_details_{timestamp}.csv`中的各个指标判定结果决定的。需要注意的是`api_precision_compare_details_{timestamp}.csv`中可能存在一个API的前向(反向)有多个输出,那么每个输出记录一行,而在`api_precision_compare_result_{timestamp}.csv`中的结果需要该API的所有结果均为pass才能标记为TRUE,否则标记FALSE或WARING。 + +`api_precision_compare_details_{timestamp}.csv` + +![api_precision_compare_details](img/api_precision_compare_details.png) + +| 字段 | 含义 | +| ------------------------ | ------------------------------------------------------------ | +| API name | NPU或GPU下的API名称。 | +| 小值域错误比值 | NPU与CPU的小值域的错误比率/GPU与CPU的小值域的错误比率。 | +| 小值域错误判定结果 | 小值域错误比值小于等于1标记为pass,1~2之间标记为waring,大于2标记为error。 | +| 均方根误差比值 | NPU与CPU的均方根误差/GPU与CPU的均方根误差。 | +| 均方根误差判定结果 | 均方根误差比值小于等于1标记为pass,1~2之间标记为waring,大于2标记为error。 | +| 相对误差最大值比值 | NPU与CPU的相对误差最大值/GPU与CPU的相对误差最大值。 | +| 相对误差最大值判定结果 | 相对误差最大值比值小于等于1标记为pass,1~10之间标记为waring,大于10标记为error。 | +| 相对误差平均值比值 | NPU与CPU的相对误差的平均值/GPU与CPU的相对误差的平均值。 | +| 相对误差平均值判定结果 | 相对误差平均值比值小于等于1标记为pass,1~2之间标记为waring,大于2标记为error。 | +| 误差均衡性比值 | NPU与CPU的误差均衡性/GPU与CPU的误差均衡性。 | +| 误差均衡性判定结果 | 误差均衡性比值小于等于1标记为pass,1~2之间标记为waring,大于2标记为error。该字段暂不参与api_precision_compare_result的结果判定。 | +| inf/nan错误率 | NPU与标杆inf/nan计算不一致的元素个数占总元素的个数比例。 | +| inf/nan判定结果 | inf/nan错误率判定结果,等于0标记为pass,其余情况标记为error。 | +| 相对误差错误率 | NPU与标杆的正常值计算相对误差,其大于错误阈值的元素个数占正常值元素个数的比例。 | +| 相对误差判定结果 | 相对误差错误率判定结果,等于0标记为pass,其余情况标记为error。 | +| 绝对误差错误率 | NPU与标杆的小值计算绝对误差,其大于错误阈值的元素个数占小值元素个数的比例。 | +| 绝对误差判定结果 | 绝对误差错误率判定结果,等于0标记为pass,其余情况标记为error。 | +| 二进制一致错误率 | NPU或GPU数据中每个Tensor精度不一致的数值的数量与Tensor中数值数量的比值。只有数据是builtin类型(bool、int、float、str)、torch.bool和torch的int类型或者在新精度标准中使用二进制一致算法进行比对的API才会展示。 | +| 二进制一致错误率判定结果 | 二进制一致错误率判定结果,等于0标记为pass,其余情况标记为error。 | +| 比对结果 | 综合所有指标的最终结果。如果比对指标中有error,则标记为error;有warning,则标记为warning;否则标记为pass。 | +| 比对算法 | API使用的比对算法,为标杆比对法、二进制一致法和绝对阈值法中的一种。 | +| Message | 
提示信息。当前提示该API比对结果为error或warning时对应不符合标准的指标。 | + +# 溢出解析工具 + +针对训练过程中的溢出检测场景(参见[ptdbg_ascend精度工具功能说明](https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/ptdbg_ascend/doc)中的"溢出检测场景"进行溢出检测dump),对于输入正常但输出存在溢出的API,会在训练执行目录下将溢出的API信息按照前向和反向分类,dump并保存为`forward_info_{pid}.json`,前向过程溢出的API可通过该工具对`forward_info_{pid}.json`进行解析,输出溢出API为正常溢出还是非正常溢出,从而帮助用户快速判断。 + +工具支持PyTorch版本:1.8.1/1.11.0/2.0/2.1。 + +若溢出检测场景dump结果生成`forward_info_{pid}.json`文件,则使用本工具进行解析。操作步骤如下: + +1. 安装预检工具 + + 将att仓代码下载到本地,并配置环境变量。假设下载后att仓路径为 $ATT_HOME,环境变量应配置为 + + ```bash + export PYTHONPATH=$PYTHONPATH:$ATT_HOME/debug/accuracy_tools/ + ``` + + 安装依赖tqdm、rich、pyyaml + + ```bash + pip3 install tqdm rich pyyaml + ``` + +2. 执行溢出API解析操作 + + **forward_info_0.json为[ptdbg_ascend精度工具功能说明](https://gitee.com/ascend/att/tree/master/debug/accuracy_tools/ptdbg_ascend/doc)中的"溢出检测场景"执行溢出检测dump时生成,而不是精度预检工具生成。** + + ```bash + cd $ATT_HOME/debug/accuracy_tools/api_accuracy_checker/run_ut + python run_overflow_check.py -forward ./forward_info_0.json + ``` + + | 参数名称 | 说明 | 是否必选 | + | ------------------------------ | -------------------------------------------------- | -------- | + | -forward或--forward_input_file | 指定前向API信息文件forward_info_{pid}.json。 | 是 | + | -j或--jit_compile | 开启jit编译。 | 否 | + | -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0。 | 否 | + + 反向过程溢出的API暂不支持该功能。 + + +具体参数解释请参见“**Ascend模型精度预检工具”**。 + +# FAQ + +1. 预检工具在dump和run_ut的过程中,是否需要同时开启或关闭jit编译(jit_compile)? + + 答:是。 + +2. 预检工具对于type_as这类涉及数据类型转换操作的API,是否具有参考性? + + 由于这类API在CPU侧存在精度先提升后下降的操作,因此这类API的有效性的参考价值有限。 + +3. run ut过程中出现报错:ERROR:Got unsupported ScalarType BFloat16 + + 答:请使用最新版本的工具。 + +4. Dropout算子,CPU和NPU的随机应该不一样,为什么结果比对是一致的? + + 答:这个结果是正常的,工具对该算子有特殊处理,只判定位置为0的位置比例大约和设定p值相当。 + +5. 为什么浮点型数据bench和CPU的dtype不一致? + + 答:对于fp16的数据,CPU会上升一个精度fp32去计算,这是和算子那边对齐的精度结论,CPU用更高精度去计算会更接近真实值。 + +6. 
添加预检工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。 + + 答:注释工具目录api_accuracy_checker/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。 + +7. 添加预检工具后F.gelu触发ValueError报错:`activation_func must be F.gelu`等。 + + 答:注释工具目录api_accuracy_checker/hook_module/support_wrap_ops.yaml文件中functional:下的的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。 + +8. 添加预检工具后触发AsStrided算子相关的报错,或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。 + + 答:注释工具目录api_accuracy_checker/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。 + +9. Tensor 魔法函数具体对应什么操作? + + 答: + + | Tensor魔法函数 | 具体操作 | + | --------------- | ---------------- | + | `__add__` | + | + | `__and__` | & | + | `__bool__` | 返回Tensor布尔值 | + | `__div__` | / | + | `__eq__` | == | + | `__ge__` | >= | + | `__gt__` | > | + | `__iadd__` | += | + | `__iand__` | &= | + | `__idiv__` | /= | + | `__ifloordiv__` | //= | + | `__ilshift__` | <<= | + | `__imod__` | %= | + | `__imul__` | *= | + | `__ior__` | \|= | + | `__irshift__` | >>= | + | `__isub__` | -= | + | `__ixor__` | ^= | + | `__lshift__` | << | + | `__matmul__` | 矩阵乘法 | + | `__mod__` | % | + | `__mul__` | * | + | `__nonzero__` | 同`__bool__` | + | `__or__` | \| | + | `__radd__` | +(反向) | + | `__rmul__` | *(反向) | + | `__rshift__` | >> | + | `__sub__` | - | + | `__truediv__` | 同`__div__` | + | `__xor__` | ^ | + diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/.keep b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
import os
import yaml
from ..common.utils import check_file_or_directory_path
from ..hook_module.utils import WrapFunctionalOps, WrapTensorOps, WrapTorchOps
from ...common.file_check import FileOpen

# Union of every API name the hook module can wrap; used to validate white_list.
WrapApi = set(WrapFunctionalOps) | set(WrapTensorOps) | set(WrapTorchOps)


class Config:
    """Load and validate config.yaml for the api_accuracy_checker.

    Every key read from the yaml file is type- and range-checked by
    ``validate``; an invalid entry raises ValueError at load time instead of
    failing later during dump/run_ut.
    """

    def __init__(self, yaml_file):
        check_file_or_directory_path(yaml_file, False)
        with FileOpen(yaml_file, 'r') as file:
            config = yaml.safe_load(file)
        self.config = {key: self.validate(key, value) for key, value in config.items()}

    def validate(self, key, value):
        """Return ``value`` if it is a legal setting for ``key``, else raise ValueError."""
        validators = {
            'dump_path': str,
            'real_data': bool,
            'enable_dataloader': bool,
            'target_iter': list,
            'white_list': list,
            'error_data_path': str,
            'jit_compile': bool,
            'precision': int
        }
        if key not in validators:
            raise ValueError(f"{key} must be one of {validators.keys()}")
        if not isinstance(value, validators.get(key)):
            raise ValueError(f"{key} must be {validators[key].__name__} type")
        if key == 'target_iter':
            # bool is a subclass of int, so reject booleans explicitly
            # before accepting the remaining ints.
            if any(isinstance(i, bool) for i in value):
                raise ValueError("target_iter cannot contain boolean values")
            if not all(isinstance(i, int) for i in value):
                raise ValueError("All elements in target_iter must be of int type")
            if any(i < 0 for i in value):
                raise ValueError("All elements in target_iter must be greater than or equal to 0")
        if key == 'precision' and value < 0:
            # The check accepts 0, so the message must say ">= 0" (the old
            # message claimed "greater than 0" and contradicted the check).
            raise ValueError("precision must be greater than or equal to 0")
        if key == 'white_list':
            if not all(isinstance(i, str) for i in value):
                raise ValueError("All elements in white_list must be of str type")
            invalid_api = [i for i in value if i not in WrapApi]
            if invalid_api:
                raise ValueError(f"{', '.join(invalid_api)} is not in support_wrap_ops.yaml, please check the white_list")
        return value

    def __getattr__(self, item):
        return self.config[item]

    def __str__(self):
        return '\n'.join(f"{key}={value}" for key, value in self.config.items())

    def update_config(self, dump_path=None, real_data=None, target_iter=None, white_list=None, enable_dataloader=None):
        """Override selected settings at runtime; unset arguments keep their current value.

        Bugfix: the previous implementation tested truthiness (``x if x else
        old``), so explicitly passing falsy values such as ``real_data=False``,
        ``enable_dataloader=False`` or ``white_list=[]`` was silently replaced
        by the stored configuration. ``is not None`` honors them.
        """
        args = {
            "dump_path": dump_path if dump_path is not None else self.config.get("dump_path", './'),
            "real_data": real_data if real_data is not None else self.config.get("real_data", False),
            "target_iter": target_iter if target_iter is not None else self.config.get("target_iter", [1]),
            "white_list": white_list if white_list is not None else self.config.get("white_list", []),
            "enable_dataloader": enable_dataloader if enable_dataloader is not None
                                 else self.config.get("enable_dataloader", False)
        }
        for key, value in args.items():
            if key in self.config:
                self.config[key] = self.validate(key, value)
            else:
                raise ValueError(f"Invalid key '{key}'")


cur_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
yaml_path = os.path.join(cur_path, "config.yaml")
msCheckerConfig = Config(yaml_path)
b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/common/utils.py @@ -0,0 +1,653 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +import collections +import json +import os +import random +import re +import stat +import subprocess +import sys +import time +import csv +from datetime import datetime, timezone + +import numpy as np +import torch + +try: + import torch_npu +except ImportError: + IS_GPU = True +else: + IS_GPU = False + +from ...common.file_check import FileCheckConst, FileChecker, FileOpen +from ...common import file_check as file_check_util + +torch_without_guard_version_list = ['2.1'] +for version in torch_without_guard_version_list: + if torch.__version__.startswith(version): + torch_without_guard_version = True + break + else: + torch_without_guard_version = False +if not IS_GPU and not torch_without_guard_version: + from torch_npu.utils.device_guard import torch_device_guard as torch_npu_device_guard + + +class Const: + """ + Class for const + """ + SEP = '.' + DIRECTORY_LENGTH = 4096 + FILE_NAME_LENGTH = 255 + FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$' + MODEL_TYPE = ['.onnx', '.pb', '.om'] + SEMICOLON = ";" + COLON = ":" + EQUAL = "=" + COMMA = "," + DOT = "." 
+ DUMP_RATIO_MAX = 100 + SUMMERY_DATA_NUMS = 256 + ONE_HUNDRED_MB = 100 * 1024 * 1024 + FLOAT_EPSILON = np.finfo(float).eps + SUPPORT_DUMP_MODE = ['api', 'acl'] + ON = 'ON' + OFF = 'OFF' + BACKWARD = 'backward' + FORWARD = 'forward' + FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble, np.float32, np.float16] + BOOL_TYPE = [bool, np.uint8] + INT_TYPE = [np.int32, np.int64] + NPU = 'NPU' + + # dump mode + ALL = "all" + LIST = "list" + RANGE = "range" + STACK = "stack" + ACL = "acl" + API_LIST = "api_list" + API_STACK = "api_stack" + DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] + + WRITE_FLAGS = os.O_WRONLY | os.O_CREAT + WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR + + RAISE_PRECISION = { + torch.float16: torch.float32, + torch.bfloat16: torch.float32, + torch.float32: torch.float64 + } + CONVERT = { + "int32_to_int64": ["torch.int32", "torch.int64"], + } + + CONVERT_API = { + "int32_to_int64": ["cross_entropy"] + } + + +class CompareConst: + """ + Class for compare module const + """ + # compare result column name + NPU_NAME = "NPU Name" + BENCH_NAME = "Bench Name" + NPU_DTYPE = "NPU Tensor Dtype" + BENCH_DTYPE = "Bench Tensor Dtype" + NPU_SHAPE = "NPU Tensor Shape" + BENCH_SHAPE = "Bench Tensor Shape" + NPU_MAX = "NPU max" + NPU_MIN = "NPU min" + NPU_MEAN = "NPU mean" + BENCH_MAX = "Bench max" + BENCH_MIN = "Bench min" + BENCH_MEAN = "Bench mean" + COSINE = "Cosine" + MAX_ABS_ERR = "MaxAbsErr" + ACCURACY = "Accuracy Reached or Not" + STACK = "NPU_Stack_Info" + ERROR_MESSAGE = "Err_message" + + # compare result data + NAN = 'Nan' + SHAPE_UNMATCH = 'shape unmatched' + DTYPE_UNMATCH = 'dtype unmatched' + + # accuracy standards + COS_THRESHOLD = 0.99 + MAX_ABS_ERR_THRESHOLD = 0.001 + COS_MAX_THRESHOLD = 0.9 + MAX_ABS_ERR_MAX_THRESHOLD = 1 + ACCURACY_CHECK_YES = "Yes" + ACCURACY_CHECK_NO = "No" + ACCURACY_CHECK_UNMATCH = "Unmatched" + + # error message + NO_BENCH = "No bench data matched." 
class VersionCheck:
    """Helpers for checking the running torch version against a known prefix."""
    V1_8 = "1.8"
    V1_11 = "1.11"

    @staticmethod
    def check_torch_version(version):
        """Return True when torch.__version__ starts with ``version``.

        Prefix match on purpose: "1.11" also matches "1.11.0".
        """
        return torch.__version__.startswith(version)


class CompareException(Exception):
    """Accuracy-compare exception carrying a numeric error code."""
    NONE_ERROR = 0
    INVALID_PATH_ERROR = 1
    OPEN_FILE_ERROR = 2
    CLOSE_FILE_ERROR = 3
    READ_FILE_ERROR = 4
    WRITE_FILE_ERROR = 5
    INVALID_FILE_ERROR = 6
    PERMISSION_ERROR = 7
    INDEX_OUT_OF_BOUNDS_ERROR = 8
    NO_DUMP_FILE_ERROR = 9
    INVALID_DATA_ERROR = 10
    INVALID_PARAM_ERROR = 11
    INVALID_DUMP_RATIO = 12
    INVALID_DUMP_FILE = 13
    UNKNOWN_ERROR = 14
    INVALID_DUMP_MODE = 15
    PARSE_FILE_ERROR = 16
    INVALID_COMPARE_MODE = 17

    def __init__(self, code, error_info: str = ""):
        super(CompareException, self).__init__()
        self.code = code
        self.error_info = error_info

    def __str__(self):
        return self.error_info


class DumpException(CompareException):
    pass


def read_json(file):
    """Load and return the JSON object stored in ``file``."""
    with FileOpen(file, 'r') as f:
        obj = json.load(f)
    return obj


def write_csv(data, filepath):
    """Append the rows in ``data`` to ``filepath`` as CSV (BOM-prefixed UTF-8)."""
    with FileOpen(filepath, 'a', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerows(data)


def _print_log(level, msg, end='\n'):
    """Print ``msg`` prefixed with a timestamp, the process id and ``level``."""
    current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
    # Bugfix: os.getgid() returned the *group* id; the "(pid)" log prefix is
    # meant to show the process id.
    pid = os.getpid()
    print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg, end=end)
    sys.stdout.flush()


def print_info_log(info_msg, end='\n'):
    """
    Function Description:
        print info log.
    Parameter:
        info_msg: the info message.
    """
    _print_log("INFO", info_msg, end=end)


def print_error_log(error_msg):
    """
    Function Description:
        print error log.
    Parameter:
        error_msg: the error message.
    """
    _print_log("ERROR", error_msg)


def print_warn_log(warn_msg):
    """
    Function Description:
        print warn log.
    Parameter:
        warn_msg: the warning message.
    """
    _print_log("WARNING", warn_msg)
+ """ + _print_log("WARNING", warn_msg) + + +def check_mode_valid(mode): + if mode not in Const.DUMP_MODE: + msg = "Current mode '%s' is not supported. Please use the field in %s" % \ + (mode, Const.DUMP_MODE) + raise CompareException(CompareException.INVALID_DUMP_MODE, msg) + + +def check_object_type(check_object, allow_type): + """ + Function Description: + Check if the object belongs to a certain data type + Parameter: + check_object: the object to be checked + allow_type: legal data type + Exception Description: + when invalid data throw exception + """ + if not isinstance(check_object, allow_type): + print_error_log(f"{check_object} not of {allow_type} type") + raise CompareException(CompareException.INVALID_DATA_ERROR) + + +def check_file_or_directory_path(path, isdir=False): + """ + Function Description: + check whether the path is valid + Parameter: + path: the path to check + isdir: the path is dir or file + Exception Description: + when invalid data throw exception + """ + if isdir: + if not os.path.exists(path): + print_error_log('The path {} is not exist.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not os.path.isdir(path): + print_error_log('The path {} is not a directory.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not os.access(path, os.W_OK): + print_error_log( + 'The path {} does not have permission to write. Please check the path permission'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + else: + if not os.path.isfile(path): + print_error_log('{} is an invalid file or non-exist.'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not os.access(path, os.R_OK): + print_error_log( + 'The path {} does not have permission to read. 
Please check the path permission'.format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def _check_pkl(pkl_file_handle, file_name): + tensor_line = pkl_file_handle.readline() + if len(tensor_line) == 0: + print_error_log("dump file {} have empty line!".format(file_name)) + raise CompareException(CompareException.INVALID_DUMP_FILE) + pkl_file_handle.seek(0, 0) + + +def check_file_mode(npu_pkl, bench_pkl, stack_mode): + npu_pkl_name = os.path.split(npu_pkl)[-1] + bench_pkl_name = os.path.split(bench_pkl)[-1] + + if not npu_pkl_name.startswith("api_stack") and not bench_pkl_name.startswith("api_stack"): + if stack_mode: + print_error_log("The current file does not contain stack information, please turn off the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + elif npu_pkl_name.startswith("api_stack") and bench_pkl_name.startswith("api_stack"): + if not stack_mode: + print_error_log("The current file contains stack information, please turn on the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + else: + print_error_log("The dump mode of the two files is not same, please check the dump files") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + + +def check_file_size(input_file, max_size): + try: + file_size = os.path.getsize(input_file) + except OSError as os_error: + print_error_log('Failed to open "%s". %s' % (input_file, str(os_error))) + raise CompareException(CompareException.INVALID_FILE_ERROR) from os_error + if file_size > max_size: + print_error_log('The size (%d) of %s exceeds (%d) bytes, tools not support.' 
+ % (file_size, input_file, max_size)) + raise CompareException(CompareException.INVALID_FILE_ERROR) + + +def get_dump_data_path(dump_dir): + """ + Function Description: + traverse directories and obtain the absolute path of dump data + Parameter: + dump_dir: dump data directory + Return Value: + dump data path,file is exist or file is not exist + """ + dump_data_path = None + file_is_exist = False + + check_file_or_directory_path(dump_dir, True) + for dir_path, sub_paths, files in os.walk(dump_dir): + if len(files) != 0: + dump_data_path = dir_path + file_is_exist = True + break + dump_data_path = dir_path + return dump_data_path, file_is_exist + + +def modify_dump_path(dump_path, mode): + if mode == Const.ALL: + return dump_path + file_name = os.path.split(dump_path) + mode_file_name = mode + "_" + file_name[-1] + return os.path.join(file_name[0], mode_file_name) + + +def create_directory(dir_path): + """ + Function Description: + creating a directory with specified permissions in a thread-safe manner + Parameter: + dir_path: directory path + Exception Description: + when invalid data throw exception + """ + try: + os.makedirs(dir_path, mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + except OSError as ex: + print_error_log( + 'Failed to create {}. Please check the path permission or disk space. 
{}'.format(dir_path, str(ex))) + raise CompareException(CompareException.INVALID_PATH_ERROR) from ex + + +def execute_command(cmd): + """ + Function Description: + run the following command + Parameter: + cmd: command + Exception Description: + when invalid command throw exception + """ + print_info_log('Execute command:%s' % cmd) + process = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + while process.poll() is None: + line = process.stdout.readline() + line = line.strip() + if line: + print(line) + if process.returncode != 0: + print_error_log('Failed to execute command:%s' % " ".join(cmd)) + raise CompareException(CompareException.INVALID_DATA_ERROR) + + +def save_numpy_data(file_path, data): + """ + save_numpy_data + """ + if not os.path.exists(os.path.dirname(file_path)): + os.makedirs(os.path.dirname(file_path)) + np.save(file_path, data) + + +def parse_arg_value(values): + """ + parse dynamic arg value of atc cmdline + """ + value_list = [] + for item in values.split(Const.SEMICOLON): + value_list.append(parse_value_by_comma(item)) + return value_list + + +def parse_value_by_comma(value): + """ + parse value by comma, like '1,2,4,8' + """ + value_list = [] + value_str_list = value.split(Const.COMMA) + for value_str in value_str_list: + value_str = value_str.strip() + if value_str.isdigit() or value_str == '-1': + value_list.append(int(value_str)) + else: + print_error_log("please check your input shape.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return value_list + + +def get_data_len_by_shape(shape): + data_len = 1 + for item in shape: + if item == -1: + print_error_log("please check your input shape, one dim in shape is -1.") + return -1 + data_len = data_len * item + return data_len + + +def add_time_as_suffix(name): + return '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + + +def get_time(): + return 
def get_time():
    """Return the current UTC time formatted as YYYYmmdd_HHMMSS."""
    return datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")


def format_value(value):
    """Format a number with exactly six digits after the decimal point."""
    return '{:.6f}'.format(value)


def torch_device_guard(func):
    """Wrap ``func`` with torch_npu's device guard when running on NPU.

    On GPU, or on torch versions that no longer require the guard, ``func``
    is returned unchanged.
    """
    if IS_GPU or torch_without_guard_version:
        return func
    # Parse args/kwargs matched torch.device objects

    @torch_npu_device_guard
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)
    return wrapper


def seed_all(seed=1234, mode=False):
    """Seed every RNG used in a run (python, numpy, torch, cuda/npu).

    Args:
        seed: seed value applied to all generators.
        mode: forwarded to torch.use_deterministic_algorithms.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.use_deterministic_algorithms(mode)
    if IS_GPU:
        torch.cuda.manual_seed_all(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        # Bugfix: the real attribute is ``enabled``. Assigning the misspelled
        # ``enable`` only created a dead attribute and left cuDNN on.
        torch.backends.cudnn.enabled = False
        torch.backends.cudnn.benchmark = False
    else:
        torch_npu.npu.manual_seed_all(seed)
        torch_npu.npu.manual_seed(seed)


def get_process_rank(model):
    """Best-effort lookup of the rank (device index) hosting ``model``.

    Returns:
        (rank, found): rank defaults to 0 with found=False when the model has
        no parameters or lives on CPU.
    """
    print_info_log("Rank id is not provided. Trying to get the rank id of the model.")
    try:
        device = next(model.parameters()).device
    except StopIteration:
        print_warn_log('There is no parameter in the model. Fail to get rank id.')
        return 0, False
    if device.type == 'cpu':
        print_warn_log("Warning: the debugger is unable to get the rank id. "
                       "This may cause the dumpped data to be corrupted in the "
                       "case of distributed training. (You may ignore this if you are using only one card.) "
                       "Transfer the model to npu or gpu before register_hook() to avoid this warning.")
        return 0, False
    else:
        return device.index, True


def get_json_contents(file_path):
    """Read ``file_path`` and return its JSON content as a dict.

    Raises:
        CompareException: when the file is not valid JSON or the top-level
        object is not a dictionary.
    """
    ops = get_file_content_bytes(file_path)
    try:
        json_obj = json.loads(ops)
    except ValueError as error:
        print_error_log('Failed to load "%s". %s' % (file_path, str(error)))
        raise CompareException(CompareException.INVALID_FILE_ERROR) from error
    if not isinstance(json_obj, dict):
        print_error_log('Json file %s, content is not a dictionary!' % file_path)
        raise CompareException(CompareException.INVALID_FILE_ERROR)
    return json_obj
% file_path) + raise CompareException(CompareException.INVALID_FILE_ERROR) + return json_obj + + +def get_file_content_bytes(file): + with FileOpen(file, 'rb') as file_handle: + return file_handle.read() + + +def islink(path): + path = os.path.abspath(path) + return os.path.islink(path) + + +class SoftlinkCheckException(Exception): + pass + + +MAX_JSON_FILE_SIZE = 10 * 1024 ** 2 +LINUX_FILE_NAME_LENGTH_LIMIT = 200 + + +def check_path_length_valid(path): + path = os.path.realpath(path) + return len(os.path.basename(path)) <= LINUX_FILE_NAME_LENGTH_LIMIT + + +def check_path_pattern_valid(path): + pattern = re.compile(r'(\.|/|:|_|-|\s|[~0-9a-zA-Z])+') + if not pattern.fullmatch(path): + raise ValueError('Only the following characters are allowed in the path: A-Z a-z 0-9 - _ . / :') + + +def check_input_file_valid(input_path, max_file_size=MAX_JSON_FILE_SIZE): + if islink(input_path): + raise SoftlinkCheckException("Input path doesn't support soft link.") + + input_path = os.path.realpath(input_path) + if not os.path.exists(input_path): + raise ValueError('Input file %s does not exist!' % input_path) + + if not os.access(input_path, os.R_OK): + raise PermissionError('Input file %s is not readable!' % input_path) + + if not check_path_length_valid(input_path): + raise ValueError("The real path or file_name of input is too long.") + + check_path_pattern_valid(input_path) + + if os.path.getsize(input_path) > max_file_size: + raise ValueError(f'The file is too large, exceeds {max_file_size // 1024 ** 2}MB') + + +def check_need_convert(api_name): + convert_type = None + for key, value in Const.CONVERT_API.items(): + if api_name not in value: + continue + else: + convert_type = key + return convert_type + + +def api_info_preprocess(api_name, api_info_dict): + """ + Function Description: + Preprocesses the API information. + Parameter: + api_name: Name of the API. + api_info_dict: argument of the API. + Return api_info_dict: + convert_type: Type of conversion. 
+ api_info_dict: Processed argument of the API. + """ + convert_type = check_need_convert(api_name) + if api_name == 'cross_entropy': + api_info_dict = cross_entropy_process(api_info_dict) + return convert_type, api_info_dict + + +def cross_entropy_process(api_info_dict): + """ + Function Description: + Preprocesses the cross_entropy API information. + Parameter: + api_info_dict: argument of the API. + Return api_info_dict: + api_info_dict: Processed argument of the API. + """ + if 'args' in api_info_dict and len(api_info_dict['args']) > 1 and 'Min' in api_info_dict['args'][1]: + if api_info_dict['args'][1]['Min'] <= 0: + # The second argument in cross_entropy should be -100 or not less than 0 + api_info_dict['args'][1]['Min'] = 0 + return api_info_dict + + +def initialize_save_path(save_path, dir_name): + data_path = os.path.join(save_path, dir_name) + if os.path.exists(data_path): + print_warn_log(f"{data_path} already exists, it will be overwritten") + else: + os.mkdir(data_path, mode=FileCheckConst.DATA_DIR_AUTHORITY) + data_path_checker = FileChecker(data_path, FileCheckConst.DIR) + data_path_checker.common_check() + + +def write_pt(file_path, tensor): + if os.path.exists(file_path): + raise ValueError(f"File {file_path} already exists") + torch.save(tensor, file_path) + full_path = os.path.realpath(file_path) + file_check_util.change_mode(full_path, FileCheckConst.DATA_FILE_AUTHORITY) + return full_path + + +def get_real_data_path(file_path): + targets = ['forward_real_data', 'backward_real_data', 'ut_error_data\d+'] + pattern = re.compile(r'({})'.format('|'.join(targets))) + match = pattern.search(file_path) + if match: + target_index = match.start() + target_path = file_path[target_index:] + return target_path + else: + raise DumpException(DumpException.INVALID_PATH_ERROR) + + +def get_full_data_path(data_path, real_data_path): + if not data_path: + return data_path + full_data_path = os.path.join(real_data_path, data_path) + return 
# Comparison algorithms and pass criteria used by the API accuracy checker.
# NOTE: CompareConst, np (numpy) and torch are provided by the module's
# top-of-file imports (from .compare_utils import CompareConst).


def cosine_sim(bench_output, device_output):
    """Cosine similarity between the flattened bench and device outputs.

    Returns:
        (cos, pass_flag, msg): ``cos`` is the similarity (or a placeholder
        for degenerate inputs), ``pass_flag`` is True when cos > 0.99, and
        ``msg`` explains any short-circuit.
    """
    msg = ""
    n_value = device_output.reshape(-1)
    b_value = bench_output.reshape(-1)
    cos = CompareConst.SPACE
    # NOTE(review): this changes numpy error handling process-wide, not just
    # for this call — confirm that is acceptable to other modules.
    np.seterr(divide="ignore", invalid="ignore")
    if n_value.shape != b_value.shape:
        msg = f"Shape of device and bench outputs don't match. device: {n_value.shape}, bench: {b_value.shape}."
        return -1, False, msg
    if len(n_value) == 1:
        msg = "All the data in device dump data is scalar. Please refer to other compare algorithms."
        return cos, True, msg
    n_value_max = np.max(np.abs(n_value))
    b_value_max = np.max(np.abs(b_value))
    if n_value_max <= np.finfo(float).eps and b_value_max <= np.finfo(float).eps:
        msg = "All the data in device and bench outputs are zero."
        return cos, True, msg
    elif n_value_max <= np.finfo(float).eps:
        msg = "All the data is zero in device dump data."
        return CompareConst.SPACE, False, msg
    elif b_value_max <= np.finfo(float).eps:
        msg = "All the data is zero in bench dump data."
        return CompareConst.SPACE, False, msg
    else:
        # Normalize by the max magnitude before the dot product to avoid overflow.
        n_value = n_value.astype(float) / n_value_max
        b_value = b_value.astype(float) / b_value_max
        cos = np.dot(n_value, b_value) / (np.linalg.norm(n_value) * np.linalg.norm(b_value))
        if np.isnan(cos):
            msg = "Dump data has NaN when comparing with Cosine Similarity."
        cos = np.clip(cos, -1, 1)
        return cos, cos > 0.99, msg


def get_rmse(abs_err, inf_nan_mask):
    """Root-mean-square error over positions that are not inf/nan.

    The mean is rescaled by the valid-element ratio; the 0.0001 terms guard
    against division by zero when every element is masked.
    """
    masked_ae = np.where(inf_nan_mask, 0, abs_err)
    mse = np.mean(np.square(masked_ae))
    inf_nan_cnt = np.sum(inf_nan_mask)
    mse = mse * (abs_err.size / (abs_err.size - inf_nan_cnt + 0.0001) + 0.0001)
    rmse = np.sqrt(mse)
    return rmse


def get_error_balance(bench_data, device_data):
    """Error balance: |#(device > bench) - #(device < bench)| / total elements."""
    larger_count = np.sum(np.greater(device_data - bench_data.astype(device_data.dtype), 0))
    smaller_count = np.sum(np.less(device_data - bench_data.astype(device_data.dtype), 0))
    total_count = bench_data.size
    error_balance = abs(larger_count - smaller_count) / total_count if total_count > 0 else 0
    return error_balance


def get_small_value_err_ratio(small_value_mask, abs_err_greater_mask):
    """Fraction of small-value positions whose absolute error exceeds the bound."""
    err_mask = np.logical_and(small_value_mask, abs_err_greater_mask)
    small_value_err_num = np.sum(err_mask)
    small_value_num = np.sum(small_value_mask)
    return 0 if small_value_num == 0 else small_value_err_num / small_value_num


def get_rel_err(abs_err, abs_bench_with_eps, small_value_mask, inf_nan_mask):
    """Relative error; small-value and inf/nan positions are marked with -1."""
    rel_err_tmp = abs_err / abs_bench_with_eps
    rel_err_mask = np.logical_or(small_value_mask, inf_nan_mask)
    rel_err = np.where(rel_err_mask, -1, rel_err_tmp)
    return rel_err


def get_abs_err(bench_data, device_data):
    """Element-wise absolute error |device - bench|."""
    abs_err = np.abs(device_data - bench_data)
    return abs_err


def get_rel_err_origin(abs_err, b_value):
    """Raw relative error |abs_err / bench| without any masking."""
    rel_err_origin = np.abs(abs_err / b_value)
    return rel_err_origin


def get_max_abs_err(abs_err):
    """Maximum absolute error and whether it is below the 0.001 pass bound."""
    max_abs_err = abs_err.max()
    bool_result = max_abs_err < 0.001
    return max_abs_err, bool_result


def get_max_rel_err(rel_err):
    """Maximum relative error; masked (-1) entries never dominate, floor at 0."""
    return np.max(rel_err) if np.max(rel_err) >= 0 else 0


def get_mean_rel_err(rel_err):
    """Mean relative error over non-masked (>= 0) entries; 0 when all masked."""
    non_negative_rel_err = rel_err[rel_err >= 0]
    return np.mean(non_negative_rel_err) if non_negative_rel_err.size > 0 else 0


def get_rel_err_ratio(rel_err, thresholding):
    """Fraction of entries below ``thresholding`` and whether it passes.

    The pass bound is ``1 - thresholding``; an empty input counts as a full pass.
    """
    if np.size(rel_err) == 0:
        ratio = 1
    else:
        ratio = np.divide(np.sum(rel_err < thresholding), np.size(rel_err))
    bool_result = ratio > (1 - thresholding)
    return ratio, bool_result


def get_finite_and_infinite_mask(bench_output, device_output):
    """Masks of positions finite in both outputs, and their complement (inf/nan)."""
    device_finite_mask = np.isfinite(device_output)
    bench_finite_mask = np.isfinite(bench_output.astype(device_output.dtype))
    both_finite_mask = np.logical_and(device_finite_mask, bench_finite_mask)
    inf_nan_mask = np.logical_not(both_finite_mask)
    return both_finite_mask, inf_nan_mask


def get_small_value_mask(abs_bench, both_finite_mask, small_value_threshold):
    """Mask of finite positions whose bench magnitude is <= the small-value bound."""
    small_value_mask = np.less_equal(abs_bench, small_value_threshold)
    small_value_mask = np.logical_and(small_value_mask, both_finite_mask)
    return small_value_mask


def get_abs_bench_with_eps(bench, dtype):
    """Return (|bench|, |bench| + eps), with bfloat16 eps taken from CompareConst."""
    abs_bench = np.abs(bench)
    eps = np.finfo(bench.dtype).eps if dtype != torch.bfloat16 else CompareConst.BFLOAT16_EPS
    abs_bench_with_eps = abs_bench + eps
    return abs_bench, abs_bench_with_eps


def check_inf_nan_value(inf_nan_mask, bench_output, device_output, dtype, rtol):
    """Absolute-threshold standard: check inf/nan agreement of npu vs golden.

    Args:
        inf_nan_mask: mask of positions where either output is inf/nan.
        bench_output: golden output.
        device_output: npu output.
        dtype: torch dtype of the npu output.
        rtol: relative tolerance applied to the clipped outputs.

    Returns:
        Ratio of masked positions whose inf/nan behaviour does not match
        (0 when the mask is empty).
    """
    # Only the eps-padded magnitude is needed here; the plain |bench| is unused.
    _, abs_gpu_with_eps = get_abs_bench_with_eps(bench_output, dtype)
    golden_same_dtype = bench_output.astype(device_output.dtype)
    a_min = np.finfo(device_output.dtype).min if dtype != torch.bfloat16 else CompareConst.BFLOAT16_MIN
    a_max = np.finfo(device_output.dtype).max if dtype != torch.bfloat16 else CompareConst.BFLOAT16_MAX
    # Clip both sides to the representable range so +-inf becomes comparable.
    golden_clip = np.clip(golden_same_dtype, a_min, a_max)
    npu_clip = np.clip(device_output, a_min, a_max)
    clipped_abs_ae = np.abs(npu_clip - golden_clip)
    clipped_re = clipped_abs_ae / abs_gpu_with_eps
    pass_mask = np.less_equal(clipped_re, rtol)
    both_nan_mask = np.logical_and(np.isnan(device_output), np.isnan(golden_clip))
    pass_mask = np.logical_or(pass_mask, both_nan_mask)
    not_pass_mask = np.logical_not(pass_mask)
    not_pass_mask = np.logical_and(not_pass_mask, inf_nan_mask)

    inf_nan_err_cnt = np.sum(not_pass_mask)
    return 0 if np.sum(inf_nan_mask) == 0 else inf_nan_err_cnt / np.sum(inf_nan_mask)


def check_small_value(abs_err, small_value_mask, small_value_atol):
    """Absolute-threshold standard: error ratio over the small-value domain.

    Args:
        abs_err: absolute error between npu and golden outputs.
        small_value_mask: mask of small-value positions.
        small_value_atol: absolute error tolerance for small values.

    Returns:
        Ratio of small-value positions whose absolute error exceeds the
        tolerance (0 when the mask is empty).
    """
    greater_mask = np.greater(abs_err, small_value_atol)
    err_mask = np.logical_and(greater_mask, small_value_mask)
    err_cnt = np.sum(err_mask)
    return 0 if np.sum(small_value_mask) == 0 else err_cnt / np.sum(small_value_mask)


def check_norm_value(normal_value_mask, rel_err, rtol):
    """Absolute-threshold standard: error ratio over the normal-value domain.

    Args:
        normal_value_mask: mask of normal-value positions.
        rel_err: relative error between npu and golden outputs.
        rtol: relative error tolerance.

    Returns:
        Ratio of normal-value positions whose relative error exceeds the
        tolerance (0 when the mask is empty).
    """
    err_mask = np.greater(rel_err, rtol)
    err_mask = np.logical_and(err_mask, normal_value_mask)
    err_cnt = np.sum(err_mask)
    return 0 if np.sum(normal_value_mask) == 0 else err_cnt / np.sum(normal_value_mask)
import argparse
import os
import sys
import math
from collections import namedtuple
import pandas as pd

from ..common.utils import print_info_log, print_warn_log, print_error_log, write_csv, \
    CompareException, create_directory
from ..common.config import msCheckerConfig
from ..compare.compare_utils import CompareConst, API_PRECISION_COMPARE_RESULT_FILE_NAME, \
    API_PRECISION_COMPARE_DETAILS_FILE_NAME, BENCHMARK_COMPARE_SUPPORT_LIST, API_PRECISION_COMPARE_UNSUPPORT_LIST, \
    ApiPrecisionCompareColumn, AbsoluteStandardApi, BinaryStandardApi, BINARY_COMPARE_UNSUPPORT_LIST, \
    convert_str_to_float, CompareMessage
from ..compare.compare_column import ApiPrecisionOutputColumn
from ..run_ut.run_ut import get_validated_result_csv_path
from ...common.file_check import FileCheckConst, FileChecker, change_mode, check_path_before_create

# Input/output paths for one compare run.
CompareConfig = namedtuple('CompareConfig', ['npu_csv_path', 'gpu_csv_path', 'result_csv_path', 'details_csv_path'])
unsupported_message = 'This data type does not support benchmark compare.'

# NPU/GPU metric-ratio thresholds for the benchmark compare standard:
# ratio > error_threshold -> ERROR, ratio > warning_threshold -> WARNING.
benchmark_algorithms_thresholds = {
    'small_value': {
        'error_threshold': 2,
        'warning_threshold': 1
    },
    'rmse': {
        'error_threshold': 2,
        'warning_threshold': 1
    },
    'max_rel_err': {
        'error_threshold': 10,
        'warning_threshold': 1
    },
    'mean_rel_err': {
        'error_threshold': 2,
        'warning_threshold': 1
    },
    'eb': {
        'error_threshold': 2,
        'warning_threshold': 1
    }
}

# Per-metric messages appended to the detail csv when a threshold is exceeded.
benchmark_message = {
    "small_value_err_status": {
        CompareConst.ERROR: "ERROR: 小值域错误比值超过阈值\n",
        CompareConst.WARNING: "WARNING: 小值域错误比值超过阈值\n"
    },
    "rmse_status": {
        CompareConst.ERROR: "ERROR: 均方根误差比值超过阈值\n",
        CompareConst.WARNING: "WARNING: 均方根误差比值超过阈值\n"
    },
    "max_rel_err_status": {
        CompareConst.ERROR: "ERROR: 相对误差最大值比值超过阈值\n",
        CompareConst.WARNING: "WARNING: 相对误差最大值比值超过阈值\n"
    },
    "mean_rel_err_status": {
        CompareConst.ERROR: "ERROR: 相对误差平均值比值超过阈值\n",
        CompareConst.WARNING: "WARNING: 相对误差平均值比值超过阈值\n"
    }
}


class BenchmarkStandard:
    """Benchmark compare standard: rates one API by the ratio of NPU metrics
    to GPU metrics (small-value error rate, RMSE, max/mean relative error, EB)."""

    def __init__(self, api_name, npu_precision, gpu_precision):
        self.api_name = api_name
        self.npu_precision = npu_precision
        self.gpu_precision = gpu_precision
        self.small_value_err_ratio = 1
        self.rmse_ratio = 1
        self.max_rel_err_ratio = 1
        self.mean_rel_err_ratio = 1
        self.eb_ratio = 1
        self.small_value_err_status = CompareConst.PASS
        self.rmse_status = CompareConst.PASS
        self.max_rel_err_status = CompareConst.PASS
        self.mean_rel_err_status = CompareConst.PASS
        self.eb_status = CompareConst.PASS
        self.check_result_list = []
        self.final_result = CompareConst.PASS

    def __str__(self):
        return "%s" % (self.api_name)

    def get_result(self):
        """Compute all metric ratios, rate each one, and derive the final result."""
        self._compare_ratio()
        self.small_value_err_status = self._get_status(self.small_value_err_ratio, 'small_value')
        self.check_result_list.append(self.small_value_err_status)
        self.rmse_status = self._get_status(self.rmse_ratio, 'rmse')
        self.check_result_list.append(self.rmse_status)
        self.max_rel_err_status = self._get_status(self.max_rel_err_ratio, 'max_rel_err')
        self.check_result_list.append(self.max_rel_err_status)
        self.mean_rel_err_status = self._get_status(self.mean_rel_err_ratio, 'mean_rel_err')
        self.check_result_list.append(self.mean_rel_err_status)
        # NOTE(review): eb_status is computed but not added to check_result_list,
        # so EB alone never changes final_result — presumably intentional; confirm.
        self.eb_status = self._get_status(self.eb_ratio, 'eb')
        if CompareConst.ERROR in self.check_result_list:
            self.final_result = CompareConst.ERROR
        elif CompareConst.WARNING in self.check_result_list:
            self.final_result = CompareConst.WARNING

    def _compare_ratio(self):
        # 10000.0 is the sentinel ratio used when the GPU metric is 0 but the
        # NPU metric is not, guaranteeing an ERROR rating.
        self.small_value_err_ratio = self._calc_ratio(
            self.npu_precision.get(ApiPrecisionCompareColumn.SMALL_VALUE_ERROR_RATE),
            self.gpu_precision.get(ApiPrecisionCompareColumn.SMALL_VALUE_ERROR_RATE), 10000.0)
        self.rmse_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.RMSE),
                                           self.gpu_precision.get(ApiPrecisionCompareColumn.RMSE), 10000.0)
        self.max_rel_err_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.MAX_REL_ERR),
                                                  self.gpu_precision.get(ApiPrecisionCompareColumn.MAX_REL_ERR),
                                                  10000.0)
        self.mean_rel_err_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.MEAN_REL_ERR),
                                                   self.gpu_precision.get(ApiPrecisionCompareColumn.MEAN_REL_ERR),
                                                   10000.0)
        self.eb_ratio = self._calc_ratio(self.npu_precision.get(ApiPrecisionCompareColumn.EB),
                                         self.gpu_precision.get(ApiPrecisionCompareColumn.EB), 10000.0)

    def to_column_value(self):
        """Values in the order expected by the detail csv columns."""
        return [self.small_value_err_ratio, self.small_value_err_status, self.rmse_ratio,
                self.rmse_status, self.max_rel_err_ratio, self.max_rel_err_status, self.mean_rel_err_ratio,
                self.mean_rel_err_status, self.eb_ratio, self.eb_status]

    @staticmethod
    def _get_status(ratio, algorithm):
        """Rate one metric ratio against its configured thresholds."""
        error_threshold = benchmark_algorithms_thresholds.get(algorithm).get('error_threshold')
        warning_threshold = benchmark_algorithms_thresholds.get(algorithm).get('warning_threshold')
        if ratio > error_threshold:
            return CompareConst.ERROR
        elif ratio > warning_threshold:
            return CompareConst.WARNING
        return CompareConst.PASS

    @staticmethod
    def _calc_ratio(x, y, default_value=1.0):
        """abs(x / y); when y == 0 return 1.0 if x is also 0, else default_value."""
        x, y = convert_str_to_float(x), convert_str_to_float(y)
        if math.isclose(y, 0.0):
            return 1.0 if math.isclose(x, 0.0) else default_value
        else:
            return abs(x / y)


def write_detail_csv(content, save_path):
    """Append one formatted row (floats rounded to the configured precision)."""
    rows = []
    content = ["{:.{}f}".format(item, msCheckerConfig.precision) \
                   if isinstance(item, float) else item for item in content]
    rows.append(content)
    write_csv(rows, save_path)


def api_precision_compare(config):
    """Run the NPU-vs-GPU API precision compare task described by ``config``."""
    print_info_log("Start compare task")
    print_info_log(f"Compare task result will be saved in {config.result_csv_path}")
    print_info_log(f"Compare task detail will be saved in {config.details_csv_path}")
    try:
        npu_data = pd.read_csv(config.npu_csv_path)
    except Exception as err:
        print_error_log("Open npu csv Error: %s" % str(err))
        # Bug fix: execution previously continued with ``npu_data`` unbound
        # and crashed later with a NameError instead of a clear error.
        raise CompareException(CompareException.INVALID_FILE_ERROR) from err
    check_csv_columns(npu_data.columns, "npu_csv")
    try:
        gpu_data = pd.read_csv(config.gpu_csv_path)
    except Exception as err:
        print_error_log("Open gpu csv Error: %s" % str(err))
        raise CompareException(CompareException.INVALID_FILE_ERROR) from err
    check_csv_columns(gpu_data.columns, "gpu_csv")
    detail_csv_title = [ApiPrecisionCompareColumn.get_detail_csv_title()]
    result_csv_title = [ApiPrecisionCompareColumn.get_result_csv_title()]
    write_csv(result_csv_title, config.result_csv_path)
    write_csv(detail_csv_title, config.details_csv_path)
    try:
        analyse_csv(npu_data, gpu_data, config)
    except Exception as err:
        # Best-effort: still set the output file modes below even on failure.
        print_error_log("Analyse csv Error: %s" % str(err))
    change_mode(config.result_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)
    change_mode(config.details_csv_path, FileCheckConst.DATA_FILE_AUTHORITY)


def analyse_csv(npu_data, gpu_data, config):
    """Walk the NPU rows, rate each against the GPU row with the matching
    API name, write per-row details, and aggregate one result row per API."""
    forward_status, backward_status = [], []
    last_api_name, last_api_dtype = None, None
    for _, row_npu in npu_data.iterrows():
        message = ''
        compare_column = ApiPrecisionOutputColumn()
        full_api_name_with_direction_status = row_npu[ApiPrecisionCompareColumn.API_NAME]
        row_gpu = gpu_data[gpu_data[ApiPrecisionCompareColumn.API_NAME] == full_api_name_with_direction_status]
        _, api_name, _, direction_status, _, _ = full_api_name_with_direction_status.split(".")
        if row_gpu.empty:
            print_warn_log(f'This API : {full_api_name_with_direction_status} does not exist in the GPU data.')
            continue
        if len(row_gpu) > 1:
            msg = f'This API : {full_api_name_with_direction_status} has multiple records in the GPU data.'
            raise CompareException(CompareException.INVALID_DATA_ERROR, msg)
        row_gpu = row_gpu.iloc[0]
        # Skip rows whose output is empty (e.g. requires_grad=False in backward).
        if row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE].isspace():
            continue
        new_status = CompareConst.SPACE
        compare_column.api_name = full_api_name_with_direction_status
        # Pick the compare standard: binary consistency, absolute threshold,
        # or benchmark compare, in that priority order.
        if row_npu[
            ApiPrecisionCompareColumn.DEVICE_DTYPE] not in BINARY_COMPARE_UNSUPPORT_LIST or api_name in BinaryStandardApi:
            new_status = record_binary_consistency_result(api_name, compare_column, row_npu)
        elif api_name in AbsoluteStandardApi:
            new_status = record_absolute_threshold_result(compare_column, row_npu)
        elif row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE] in BENCHMARK_COMPARE_SUPPORT_LIST:
            bs = BenchmarkStandard(full_api_name_with_direction_status, row_npu, row_gpu)
            new_status = record_benchmark_compare_result(compare_column, bs)
        write_detail_csv(compare_column.to_column_value(), config.details_csv_path)

        # A new api_name means the previous API's rows are complete: flush its
        # aggregated forward/backward result.
        if last_api_name is not None and api_name != last_api_name:
            if last_api_dtype in API_PRECISION_COMPARE_UNSUPPORT_LIST:
                message = unsupported_message
                write_csv([[last_api_name, "skip", "skip", message]], config.result_csv_path)
                forward_status, backward_status = [], []
                message = ''
            else:
                forward_result = get_api_checker_result(forward_status)
                backward_result = get_api_checker_result(backward_status)
                message += CompareMessage.get(last_api_name, "") if forward_result == CompareConst.ERROR else ""
                write_csv([[last_api_name, forward_result, backward_result, message]], config.result_csv_path)
                forward_status, backward_status = [], []
                message = ''

        is_supported = row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE] not in API_PRECISION_COMPARE_UNSUPPORT_LIST
        last_api_name = api_name

        last_api_dtype = row_npu[ApiPrecisionCompareColumn.DEVICE_DTYPE]
        if not is_supported:
            continue

        if direction_status == 'forward':
            forward_status.append(new_status)
        elif direction_status == 'backward':
            backward_status.append(new_status)
        else:
            print_error_log(f"Invalid direction status: {direction_status}")

    # Flush the final API after the loop ends.
    if last_api_name is not None:
        if last_api_dtype in API_PRECISION_COMPARE_UNSUPPORT_LIST:
            message = unsupported_message
            write_csv([[last_api_name, "skip", "skip", message]], config.result_csv_path)
        else:
            forward_result = get_api_checker_result(forward_status)
            backward_result = get_api_checker_result(backward_status)
            message += CompareMessage.get(last_api_name, "") if forward_result == CompareConst.ERROR else ""
            write_csv([[last_api_name, forward_result, backward_result, message]], config.result_csv_path)


def check_error_rate(npu_error_rate):
    """Binary consistency standard: pass only when the error rate is exactly 0."""
    return CompareConst.PASS if convert_str_to_float(npu_error_rate) == 0 else CompareConst.ERROR


def get_absolute_threshold_result(row_npu):
    """Absolute threshold standard: each of the three ratios must be 0 to pass."""
    inf_nan_error_ratio = convert_str_to_float(row_npu[ApiPrecisionCompareColumn.INF_NAN_ERROR_RATIO])
    rel_err_ratio = convert_str_to_float(row_npu[ApiPrecisionCompareColumn.REL_ERR_RATIO])
    abs_err_ratio = convert_str_to_float(row_npu[ApiPrecisionCompareColumn.ABS_ERR_RATIO])

    inf_nan_result = CompareConst.PASS if inf_nan_error_ratio == 0 else CompareConst.ERROR
    rel_err_result = CompareConst.PASS if rel_err_ratio == 0 else CompareConst.ERROR
    abs_err_result = CompareConst.PASS if abs_err_ratio == 0 else CompareConst.ERROR

    if CompareConst.ERROR in [inf_nan_result, rel_err_result, abs_err_result]:
        absolute_threshold_result = CompareConst.ERROR
    else:
        absolute_threshold_result = CompareConst.PASS

    return {
        "inf_nan_error_ratio": inf_nan_error_ratio,
        "inf_nan_result": inf_nan_result,
        "rel_err_ratio": rel_err_ratio,
        "rel_err_result": rel_err_result,
        "abs_err_ratio": abs_err_ratio,
        "abs_err_result": abs_err_result,
        "absolute_threshold_result": absolute_threshold_result,
    }


def get_api_checker_result(status):
    """Aggregate per-row statuses: worst of ERROR > WARNING > PASS; SPACE when empty."""
    if not status:
        return CompareConst.SPACE
    for const in (CompareConst.ERROR, CompareConst.WARNING):
        if const in status:
            return const
    return CompareConst.PASS


def check_csv_columns(columns, csv_type):
    """Raise CompareException when required columns are missing from the csv."""
    required_columns = ApiPrecisionCompareColumn.to_required_columns()
    missing_columns = [column for column in required_columns if column not in columns]
    if missing_columns:
        # Fixed message: "followint" typo and missing space before the csv type.
        msg = f"The following columns {','.join(missing_columns)} are missing in {csv_type}"
        raise CompareException(CompareException.INVALID_DATA_ERROR, msg)


def record_binary_consistency_result(api_name, compare_column, row_npu):
    """Fill ``compare_column`` with the binary-consistency rating for this row."""
    new_status = check_error_rate(row_npu[ApiPrecisionCompareColumn.ERROR_RATE])
    compare_column.error_rate = row_npu[ApiPrecisionCompareColumn.ERROR_RATE]
    compare_column.error_rate_status = new_status
    compare_column.compare_result = new_status
    compare_column.compare_algorithm = "二进制一致法"
    message = ''
    if compare_column.error_rate_status == CompareConst.ERROR:
        message += "ERROR: 二进制一致错误率超过阈值\n"
        message += CompareMessage.get(api_name, "")
    compare_column.compare_message = message
    return new_status


def record_absolute_threshold_result(compare_column, row_npu):
    """Fill ``compare_column`` with the absolute-threshold rating for this row."""
    absolute_threshold_result = get_absolute_threshold_result(row_npu)
    compare_column.inf_nan_error_ratio = absolute_threshold_result.get("inf_nan_error_ratio")
    compare_column.inf_nan_error_ratio_status = absolute_threshold_result.get("inf_nan_result")
    compare_column.rel_err_ratio = absolute_threshold_result.get("rel_err_ratio")
    compare_column.rel_err_ratio_status = absolute_threshold_result.get("rel_err_result")
    compare_column.abs_err_ratio = absolute_threshold_result.get("abs_err_ratio")
    compare_column.abs_err_ratio_status = absolute_threshold_result.get("abs_err_result")
    compare_column.compare_result = absolute_threshold_result.get("absolute_threshold_result")
    compare_column.compare_algorithm = "绝对阈值法"
    message = ''
    if compare_column.inf_nan_error_ratio_status == CompareConst.ERROR:
        message += "ERROR: inf/nan错误率超过阈值\n"
    if compare_column.rel_err_ratio_status == CompareConst.ERROR:
        message += "ERROR: 相对误差错误率超过阈值\n"
    if compare_column.abs_err_ratio_status == CompareConst.ERROR:
        message += "ERROR: 绝对误差错误率超过阈值\n"
    compare_column.compare_message = message
    return compare_column.compare_result


def record_benchmark_compare_result(compare_column, bs):
    """Fill ``compare_column`` from a computed BenchmarkStandard instance."""
    bs.get_result()
    compare_column.small_value_err_ratio = bs.small_value_err_ratio
    compare_column.small_value_err_status = bs.small_value_err_status
    compare_column.rmse_ratio = bs.rmse_ratio
    compare_column.rmse_status = bs.rmse_status
    compare_column.max_rel_err_ratio = bs.max_rel_err_ratio
    compare_column.max_rel_err_status = bs.max_rel_err_status
    compare_column.mean_rel_err_ratio = bs.mean_rel_err_ratio
    compare_column.mean_rel_err_status = bs.mean_rel_err_status
    compare_column.eb_ratio = bs.eb_ratio
    compare_column.eb_status = bs.eb_status
    compare_column.compare_result = bs.final_result
    compare_column.compare_algorithm = "标杆比对法"
    message = ''
    for status_attr, messages in benchmark_message.items():
        status_value = getattr(compare_column, status_attr)
        if status_value in messages:
            message += messages[status_value]
    compare_column.compare_message = message
    return compare_column.compare_result


def _api_precision_compare(parser=None):
    """CLI entry point: parse arguments and run the compare command."""
    if not parser:
        parser = argparse.ArgumentParser()
    _api_precision_compare_parser(parser)
    args = parser.parse_args(sys.argv[1:])
    _api_precision_compare_command(args)


def _api_precision_compare_command(args):
    """Validate the csv/output paths and launch the compare task."""
    npu_csv_path = get_validated_result_csv_path(args.npu_csv_path, 'detail')
    gpu_csv_path = get_validated_result_csv_path(args.gpu_csv_path, 'detail')
    out_path = os.path.realpath(args.out_path) if args.out_path else "./"
    check_path_before_create(out_path)
    create_directory(out_path)
    out_path_checker = FileChecker(out_path, FileCheckConst.DIR, ability=FileCheckConst.WRITE_ABLE)
    out_path = out_path_checker.common_check()
    result_csv_path = os.path.join(out_path, API_PRECISION_COMPARE_RESULT_FILE_NAME)
    details_csv_path = os.path.join(out_path, API_PRECISION_COMPARE_DETAILS_FILE_NAME)
    compare_config = CompareConfig(npu_csv_path, gpu_csv_path, result_csv_path, details_csv_path)
    api_precision_compare(compare_config)


def _api_precision_compare_parser(parser):
    parser.add_argument("-npu", "--npu_csv_path", dest="npu_csv_path", default="", type=str,
                        help=" Accuracy_checking_details.csv generated on the NPU by using the "
                             "api_accuracy_checker tool.",
                        required=True)
    parser.add_argument("-gpu", "--gpu_csv_path", dest="gpu_csv_path", default="", type=str,
                        help=" Accuracy_checking_details.csv generated on the GPU by using the "
                             "api_accuracy_checker tool.",
                        required=False)
    parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str,
                        help=" The api precision compare task result out path.",
                        required=False)


if __name__ == '__main__':
    _api_precision_compare()
    print_info_log("Compare task completed.")
+# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +AbsoluteThreshStandard: + - mul + - mul_ + - __mul__ + - __imul__ + - __rmul__ + - add + - add_ + - __add__ + - __iadd__ + - __radd__ + - div + - div_ + - __div__ + - __idiv__ + - divide + - divide_ + - leaky_relu + - leaky_relu_ + - prelu + - reciprocal + - reciprocal_ + - rsqrt + - rsqrt_ + - square + - square_ + - sub + - sub_ + - rsub + - __isub__ + - __sub__ + +BinaryCompareStandard: + - abs + - abs_ + - absolute + - absolute_ + - argmin + - bitwise_and + - bitwise_and_ + - broadcast_to + - ceil + - ceil_ + - equal + - fill_ + - flatten + - floor + - floor_ + - gather + - greater + - greater_ + - greater_equal + - greater_equal_ + - isfinite + - isnan + - less + - less_ + - less_equal + - less_equal_ + - logical_and + - logical_and_ + - logical_not + - logical_not_ + - logical_or + - logical_or_ + - masked_fill + - masked_fill_ + - max_pool3d + - maximum + - minimum + - neg + - neg_ + - nonzero + - not_equal + - not_equal_ + - one_hot + - pad + - relu + - reshape + - round + - round_ + - select + - sign + - sign_ + - sort + - tile + - topk + - transpose + - transpose_ + - tril + - tril_ + - triu + - triu_ diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/api_precision_threshold.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0684bd8e9129653b6b69afcf43ab19207006801f --- 
# Per-API accuracy thresholds for the absolute-threshold comparison standard.
# Structure: API name -> torch dtype -> {rtol, small_value, small_value_atol}.
# Every API listed here currently shares the same threshold set, so it is
# declared once via a YAML anchor (&standard_thresholds) and reused through
# aliases. NOTE: aliased entries share one parsed mapping object; consumers
# only read these values (see _get_absolute_threshold_attribute), so this is
# safe -- do not mutate a single API's thresholds in place.
mul: &standard_thresholds
  torch.float32:
    rtol: 0.000001
    small_value: 0.000001
    small_value_atol: 0.000001
  torch.float16:
    rtol: 0.001
    small_value: 0.001
    small_value_atol: 0.001
  torch.bfloat16:
    rtol: 0.004
    small_value: 0.001
    small_value_atol: 0.001
mul_: *standard_thresholds
__mul__: *standard_thresholds
__imul__: *standard_thresholds
__rmul__: *standard_thresholds
add: *standard_thresholds
add_: *standard_thresholds
__add__: *standard_thresholds
__iadd__: *standard_thresholds
__radd__: *standard_thresholds
div: *standard_thresholds
div_: *standard_thresholds
__div__: *standard_thresholds
__idiv__: *standard_thresholds
divide: *standard_thresholds
divide_: *standard_thresholds
leaky_relu: *standard_thresholds
leaky_relu_: *standard_thresholds
prelu: *standard_thresholds
reciprocal: *standard_thresholds
reciprocal_: *standard_thresholds
rsqrt: *standard_thresholds
rsqrt_: *standard_thresholds
square: *standard_thresholds
square_: *standard_thresholds
sub: *standard_thresholds
sub_: *standard_thresholds
rsub: *standard_thresholds
__isub__: *standard_thresholds
__sub__: *standard_thresholds
# Run the comparison between bench (CPU) and device (NPU) outputs and present the results.
import os
import csv
import torch
import numpy as np
from rich.table import Table
from rich.console import Console
from ..common.utils import get_json_contents, write_csv, print_warn_log, Const
from ..compare.compare_utils import CompareConst, check_dtype_comparable, DETAIL_TEST_ROWS, \
    precision_configs, BENCHMARK_COMPARE_SUPPORT_LIST, AbsoluteStandardApi, BinaryStandardApi, apis_threshold
from ..compare.compare_column import CompareColumn
from ..compare.algorithm import get_rmse, get_error_balance, get_max_rel_err, get_mean_rel_err, \
    get_rel_err, get_abs_err, get_max_abs_err, get_rel_err_ratio, cosine_sim, get_rel_err_origin, \
    get_small_value_err_ratio, get_finite_and_infinite_mask, get_small_value_mask, check_inf_nan_value, \
    check_small_value, check_norm_value, get_abs_bench_with_eps
from ..common.config import msCheckerConfig
from ...common.file_check import FileOpen


class Comparator:
    """Compare bench (CPU) and device (NPU) outputs of each API and record the verdicts.

    Results go to two csv files: a per-API summary (forward/backward pass status)
    and a per-output detail file with the individual comparison metrics.
    """

    # consts for result csv
    COLUMN_API_NAME = "API name"
    COLUMN_FORWARD_SUCCESS = "Forward Test Success"
    COLUMN_BACKWARD_SUCCESS = "Backward Test Success"
    COLUMN_STACK_INFO = "Traceback callstack info"

    def __init__(self, result_csv_path, details_csv_path, is_continue_run_ut, stack_info_json_path=None):
        """
        Args:
            result_csv_path: path of the summary result csv.
            details_csv_path: path of the per-output detail csv.
            is_continue_run_ut: True when resuming a previous run (csv titles are then kept).
            stack_info_json_path: optional json mapping API name -> call-stack lines to
                append to each summary row.
        """
        self.save_path = result_csv_path
        self.detail_save_path = details_csv_path
        # Only write the csv headers on a fresh run where neither file exists yet.
        if not is_continue_run_ut and not os.path.exists(self.save_path) and not os.path.exists(self.detail_save_path):
            self.write_csv_title()
        if stack_info_json_path:
            self.stack_info = get_json_contents(stack_info_json_path)
        else:
            self.stack_info = None

        self.test_result_cnt = {
            "forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0, "success_num": 0,
            "total_num": 0, "forward_or_backward_fail_num": 0
        }

    def print_pretest_result(self):
        """Print overall and per-direction statistics tables for the finished run."""
        self.get_statistics_from_result_csv()
        total_tests = self.test_result_cnt.get("total_num", 0)
        if total_tests != 0:
            passing_rate = "{:.2%}".format(self.test_result_cnt.get("success_num", 0) / total_tests)
        else:
            passing_rate = "0%"

        # Fixed typo ("follwing") and the missing separating space in the original message.
        print_warn_log("The following tables will be deprecated in the future. "
                       "The following results are for reference only.")
        console = Console()
        table_total = Table(
            show_header=True, title="Overall Statistics", show_lines=True, width=75
        )
        table_total.add_column("Result")
        table_total.add_column("Statistics")
        table_total.add_row("[green]Pass[/green]", str(self.test_result_cnt.get("success_num", 0)))
        table_total.add_row("[yellow]Warning[/yellow]", str(self.test_result_cnt.get("warning_num", 0)))
        table_total.add_row("[red]Error[/red]", str(self.test_result_cnt.get("error_num", 0)))
        table_total.add_row("Passing Rate", passing_rate)
        table_total.add_row("Skip Tests", str(self.test_result_cnt.get("total_skip_num", 0)))

        table_detail = Table(
            show_header=True, title="Detail Statistics", show_lines=True, width=75
        )
        table_detail.add_column("Result")
        table_detail.add_column("Statistics")
        table_detail.add_row("Forward Error", str(self.test_result_cnt.get("forward_fail_num", 0)))
        table_detail.add_row("Backward Error", str(self.test_result_cnt.get("backward_fail_num", 0)))
        table_detail.add_row("Both Forward & Backward Error",
                             str(self.test_result_cnt.get("forward_and_backward_fail_num", 0)))

        console.print(table_total)
        console.print(table_detail)

    def get_statistics_from_result_csv(self):
        """Re-read the summary csv and rebuild self.test_result_cnt from its status columns.

        Raises:
            ValueError: when a row is malformed or a status column holds an unknown value.
        """
        # Allowed values for the forward/backward status columns (both cases of skip).
        checklist = [CompareConst.PASS, CompareConst.ERROR, CompareConst.WARNING, CompareConst.SPACE,
                     CompareConst.SKIP, "skip"]
        self.test_result_cnt = {
            "success_num": 0, "warning_num": 0, "error_num": 0,
            "forward_fail_num": 0, "backward_fail_num": 0, "forward_and_backward_fail_num": 0,
            "total_num": 0, "total_skip_num": 0
        }
        with FileOpen(self.save_path, 'r') as file:
            reader = csv.reader(file)
            result_csv_rows = [row for row in reader]
        result_csv_name = os.path.basename(self.save_path)
        for item in result_csv_rows[1:]:  # skip the title row
            if not isinstance(item, list) or len(item) < 3:
                raise ValueError("The number of columns in %s is incorrect" % result_csv_name)
            if not all(item[i] and item[i] in checklist for i in (1, 2)):
                raise ValueError(
                    "The value in the 2nd or 3rd column of %s is wrong, it must be pass, error, warning, skip, or SPACE"
                    % result_csv_name)
            column1 = item[1]
            column2 = item[2]
            # Skipped APIs are counted separately and excluded from the pass rate.
            if column1.upper() == CompareConst.SKIP:
                self.test_result_cnt["total_skip_num"] += 1
                continue
            self.test_result_cnt["total_num"] += 1
            if column1 == CompareConst.PASS and column2 in [CompareConst.PASS, CompareConst.SPACE]:
                self.test_result_cnt['success_num'] += 1
            elif column1 == CompareConst.ERROR and column2 == CompareConst.ERROR:
                self.test_result_cnt['forward_and_backward_fail_num'] += 1
                self.test_result_cnt['error_num'] += 1
            elif column1 == CompareConst.ERROR:
                self.test_result_cnt['forward_fail_num'] += 1
                self.test_result_cnt['error_num'] += 1
            elif column2 == CompareConst.ERROR:
                self.test_result_cnt['backward_fail_num'] += 1
                self.test_result_cnt['error_num'] += 1
            elif column1 == CompareConst.WARNING or column2 == CompareConst.WARNING:
                self.test_result_cnt['warning_num'] += 1

    def write_csv_title(self):
        """Write the title rows of the summary and detail csv files if they do not exist yet."""
        summary_test_rows = [[self.COLUMN_API_NAME, self.COLUMN_FORWARD_SUCCESS,
                              self.COLUMN_BACKWARD_SUCCESS, "Message"]]
        if not os.path.exists(self.save_path):
            write_csv(summary_test_rows, self.save_path)
        if not os.path.exists(self.detail_save_path):
            write_csv(DETAIL_TEST_ROWS, self.detail_save_path)

    def write_summary_csv(self, test_result):
        """Append one API's summary row (name, fwd status, bwd status[, message][, stack]) to the result csv.

        Args:
            test_result: tuple of (api_name, fwd_status, bwd_status, fwd_details, bwd_details).
        """
        name = test_result[0]
        df_row = list(test_result[:3])
        # A skipped test carries its skip reason in the 4th element.
        if test_result[1] == "SKIP" or test_result[2] == "SKIP":
            df_row.append(test_result[3])
        if self.stack_info:
            # BUGFIX: the original appended COLUMN_STACK_INFO to test_rows[0] while
            # test_rows was still empty, raising IndexError whenever stack info was
            # enabled. The column title belongs in write_csv_title; only the stack
            # data is appended to the row here.
            stack_info = "\n".join(self.stack_info[name])
            df_row.append(stack_info)
        write_csv([df_row], self.save_path)

    def write_detail_csv(self, test_result):
        """Append one detail row per forward/backward output to the detail csv."""
        test_rows = []

        subject_prefix = test_result[0]
        fwd_result = test_result[3]
        bwd_result = test_result[4]
        if isinstance(fwd_result, list):
            for i, test_subject in enumerate(fwd_result):
                subject = subject_prefix + ".forward.output." + str(i)
                # Format float metrics with the configured precision; leave other cells as-is.
                test_subject = ["{:.{}f}".format(item, msCheckerConfig.precision)
                                if isinstance(item, float) else item for item in test_subject]
                test_rows.append([subject] + list(test_subject))
        if isinstance(bwd_result, list):
            for i, test_subject in enumerate(bwd_result):
                subject = subject_prefix + ".backward.output." + str(i)
                test_subject = ["{:.{}f}".format(item, msCheckerConfig.precision)
                                if isinstance(item, float) else item for item in test_subject]
                test_rows.append([subject] + list(test_subject))

        write_csv(test_rows, self.detail_save_path)

    def record_results(self, *args):
        """Persist one API's results to both the summary and the detail csv."""
        self.write_summary_csv(args)
        self.write_detail_csv(args)

    def compare_output(self, full_api_name, bench_output, device_output, bench_grad=None, npu_grad=None):
        """Compare forward outputs (and optionally gradients) of one API and record the verdicts.

        Args:
            full_api_name: dotted name "<type>.<api>.<index>" (must have exactly three parts).
        Returns:
            (forward_passed, backward_passed) booleans; backward counts as passed when
            it was not tested (SPACE).
        """
        _, api_name, _ = full_api_name.split(Const.SEP)
        # dropout outputs are random; they get a dedicated statistical comparison.
        compare_func = self._compare_dropout if "dropout" in full_api_name else self._compare_core_wrapper
        fwd_success_status, fwd_compare_alg_results = compare_func(api_name, bench_output, device_output)
        if not (bench_grad and npu_grad):
            bwd_success_status, bwd_compare_alg_results = (CompareConst.SPACE, [])
        else:
            if "dropout" in full_api_name:
                bwd_success_status, bwd_compare_alg_results = compare_func(api_name, bench_grad[0], npu_grad[0])
            else:
                bwd_success_status, bwd_compare_alg_results = compare_func(api_name, bench_grad, npu_grad)
        self.record_results(full_api_name, fwd_success_status,
                            bwd_success_status if bwd_compare_alg_results is not None else CompareConst.SPACE,
                            fwd_compare_alg_results, bwd_compare_alg_results)
        return fwd_success_status == CompareConst.PASS, \
            bwd_success_status in (CompareConst.PASS, CompareConst.SPACE)

    def _compare_core_wrapper(self, api_name, bench_output, device_output):
        """Compare possibly-multiple outputs; aggregate the worst status over all of them."""
        detailed_result_total = []
        test_final_success = CompareConst.PASS
        if isinstance(bench_output, (list, tuple)):
            status, compare_result, message = [], [], []
            if len(bench_output) != len(device_output):
                status = [CompareConst.ERROR]
                message = ["bench and npu output structure is different."]
            else:
                for b_out_i, n_out_i in zip(bench_output, device_output):
                    status_i, compare_result_i, message_i = self._compare_core(api_name, b_out_i, n_out_i)
                    status.append(status_i)
                    compare_result.append(compare_result_i)
                    message.append(message_i)
        else:
            status, compare_result, message = self._compare_core(api_name, bench_output, device_output)
        if not isinstance(status, list):
            detailed_result_total.append(compare_result.to_column_value(status, message))
            if status == CompareConst.ERROR:
                test_final_success = CompareConst.ERROR
            elif status == CompareConst.WARNING:
                test_final_success = CompareConst.WARNING
        else:
            for item, item_status in enumerate(status):
                detailed_result_total.append(compare_result[item].to_column_value(item_status, message[item]))
                if item_status == CompareConst.ERROR:
                    test_final_success = CompareConst.ERROR
                elif item_status == CompareConst.WARNING:
                    test_final_success = CompareConst.WARNING
        return test_final_success, detailed_result_total

    def _compare_core(self, api_name, bench_output, device_output):
        """Dispatch comparison by output type (dict / tensor / builtin scalar / None).

        Returns:
            (status, CompareColumn-or-list, message)
        """
        compare_column = CompareColumn()
        if not isinstance(bench_output, type(device_output)):
            return CompareConst.ERROR, compare_column, "bench and npu output type is different."
        elif isinstance(bench_output, dict):
            b_keys, n_keys = set(bench_output.keys()), set(device_output.keys())
            if b_keys != n_keys:
                return CompareConst.ERROR, compare_column, "bench and npu output dict keys are different."
            else:
                status, compare_result, message = self._compare_core(api_name, list(bench_output.values()),
                                                                     list(device_output.values()))
        elif isinstance(bench_output, torch.Tensor):
            copy_bench_out = bench_output.detach().clone()
            copy_device_output = device_output.detach().clone()
            compare_column.bench_type = str(copy_bench_out.dtype)
            compare_column.npu_type = str(copy_device_output.dtype)
            compare_column.shape = tuple(device_output.shape)
            status, compare_result, message = self._compare_torch_tensor(api_name, copy_bench_out, copy_device_output,
                                                                         compare_column)
        elif isinstance(bench_output, (bool, int, float, str)):
            compare_column.bench_type = str(type(bench_output))
            compare_column.npu_type = str(type(device_output))
            status, compare_result, message = self._compare_builtin_type(bench_output, device_output, compare_column)
        elif bench_output is None:
            return CompareConst.SKIP, compare_column, "Bench output is None, skip this test."
        else:
            # BUGFIX: the message string was a dangling statement after the return in the
            # original, so it was never part of the returned tuple.
            return CompareConst.PASS, compare_column, \
                "Unexpected output type in compare_core: {}".format(type(bench_output))

        return status, compare_result, message

    def _compare_torch_tensor(self, api_name, bench_output, device_output, compare_column):
        """Compare two tensors; route integral/bool dtypes to the error-rate check and floats to metrics."""
        cpu_shape = bench_output.shape
        npu_shape = device_output.shape
        npu_dtype = device_output.dtype
        # bfloat16 is not representable in numpy; widen both sides to float32 first.
        if npu_dtype == torch.bfloat16:
            bench_output = bench_output.to(torch.float32)
            device_output = device_output.to(torch.float32)
        bench_output = bench_output.numpy()
        device_output = device_output.cpu().numpy()
        if cpu_shape != npu_shape:
            return CompareConst.ERROR, compare_column, f"The shape of bench{str(cpu_shape)} " \
                                                       f"and npu{str(npu_shape)} not equal."
        if not check_dtype_comparable(bench_output, device_output):
            return CompareConst.ERROR, compare_column, f"Bench out dtype is {bench_output.dtype} but " \
                                                       f"npu output dtype is {device_output.dtype}, cannot compare."
        message = ""
        if bench_output.dtype in [bool, np.uint8, np.int8, np.int16, np.uint16, np.uint32, np.int32,
                                  np.int64, np.uint64]:
            message += f"Compare algorithm is not supported for {bench_output.dtype} data. " \
                       f"Only judged by Error Rate."
            err_rate, status, msg = self._compare_bool_tensor(bench_output, device_output)
            message += msg + "\n"
            compare_column.error_rate = err_rate
            return status, compare_column, message
        else:
            status, compare_column, message = self._compare_float_tensor(api_name, bench_output, device_output,
                                                                         compare_column, npu_dtype)
            return status, compare_column, message

    def _compare_float_tensor(self, api_name, bench_output, device_output, compare_column, dtype):
        """Compare float tensors under the standard selected for the API.

        Benchmark-supported dtypes use the binary / absolute-threshold / benchmark
        standards; the final verdict falls back to cosine similarity plus tiered
        relative-error checks.
        """
        message = ""
        abs_bench, abs_bench_with_eps = get_abs_bench_with_eps(bench_output, dtype)
        abs_err = get_abs_err(bench_output, device_output)
        if str(dtype) in BENCHMARK_COMPARE_SUPPORT_LIST:
            both_finite_mask, inf_nan_mask = get_finite_and_infinite_mask(bench_output, device_output)
            if api_name in BinaryStandardApi:
                # Binary standard: outputs must match bit-for-bit; only the error rate matters.
                err_rate, _, _ = self._compare_bool_tensor(bench_output, device_output)
                compare_column.error_rate = err_rate
            elif api_name in AbsoluteStandardApi:
                # Absolute-threshold standard: per-API rtol/atol from api_precision_threshold.yaml.
                small_value_threshold, small_value_atol, rtol = self._get_absolute_threshold_attribute(
                    api_name, str(dtype))
                rel_err = abs_err / abs_bench_with_eps
                small_value_mask = get_small_value_mask(abs_bench, both_finite_mask, small_value_threshold)
                normal_value_mask = np.logical_and(both_finite_mask, np.logical_not(small_value_mask))
                compare_column.inf_nan_error_ratio = check_inf_nan_value(inf_nan_mask, bench_output, device_output,
                                                                         dtype, rtol)
                compare_column.rel_err_ratio = check_norm_value(normal_value_mask, rel_err, rtol)
                compare_column.abs_err_ratio = check_small_value(abs_err, small_value_mask, small_value_atol)
            else:
                # Benchmark standard: generic statistical metrics with per-dtype small-value bounds.
                dtype_config = precision_configs.get(dtype)
                small_value_mask = get_small_value_mask(abs_bench, both_finite_mask, dtype_config['small_value'][0])
                abs_err_greater_mask = np.greater(abs_err, dtype_config['small_value_atol'][0])
                compare_column.small_value_err_ratio = get_small_value_err_ratio(small_value_mask,
                                                                                abs_err_greater_mask)
                rel_err = get_rel_err(abs_err, abs_bench_with_eps, small_value_mask, inf_nan_mask)
                compare_column.RMSE = get_rmse(abs_err, np.logical_or(inf_nan_mask, small_value_mask))
                compare_column.EB = get_error_balance(bench_output, device_output)
                compare_column.Max_rel_error = get_max_rel_err(rel_err)
                compare_column.Mean_rel_error = get_mean_rel_err(rel_err)

        cos_res, cos_status, msg = cosine_sim(bench_output, device_output)
        compare_column.cosine_sim = cos_res
        message += msg + "\n"
        if not cos_status:
            message += "Cosine similarity is less than 0.99, consider as error, skip other check and set to SPACE.\n"
            return CompareConst.ERROR, compare_column, message

        max_abs_res, max_abs_status = get_max_abs_err(abs_err)
        compare_column.max_abs_err = max_abs_res
        if max_abs_status:
            message += "Max abs error is less than 0.001, consider as pass, skip other check and set to SPACE.\n"
            return CompareConst.PASS, compare_column, message

        # Tiered relative-error checks: half precision uses 1e-2/1e-3, full precision 1e-3/1e-4.
        rel_err_orign = get_rel_err_origin(abs_err, abs_bench_with_eps)
        if dtype in [torch.float16, torch.bfloat16]:
            hundred_res, hundred_status = get_rel_err_ratio(rel_err_orign, 0.01)
            compare_column.rel_err_hundredth = hundred_res
            if not hundred_status:
                message += "Relative error is greater than 0.01, consider as error, skip other check and set to SPACE.\n"
                return CompareConst.ERROR, compare_column, message
        thousand_res, thousand_status = get_rel_err_ratio(rel_err_orign, 0.001)
        compare_column.rel_err_thousandth = thousand_res
        if dtype in [torch.float16, torch.bfloat16]:
            if thousand_status:
                message += "Relative error is less than 0.001, consider as pass, skip other check and set to SPACE.\n"
                return CompareConst.PASS, compare_column, message
            message += "Relative error is greater than 0.001, consider as warning, skip other check and set to SPACE.\n"
            return CompareConst.WARNING, compare_column, message
        ten_thousand_res, ten_thousand_status = get_rel_err_ratio(rel_err_orign, 0.0001)
        compare_column.rel_err_ten_thousandth = ten_thousand_res
        if dtype in [torch.float32, torch.float64]:
            if not thousand_status:
                message += "Relative error is greater than 0.001, consider as error, skip other check and set to SPACE.\n"
                return CompareConst.ERROR, compare_column, message
            if not ten_thousand_status:
                message += "Relative error is greater than 0.0001, consider as warning, skip other check and set to SPACE.\n"
                return CompareConst.WARNING, compare_column, message
            message += "Relative error is less than 0.0001, consider as pass.\n"
        return CompareConst.PASS, compare_column, message

    @staticmethod
    def _compare_dropout(api_name, bench_output, device_output):
        """Statistically compare dropout outputs: the fraction of zeros must agree within 10%.

        api_name is unused but kept so the signature matches the generic compare dispatch.
        Small tensors (<100 elements) are always passed: too few samples to judge.
        """
        tensor_num = bench_output.numel()
        if tensor_num >= 100:
            if abs((bench_output == 0).sum() - (device_output == 0).cpu().sum()) / tensor_num < 0.1:
                return CompareConst.PASS, 1
            else:
                return CompareConst.ERROR, 0
        else:
            return CompareConst.PASS, 1

    @staticmethod
    def _compare_builtin_type(bench_output, device_output, compare_column):
        """Exact-equality comparison for python builtin scalars (bool/int/float/str)."""
        if not isinstance(bench_output, (bool, int, float, str)):
            return CompareConst.PASS, compare_column, ""
        if bench_output != device_output:
            return CompareConst.ERROR, compare_column, ""
        compare_column.error_rate = 0
        return CompareConst.PASS, compare_column, ""

    @staticmethod
    def _compare_bool_tensor(bench_output, device_output):
        """Return (error_rate, status, message) for element-wise exact comparison."""
        error_nums = (bench_output != device_output).sum()
        if bench_output.size == 0:
            return CompareConst.NAN, CompareConst.ERROR, "There is not bench calculation result."
        error_rate = float(error_nums / bench_output.size)
        result = CompareConst.PASS if error_rate == 0 else CompareConst.ERROR
        return error_rate, result, ""

    @staticmethod
    def _get_absolute_threshold_attribute(api_name, dtype):
        """Look up (small_value, small_value_atol, rtol) for an absolute-standard API and dtype."""
        small_value_threshold = apis_threshold.get(api_name).get(dtype).get('small_value')
        small_value_atol = apis_threshold.get(api_name).get(dtype).get('small_value_atol')
        rtol = apis_threshold.get(api_name).get(dtype).get('rtol')
        return small_value_threshold, small_value_atol, rtol
from .compare_utils import CompareConst


class CompareColumn:
    """Holds one detail-csv row of per-output comparison metrics.

    Every metric starts as CompareConst.SPACE (meaning "not computed") and is
    filled in selectively by the comparison algorithms.
    """

    # Metric column order; must stay in sync with the detail csv title row.
    _METRIC_FIELDS = (
        "bench_type", "npu_type", "shape", "cosine_sim", "max_abs_err",
        "rel_err_hundredth", "rel_err_thousandth", "rel_err_ten_thousandth",
        "error_rate", "EB", "RMSE", "small_value_err_ratio", "Max_rel_error",
        "Mean_rel_error", "inf_nan_error_ratio", "rel_err_ratio", "abs_err_ratio",
    )

    def __init__(self):
        for field in self._METRIC_FIELDS:
            setattr(self, field, CompareConst.SPACE)

    def to_column_value(self, is_pass, message):
        """Flatten the metrics into one csv row, appending status and message."""
        return [getattr(self, field) for field in self._METRIC_FIELDS] + [is_pass, message]


class ApiPrecisionOutputColumn:
    """Holds one row of the api_precision_compare detail csv (ratios, statuses, verdict)."""

    # Column order of the api precision compare detail csv.
    _FIELDS = (
        "api_name", "small_value_err_ratio", "small_value_err_status",
        "rmse_ratio", "rmse_status", "max_rel_err_ratio", "max_rel_err_status",
        "mean_rel_err_ratio", "mean_rel_err_status", "eb_ratio", "eb_status",
        "inf_nan_error_ratio", "inf_nan_error_ratio_status", "rel_err_ratio",
        "rel_err_ratio_status", "abs_err_ratio", "abs_err_ratio_status",
        "error_rate", "error_rate_status", "compare_result", "compare_algorithm",
        "compare_message",
    )

    def __init__(self):
        for field in self._FIELDS:
            setattr(self, field, CompareConst.SPACE)

    def to_column_value(self):
        """Flatten all fields into one csv row, in declaration order."""
        return [getattr(self, field) for field in self._FIELDS]
0000000000000000000000000000000000000000..5511da724446187e2dd886448bf6b26ea7b7b369 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/compare/compare_utils.py @@ -0,0 +1,190 @@ +import time +import os +import numpy as np +import torch +import yaml +from ..common.utils import Const, print_warn_log, CompareException +from ...common.file_check import FileOpen + + +current_time = time.strftime("%Y%m%d%H%M%S") +API_PRECISION_COMPARE_RESULT_FILE_NAME = "api_precision_compare_result_" + current_time + ".csv" +API_PRECISION_COMPARE_DETAILS_FILE_NAME = "api_precision_compare_details_" + current_time + ".csv" +BENCHMARK_COMPARE_SUPPORT_LIST = ['torch.float16', 'torch.bfloat16', 'torch.float32'] +API_PRECISION_COMPARE_UNSUPPORT_LIST = ['torch.float64', 'torch.complex64', 'torch.complex128'] +BINARY_COMPARE_UNSUPPORT_LIST = BENCHMARK_COMPARE_SUPPORT_LIST + API_PRECISION_COMPARE_UNSUPPORT_LIST + + +cur_path = os.path.dirname(os.path.realpath(__file__)) +standard_yaml_path = os.path.join(cur_path, "api_precision_standard.yaml") +with FileOpen(standard_yaml_path, 'r') as f: + Apis = yaml.safe_load(f) + AbsoluteStandardApi = Apis.get('AbsoluteThreshStandard') + BinaryStandardApi = Apis.get('BinaryCompareStandard') + + +threshold_yaml_path = os.path.join(cur_path, "api_precision_threshold.yaml") +with FileOpen(threshold_yaml_path, 'r') as f: + apis_threshold = yaml.safe_load(f) + + +DETAIL_TEST_ROWS = [[ + "API Name", "Bench Dtype", "DEVICE Dtype", "Shape", + "余弦相似度", + "最大绝对误差", + "双百指标", + "双千指标", + "双万指标", + "二进制一致错误率", + "误差均衡性", + "均方根误差", + "小值域错误占比", + "相对误差最大值", + "相对误差平均值", + "inf/nan错误率", + "相对误差错误率", + "绝对误差错误率", + "Status", + "Message" + ]] + + +precision_configs = { + torch.float16 : { + 'small_value' : [ + 1e-3 + ], + 'small_value_atol' : [ + 1e-5 + ] + }, + torch.bfloat16: { + 'small_value' : [ + 1e-3 + ], + 'small_value_atol' : [ + 1e-5 + ] + }, + torch.float32:{ + 'small_value' : [ + 1e-6 + ], + 'small_value_atol' : [ + 1e-9 + ] + } +} + + 
class CompareConst:
    """Shared constants for comparison verdicts and dtype limits."""
    # Placeholder values for missing / non-applicable results.
    NAN = np.nan
    NA = "N/A"
    # Per-output verdicts.
    PASS = 'pass'
    WARNING = 'warning'
    ERROR = 'error'
    SKIP = 'SKIP'
    TRUE = 'TRUE'
    FALSE = 'FALSE'
    # bfloat16 representable range and machine epsilon.
    BFLOAT16_MIN = -3.3895313892515355e+38
    BFLOAT16_MAX = 3.3895313892515355e+38
    BFLOAT16_EPS = 2 ** -8
    # Single-space cell meaning "not computed".
    SPACE = " "


class ApiPrecisionCompareColumn:
    """Column names (Chinese headers) of the api_precision_compare csv files."""
    API_NAME = 'API Name'
    DEVICE_DTYPE = 'DEVICE Dtype'
    SMALL_VALUE_ERROR_RATE = '小值域错误占比'
    RMSE = '均方根误差'
    MAX_REL_ERR = '相对误差最大值'
    MEAN_REL_ERR = '相对误差平均值'
    EB = '误差均衡性'
    SMALL_VALUE_ERROR_RATIO = '小值域错误比值'
    SMALL_VALUE_ERROR_STATUS = '小值域判定结果'
    RMSE_RATIO = '均方根误差比值'
    RMSE_STATUS = '均方根误差判定结果'
    MAX_REL_ERR_RATIO = '相对误差最大值比值'
    MAX_REL_ERR_STATUS = '相对误差最大值判定结果'
    MEAN_REL_ERR_RATIO = '相对误差平均值比值'
    MEAN_REL_ERR_STATUS = '相对误差平均值判定结果'
    EB_RATIO = '误差均衡性比值'
    EB_STATUS = '误差均衡性判定结果'
    ERROR_RATE = '二进制一致错误率'
    ERROR_RATE_STATUS = '二进制一致错误率判定结果'
    INF_NAN_ERROR_RATIO = 'inf/nan错误率'
    INF_NAN_ERROR_RATIO_STATUS = 'inf/nan判定结果'
    REL_ERR_RATIO = '相对误差错误率'
    REL_ERR_RATIO_STATUS = '相对误差判定结果'
    ABS_ERR_RATIO = '绝对误差错误率'
    ABS_ERR_RATIO_STATUS = '绝对误差判定结果'
    FINAL_RESULT = '比对结果'
    ALGORITHM = '比对算法'
    FORWWARD_STATUS = 'Forward Test Success'
    BACKWARD_STATUS = 'Backward Test Success'
    MESSAGE = 'Message'

    @staticmethod
    def to_required_columns():
        """Columns that must be present in an input run_ut detail csv."""
        cls = ApiPrecisionCompareColumn
        return [cls.API_NAME, cls.DEVICE_DTYPE,
                cls.SMALL_VALUE_ERROR_RATE, cls.RMSE,
                cls.MAX_REL_ERR, cls.MEAN_REL_ERR, cls.EB,
                cls.ERROR_RATE, cls.INF_NAN_ERROR_RATIO,
                cls.REL_ERR_RATIO, cls.ABS_ERR_RATIO]

    @staticmethod
    def get_detail_csv_title():
        """Title row of the api precision compare detail csv (matches ApiPrecisionOutputColumn order)."""
        cls = ApiPrecisionCompareColumn
        return [cls.API_NAME,
                cls.SMALL_VALUE_ERROR_RATIO, cls.SMALL_VALUE_ERROR_STATUS,
                cls.RMSE_RATIO, cls.RMSE_STATUS,
                cls.MAX_REL_ERR_RATIO, cls.MAX_REL_ERR_STATUS,
                cls.MEAN_REL_ERR_RATIO, cls.MEAN_REL_ERR_STATUS,
                cls.EB_RATIO, cls.EB_STATUS,
                cls.INF_NAN_ERROR_RATIO, cls.INF_NAN_ERROR_RATIO_STATUS,
                cls.REL_ERR_RATIO, cls.REL_ERR_RATIO_STATUS,
                cls.ABS_ERR_RATIO, cls.ABS_ERR_RATIO_STATUS,
                cls.ERROR_RATE, cls.ERROR_RATE_STATUS,
                cls.FINAL_RESULT, cls.ALGORITHM, cls.MESSAGE]

    @staticmethod
    def get_result_csv_title():
        """Title row of the api precision compare summary csv."""
        cls = ApiPrecisionCompareColumn
        return [cls.API_NAME, cls.FORWWARD_STATUS, cls.BACKWARD_STATUS, cls.MESSAGE]


# Extra per-API hints appended to the comparison message.
CompareMessage = {
    "topk" : "在npu上,topk的入参sorted=False时不生效,会返回有序tensor,而cpu上会返回无序tensor。 如果topk精度不达标,请检查是否是该原因导致的。"
}


def check_dtype_comparable(x, y):
    """Return True when x.dtype and y.dtype belong to the same comparable family.

    Families are Const.FLOAT_TYPE, Const.BOOL_TYPE and Const.INT_TYPE; anything
    else is logged and treated as not comparable.
    """
    for family in (Const.FLOAT_TYPE, Const.BOOL_TYPE, Const.INT_TYPE):
        if x.dtype in family:
            return y.dtype in family
    print_warn_log(f"Compare: Unexpected dtype {x.dtype}, {y.dtype}")
    return False


def convert_str_to_float(input_data):
    """Convert input_data to a finite float.

    Raises:
        CompareException: when the input is an empty/blank string, is not
            parseable as a float, or parses to inf/-inf/nan.
    """
    if isinstance(input_data, str) and not input_data.strip():
        raise CompareException(CompareException.INVALID_DATA_ERROR,
                               'ERROR: Input data is an empty string')
    try:
        float_data = float(input_data)
    except ValueError as e:
        raise CompareException(CompareException.INVALID_DATA_ERROR,
                               'ERROR: Input data cannot be converted to float') from e
    if str(float_data) in ('inf', '-inf', 'nan'):
        raise CompareException(CompareException.INVALID_DATA_ERROR,
                               'ERROR: Input data is either "inf", "-inf", "nan"')
    return float_data
ValueError as e: + msg = 'ERROR: Input data cannot be converted to float' + raise CompareException(CompareException.INVALID_DATA_ERROR, msg) from e + \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2582c4539c9408102d3496242651cedeeefeb22 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml @@ -0,0 +1,9 @@ +dump_path: './' +real_data: False +enable_dataloader: False +target_iter: [1] +white_list: [] +error_data_path: './' +jit_compile: True +precision: 14 + \ No newline at end of file diff --git "a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/doc/API Accuracy Checker\351\242\204\346\243\200\345\267\245\345\205\267\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" "b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/doc/API Accuracy Checker\351\242\204\346\243\200\345\267\245\345\205\267\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" new file mode 100644 index 0000000000000000000000000000000000000000..740f72589a034476586c342d9709b05ea44a93d3 --- /dev/null +++ "b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/doc/API Accuracy Checker\351\242\204\346\243\200\345\267\245\345\205\267\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" @@ -0,0 +1,64 @@ +# API Accuracy Checker预检工具标准性能基线报告 + +## 环境信息 + +NPU:Atlas A2 训练系列产品 + +CPU: + +![输入图片说明](https://foruda.gitee.com/images/1707274376423595920/8d725bef_10012209.png) + +Torch:2.1.0 + +CANN:8.0.T2 + +除上述环境信息影响性能外,API的数量、种类以及Shape都会对性能产生影响,因此本次选取指定网络进行测试。 + +## 多进程使用说明 + +1. 因预检工具run ut会在NPU和CPU上分别运行每个API的计算,开启多进程后会将指定总进程数平均分配给指定的NPU处理。经测试多进程数量需控制在每张卡不超过8个进程,8卡总计不超过63个进程。建议大模型场景下使用8卡56个进程。 +2. 
进程数过多可能会造成环境的内存占用过高导致环境崩溃或NPU上out of memory,若发生此类情况请减少总进程数。 +3. 因子进程拉起需要额外耗时,小模型场景下不建议开过多进程,过多进程性能提升可能并不明显。 +4. 若发生上述情况导致运行中断,可以使用断点续训功能减少进程数后重新运行。 + +## 模型信息和性能基线 + +以下场景的性能基线测试数据均为多次测试后取平均值,因此实际运行时性能数据可能会根据环境状态稍有浮动。 + +### YOLOV5 + +API:442个,主要数据类型:FLOAT32 + +单进程run_ut耗时:3m55s + +单卡8进程耗时:2m11s + +当API数量较少时,多进程计算性能提升不明显,因为拉起子进程需要额外耗时,此场景下不建议开过多进程。 + +### GPT-3 + +NUM_LAYER:1,API:170个, 主要数据类型:FLOAT16 + +单进程run_ut耗时:10m22s + +单卡8进程耗时:3m50s + +4卡16进程耗时:1m50s + +### GPT-3 + +NUM_LAYER:8,API:16782个,主要数据类型:FLOAT16 + +单进程run_ut耗时:大于2天(未跑完) + +8卡56个进程耗时:1h33m + +当API数量很多时多进程下性能提升明显,可以将天级的运行时长缩短至小时级。 + +### GLM + +API:6035个,主要数据类型:FLOAT16 + +单进程run_ut耗时:大于2天(未跑完) + +8卡56个进程耗时:2h40m diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/.keep b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c9602292b85f753fd132634b98c74c76460997b0 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/__init__.py @@ -0,0 +1 @@ +__all__ = ['set_dump_switch'] diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/api_info.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/api_info.py new file mode 100644 index 0000000000000000000000000000000000000000..7452cec74e80c812902341ef2af13d3f29c5f10c --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/api_info.py @@ -0,0 +1,237 @@ +# 定义API INFO,保存基本信息,用于后续结构体的落盘,注意考虑random场景及真实数据场景 +import os +import inspect +import torch +import numpy as np +from ..common.config import msCheckerConfig +from ..common.utils import print_error_log, write_pt, create_directory, DumpException, \ + get_real_data_path 
+from ...common.file_check import check_path_before_create + + +def get_tensor_extremum(data, operator): + if data.dtype is torch.bool: + if data.numel() == 0: + return False, False + if operator == 'max': + return True in data, True in data + elif operator == 'min': + return False not in data, False not in data + data_clone = data.float().clone().detach() + if operator == 'max': + max_result = torch._C._VariableFunctionsClass.max(data_clone).item() + if np.isinf(max_result) or np.isnan(max_result): + return handle_tensor_extremum_nan_inf(data_clone, operator), max_result + else: + return max_result, max_result + else: + min_result = torch._C._VariableFunctionsClass.min(data_clone).item() + if np.isinf(min_result) or np.isnan(min_result): + return handle_tensor_extremum_nan_inf(data_clone, operator), min_result + else: + return min_result, min_result + + +def handle_tensor_extremum_nan_inf(data_clone, operator): + data_nan = torch._C._VariableFunctionsClass.isnan(data_clone) + if int(torch._C._VariableFunctionsClass.sum(data_nan)) == data_clone.numel(): + return float('nan') + finite_mask = torch._C._VariableFunctionsClass.isfinite(data_clone) + if int(torch._C._VariableFunctionsClass.sum(finite_mask)) > 0: + finite_values = data_clone[finite_mask] + return torch._C._VariableFunctionsClass.max(finite_values).item() if operator == 'max' else \ + torch._C._VariableFunctionsClass.min(finite_values).item() + else: + data_no_nan = data_clone[~data_nan] + return torch._C._VariableFunctionsClass.max(data_no_nan).item() if operator == 'max' else \ + torch._C._VariableFunctionsClass.min(data_no_nan).item() + + +def get_type_name(name): + left = name.index("'") + right = name.rindex("'") + return name[left + 1: right] + + +def transfer_types(data, dtype): + if 'int' in dtype or 'bool' in dtype: + return int(data) + else: + return float(data) + + +def is_builtin_class(element): + return element is None or isinstance(element, (bool, int, float, str, slice)) + + +def 
analyze_device_in_kwargs(element): + single_arg = {} + single_arg.update({'type': 'torch.device'}) + if not isinstance(element, str): + if hasattr(element, "index"): + device_value = element.type + ":" + str(element.index) + else: + device_value = element.type + single_arg.update({'value': device_value}) + else: + single_arg.update({'value': element}) + return single_arg + + +def analyze_dtype_in_kwargs(element): + single_arg = {} + single_arg.update({'type': 'torch.dtype'}) + single_arg.update({'value': str(element)}) + return single_arg + + +class APIInfo: + def __init__(self, api_name, save_path, is_save_data=False): + self.api_name = api_name + self.torch_object_key = {'device': analyze_device_in_kwargs, 'dtype': analyze_dtype_in_kwargs} + self.rank = os.getpid() + self.is_save_data = is_save_data + self.save_path = save_path + self.args_num = 0 + + @staticmethod + def get_full_save_path(save_path, dir_name, contain_step=False): + if contain_step: + from calibrator.pytorch.api_accuracy_checker.dump.dump import DumpUtil + step_dir = "step" + str(DumpUtil.call_num - 1 if msCheckerConfig.enable_dataloader else DumpUtil.call_num) + rank_dir = f"rank{os.getpid()}" + return os.path.join(save_path, step_dir, dir_name, rank_dir) + else: + return os.path.join(save_path, dir_name) + + def analyze_element(self, element): + if isinstance(element, (list, tuple)): + out = [] + for item in element: + out.append(self.analyze_element(item)) + return out + + if isinstance(element, dict): + out_dict = {} + for key, value in element.items(): + if key in self.torch_object_key.keys(): + fun = self.torch_object_key[key] + out_dict[key] = fun(value) + else: + out_dict[key] = self.analyze_element(value) + return out_dict + + converted_numpy, numpy_type = self._convert_numpy_to_builtin(element) + if converted_numpy is not element: + return self._analyze_numpy(converted_numpy, numpy_type) + + if isinstance(element, torch.Tensor): + return self._analyze_tensor(element) + + if 
is_builtin_class(element): + return self._analyze_builtin(element) + + msg = f"Type {type(element)} is unsupported at analyze_element" + print_error_log(msg) + raise DumpException(DumpException.INVALID_DATA_ERROR) + + def _analyze_tensor(self, arg): + single_arg = {} + if not self.is_save_data: + single_arg.update({'type': 'torch.Tensor'}) + single_arg.update({'dtype': str(arg.dtype)}) + single_arg.update({'shape': arg.shape}) + max_handle, max_origin = get_tensor_extremum(arg, 'max') + single_arg.update({'Max': transfer_types(max_handle, str(arg.dtype))}) + single_arg.update({'Max_origin': transfer_types(max_origin, str(arg.dtype))}) + min_handle, min_origin = get_tensor_extremum(arg, 'min') + single_arg.update({'Min': transfer_types(min_handle, str(arg.dtype))}) + single_arg.update({'Min_origin': transfer_types(min_origin, str(arg.dtype))}) + single_arg.update({'requires_grad': arg.requires_grad}) + else: + api_args = self.api_name + '.' + str(self.args_num) + check_path_before_create(self.save_path) + create_directory(self.save_path) + file_path = os.path.join(self.save_path, f'{api_args}.pt') + pt_path = write_pt(file_path, arg.contiguous().cpu().detach()) + self.args_num += 1 + real_data_path = get_real_data_path(pt_path) + single_arg.update({'type': 'torch.Tensor'}) + single_arg.update({'datapath': real_data_path}) + single_arg.update({'requires_grad': arg.requires_grad}) + return single_arg + + def _analyze_builtin(self, arg): + single_arg = {} + if self.is_save_data: + self.args_num += 1 + if isinstance(arg, slice): + single_arg.update({'type': "slice"}) + single_arg.update({'value': [arg.start, arg.stop, arg.step]}) + else: + single_arg.update({'type': get_type_name(str(type(arg)))}) + single_arg.update({'value': arg}) + return single_arg + + def _analyze_numpy(self, value, numpy_type): + single_arg = {} + if self.is_save_data: + self.args_num += 1 + single_arg.update({'type': numpy_type}) + single_arg.update({'value': value}) + return single_arg + + def 
_convert_numpy_to_builtin(self, arg): + type_mapping = { + np.integer: int, + np.floating: float, + np.bool_: bool, + np.complexfloating: complex, + np.str_: str, + np.bytes_: bytes, + np.unicode_: str + } + for numpy_type, builtin_type in type_mapping.items(): + if isinstance(arg, numpy_type): + return builtin_type(arg), get_type_name(str(type(arg))) + return arg, '' + + +class ForwardAPIInfo(APIInfo): + def __init__(self, name, args, kwargs): + super().__init__(name, + self.get_full_save_path(msCheckerConfig.dump_path, 'forward_real_data', contain_step=True), + is_save_data=msCheckerConfig.real_data) + self.api_info_struct = {} + self.stack_info_struct = {} + self.analyze_api_input(args, kwargs) + self.analyze_api_call_stack() + + def analyze_api_input(self, args, kwargs): + args_info_list = self.analyze_element(args) + kwargs_info_dict = self.analyze_element(kwargs) + self.api_info_struct = {self.api_name: {"args": args_info_list, "kwargs": kwargs_info_dict}} + + def analyze_api_call_stack(self): + stack_str = [] + for (_, path, line, func, code, _) in inspect.stack()[3:]: + if not code: + continue + stack_line = " ".join([ + "File", ", ".join([path, " ".join(["line", str(line)]), " ".join(["in", func]), + " ".join(["\n", code[0].strip()])])]) + stack_str.append(stack_line) + self.stack_info_struct = {self.api_name: stack_str} + + +class BackwardAPIInfo(APIInfo): + def __init__(self, name, grads): + super().__init__(name, + self.get_full_save_path(msCheckerConfig.dump_path, 'backward_real_data', contain_step=True), + is_save_data=msCheckerConfig.real_data) + self.grad_info_struct = {} + self.analyze_api_input(grads) + + def analyze_api_input(self, grads): + grads_info_list = self.analyze_element(grads) + self.grad_info_struct = {self.api_name: grads_info_list} diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump.py new file mode 100644 index 
0000000000000000000000000000000000000000..b20378fd45d322e1e2e4a61031c8c1fa240ca5a0 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from .api_info import ForwardAPIInfo, BackwardAPIInfo +from .info_dump import write_api_info_json, initialize_output_json +from ..common.utils import print_error_log, CompareException, print_info_log +from ..hook_module.register_hook import initialize_hook +from ..common.config import msCheckerConfig + + +def set_dump_switch(switch): + if switch not in ["ON", "OFF"]: + print_error_log("Please set switch with 'ON' or 'OFF'.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + if switch == "ON": + initialize_hook(pretest_hook) + initialize_output_json() + DumpUtil.set_dump_switch(switch) + + +def check_dataloader_status(): + if msCheckerConfig.enable_dataloader: + error_info = ("If you want to use this function, set enable_dataloader " + "in the accuracy_tools/api_accuracy_check/config.yaml " + "to False first") + raise CompareException(CompareException.INVALID_PARAM_ERROR, error_info) + + +def start(): + check_dataloader_status() + if not DumpUtil.get_dump_switch(): + DumpUtil.incr_iter_num_maybe_exit() + + +def stop(): + check_dataloader_status() + DumpUtil.set_dump_switch("OFF") + + 
+def step(): + check_dataloader_status() + DumpUtil.call_num += 1 + + +class DumpUtil(object): + dump_switch = None + call_num = 0 + + @staticmethod + def set_dump_switch(switch): + DumpUtil.dump_switch = switch + + @staticmethod + def get_dump_switch(): + return DumpUtil.dump_switch == "ON" + + @staticmethod + def incr_iter_num_maybe_exit(): + if DumpUtil.call_num in msCheckerConfig.target_iter: + set_dump_switch("ON") + elif DumpUtil.call_num > max(msCheckerConfig.target_iter): + raise Exception("Model pretest: exit after iteration {}".format(DumpUtil.call_num - 1)) + else: + set_dump_switch("OFF") + + +class DumpConst: + delimiter = '*' + forward = 'forward' + backward = 'backward' + + +def pretest_info_dump(name, out_feat, module, phase): + if not DumpUtil.get_dump_switch(): + return + if phase == DumpConst.forward: + api_info = ForwardAPIInfo(name, module.input_args, module.input_kwargs) + elif phase == DumpConst.backward: + api_info = BackwardAPIInfo(name, out_feat) + else: + msg = "Unexpected training phase {}.".format(phase) + print_error_log(msg) + raise NotImplementedError(msg) + print_info_log(f"tools is dumping api: {name}" + " " * 10, end='\r') + write_api_info_json(api_info) + + +def pretest_hook(name, phase): + def pretest_info_dump_hook(module, in_feat, out_feat): + pretest_info_dump(name, out_feat, module, phase) + if hasattr(module, "input_args"): + del module.input_args + if hasattr(module, "input_kwargs"): + del module.input_kwargs + return pretest_info_dump_hook diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump_scope.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump_scope.py new file mode 100644 index 0000000000000000000000000000000000000000..ac78fa8ccae9f5935d919b62ec72ed588b290a9f --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/dump_scope.py @@ -0,0 +1,22 @@ +# dump范围控制 +import torch +from torch.utils.data.dataloader import _BaseDataLoaderIter +from ..dump.dump 
import DumpUtil +from ..common.config import msCheckerConfig + + +def iter_tracer(original_next): + def func_wrapper(*args, **kwargs): + if msCheckerConfig.enable_dataloader: + DumpUtil.dump_switch = "OFF" + result = original_next(*args, **kwargs) + DumpUtil.incr_iter_num_maybe_exit() + DumpUtil.call_num += 1 + return result + else: + return original_next(*args, **kwargs) + return func_wrapper + +original_next_method = _BaseDataLoaderIter.__next__ + +_BaseDataLoaderIter.__next__ = iter_tracer(original_next_method) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/info_dump.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/info_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..31165077165c724f0e10ad0e279f5a59593cfd48 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/dump/info_dump.py @@ -0,0 +1,72 @@ +import fcntl +import json +import os +import threading +import multiprocessing + +from ..dump.api_info import ForwardAPIInfo, BackwardAPIInfo +from ..common.utils import check_file_or_directory_path, create_directory +from ...common.file_check import check_path_before_create +from ...common.file_check import FileOpen, FileCheckConst, FileChecker, change_mode +from ..common.config import msCheckerConfig + + +lock = threading.Lock() +proc_lock = multiprocessing.Lock() + + +def write_api_info_json(api_info): + from ..dump.dump import DumpUtil + dump_path = msCheckerConfig.dump_path + dump_path = os.path.join(msCheckerConfig.dump_path, "step" + str((DumpUtil.call_num - 1) if msCheckerConfig.enable_dataloader else DumpUtil.call_num)) + check_path_before_create(dump_path) + create_directory(dump_path) + rank = api_info.rank + if isinstance(api_info, ForwardAPIInfo): + file_path = os.path.join(dump_path, f'forward_info_{rank}.json') + stack_file_path = os.path.join(dump_path, f'stack_info_{rank}.json') + write_json(file_path, api_info.api_info_struct) + 
write_json(stack_file_path, api_info.stack_info_struct, indent=4) + + elif isinstance(api_info, BackwardAPIInfo): + file_path = os.path.join(dump_path, f'backward_info_{rank}.json') + write_json(file_path, api_info.grad_info_struct) + else: + raise ValueError(f"Invalid api_info type {type(api_info)}") + + +def write_json(file_path, data, indent=None): + check_file_or_directory_path(os.path.dirname(file_path), True) + with proc_lock, lock, FileOpen(file_path, 'a+') as f: + fcntl.flock(f, fcntl.LOCK_EX) + try: + f.seek(0, os.SEEK_END) + current_position = f.tell() + if current_position > 0: + f.seek(current_position - 1, os.SEEK_SET) + f.truncate() + if f.tell() > 3: + f.seek(f.tell() - 1, os.SEEK_SET) + f.truncate() + f.write(',\n') + f.write(json.dumps(data, indent=indent)[1:-1] + '\n}') + else: + change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) + f.write('{\n' + json.dumps(data, indent=indent)[1:] + '\n') + except Exception as e: + raise ValueError(f"Json save failed:{e}") from e + finally: + fcntl.flock(f, fcntl.LOCK_UN) + + +def initialize_output_json(): + dump_path = msCheckerConfig.dump_path + check_path_before_create(dump_path) + create_directory(dump_path) + dump_path_checker = FileChecker(dump_path, FileCheckConst.DIR) + dump_path = dump_path_checker.common_check() + files = ['forward_info.json', 'backward_info.json', 'stack_info.json'] + for file in files: + file_path = os.path.join(dump_path, file) + if os.path.exists(file_path): + raise ValueError(f"file {file_path} already exists, please remove it first or use a new dump path") diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/.keep b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/__init__.py 
b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/hook_module.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/hook_module.py new file mode 100644 index 0000000000000000000000000000000000000000..02d5fa5500e470a158b980ff889ab4d7a7ec25bf --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/hook_module.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + + +import functools + +import torch +import torch.nn as nn +import torch.utils.hooks as full_hooks + +module_count = {} +g_stop_hook = False + + +class HOOKModule(nn.Module): + + def __init__(self, hook) -> None: + super(HOOKModule, self).__init__() + self.has_overflow = False + self.input_args = tuple() + self.input_kwargs = dict() + self._enable_hook = True + prefix = "" + if hasattr(self, "prefix_op_name_"): + prefix = self.prefix_op_name_ + + if prefix not in module_count: + module_count[prefix] = 1 + prefix += '0' + else: + module_count[prefix] += 1 + prefix = prefix + str(module_count[prefix] - 1) + + self.register_forward_hook(hook(prefix, "forward")) + self.register_backward_hook(hook(prefix, "backward")) + + def __call__(self, *inputs, **kwargs): + changed = False + global g_stop_hook + if g_stop_hook: + self._enable_hook = False + else: + g_stop_hook = True + changed = True + result = self._call_func(*inputs, **kwargs) + if changed: + g_stop_hook = False + return result + + def _call_func(self, *inputs, **kwargs): + if self._enable_hook: + full_backward_hooks, non_full_backward_hooks = [], [] + if len(self._backward_hooks) > 0: + full_backward_hooks, non_full_backward_hooks = self._get_backward_hooks() + for hook in self._forward_pre_hooks.values(): + result = hook(self, inputs) + if result is not None: + if not isinstance(result, tuple): + result = (result,) + inputs = result + bw_hook = None + if len(full_backward_hooks) > 0: + bw_hook = full_hooks.BackwardHook(self, full_backward_hooks) + inputs = bw_hook.setup_input_hook(inputs) + self.input_args = inputs + self.input_kwargs = kwargs + if torch._C._get_tracing_state(): + result = self._slow_forward(*inputs, **kwargs) + else: + result = self.forward(*inputs, **kwargs) + for hook in self._forward_hooks.values(): + hook_result = hook(self, inputs, result) + if hook_result is not None: + result = hook_result + if bw_hook: + result = bw_hook.setup_output_hook(result) + if 
len(non_full_backward_hooks) > 0: + var = result + while not isinstance(var, torch.Tensor): + if isinstance(var, dict): + var = next((v for v in var.values() if isinstance(v, torch.Tensor))) + elif isinstance(var, (list, tuple)): + if var: + var = var[0] + else: + return result + else: + return result + grad_fn = var.grad_fn + if grad_fn is not None: + for hook in non_full_backward_hooks: + wrapper = functools.partial(hook, self) + functools.update_wrapper(wrapper, hook) + grad_fn.register_hook(wrapper) + self._maybe_warn_non_full_backward_hook(inputs, result, grad_fn) + return result + else: + forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.forward) + return forward_call(*inputs, **kwargs) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/register_hook.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/register_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..eee0d6c5d665470fbeaf49938cbbed1693c5f623 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/register_hook.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +import torch + +from api_accuracy_checker.hook_module import wrap_torch, wrap_functional, wrap_tensor + + +def initialize_hook(hook): + wrap_tensor.wrap_tensor_ops_and_bind(hook) + for attr_name in dir(wrap_tensor.HOOKTensor): + if attr_name.startswith("wrap_"): + setattr(torch.Tensor, attr_name[5:], getattr(wrap_tensor.HOOKTensor, attr_name)) + + wrap_torch.wrap_torch_ops_and_bind(hook) + for attr_name in dir(wrap_torch.HOOKTorchOP): + if attr_name.startswith("wrap_"): + setattr(torch, attr_name[5:], getattr(wrap_torch.HOOKTorchOP, attr_name)) + + wrap_functional.wrap_functional_ops_and_bind(hook) + for attr_name in dir(wrap_functional.HOOKFunctionalOP): + if attr_name.startswith("wrap_"): + setattr(torch.nn.functional, attr_name[5:], getattr(wrap_functional.HOOKFunctionalOP, attr_name)) + diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/support_wrap_ops.yaml b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/support_wrap_ops.yaml new file mode 100644 index 0000000000000000000000000000000000000000..acd4cc0e6e658dd4278f6a67c4f0e8fc288efde6 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/support_wrap_ops.yaml @@ -0,0 +1,999 @@ +# Copyright (c) 2023 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# List of ops that register hooks + +functional: + - conv1d + - conv2d + - conv3d + - conv_transpose1d + - conv_transpose2d + - conv_transpose3d + - conv_tbc + - avg_pool1d + - avg_pool2d + - avg_pool3d + - fractional_max_pool2d_with_indices + - fractional_max_pool2d + - fractional_max_pool3d_with_indices + - fractional_max_pool3d + - max_pool1d_with_indices + - max_pool1d + - max_pool2d_with_indices + - max_pool2d + - max_pool3d_with_indices + - max_pool3d + - max_unpool1d + - max_unpool2d + - max_unpool3d + - lp_pool2d + - lp_pool1d + - adaptive_max_pool1d_with_indices + - adaptive_max_pool1d + - adaptive_max_pool2d_with_indices + - adaptive_max_pool2d + - adaptive_max_pool3d_with_indices + - adaptive_max_pool3d + - adaptive_avg_pool1d + - adaptive_avg_pool2d + - adaptive_avg_pool3d + - dropout + - alpha_dropout + - dropout2d + - dropout3d + - feature_alpha_dropout + - threshold + - threshold_ + - relu + - relu_ + - glu + - hardtanh + - hardtanh_ + - relu6 + - elu + - elu_ + - selu + - selu_ + - celu + - celu_ + - leaky_relu + - leaky_relu_ + - prelu + - rrelu + - rrelu_ + - logsigmoid + - gelu + - hardshrink + - tanhshrink + - softsign + - softplus + - softmin + - softmax + - gumbel_softmax + - log_softmax + - softshrink + - tanh + - sigmoid + - hardsigmoid + - linear + - bilinear + - silu + - hardswish + - embedding + - embedding_bag + - batch_norm + - instance_norm + - layer_norm + - group_norm + - local_response_norm + - ctc_loss + - nll_loss + - poisson_nll_loss + - gaussian_nll_loss + - kl_div + - cross_entropy + - binary_cross_entropy + - binary_cross_entropy_with_logits + - smooth_l1_loss + - l1_loss + - mse_loss + - margin_ranking_loss + - hinge_embedding_loss + - multilabel_margin_loss + - soft_margin_loss + - multilabel_soft_margin_loss + - cosine_embedding_loss + - multi_margin_loss + - pixel_shuffle + - pixel_unshuffle + - channel_shuffle + - upsample + - interpolate + - upsample_nearest + - upsample_bilinear + - grid_sample + - affine_grid + - 
pad + - pairwise_distance + - pdist + - cosine_similarity + - one_hot + - triplet_margin_loss + - triplet_margin_with_distance_loss + - normalize + - unfold + - fold + - multi_head_attention_forward + +tensor: + - __add__ + - __and__ + - __bool__ + - __div__ + - __eq__ + - __ge__ + - __gt__ + - __iadd__ + - __iand__ + - __idiv__ + - __ifloordiv__ + - __ilshift__ + - __imod__ + - __imul__ + - __ior__ + - __irshift__ + - __isub__ + - __ixor__ + - __lshift__ + - __matmul__ + - __mod__ + - __mul__ + - __nonzero__ + - __or__ + - __radd__ + - __rmul__ + - __rshift__ + - __sub__ + - __truediv__ + - __xor__ + - abs + - abs_ + - absolute + - absolute_ + - acos + - acos_ + - acosh + - acosh_ + - add + - add_ + - addbmm + - addbmm_ + - addcdiv + - addcdiv_ + - addcmul + - addcmul_ + - addmm + - addmm_ + - addmv + - addmv_ + - addr + - addr_ + - align_as + - align_to + - all + - allclose + - amax + - amin + - angle + - any + - arccos + - arccos_ + - arccosh + - arccosh_ + - arcsin + - arcsin_ + - arcsinh + - arcsinh_ + - arctan + - arctan_ + - arctanh + - arctanh_ + - argmax + - argmin + - argsort + - asin + - asin_ + - asinh + - asinh_ + - atan + - atan2 + - atan2_ + - atan_ + - atanh + - atanh_ + - baddbmm + - baddbmm_ + - bernoulli + - bernoulli_ + - bincount + - bitwise_and + - bitwise_and_ + - bitwise_not + - bitwise_not_ + - bitwise_or + - bitwise_or_ + - bitwise_xor + - bitwise_xor_ + - bmm + - broadcast_to + - cauchy_ + - ceil + - ceil_ + - cholesky + - chunk + - clamp + - cholesky_solve + - cholesky_inverse + - clamp_ + - clamp_max + - clamp_max_ + - clip + - clamp_min + - clamp_min_ + - clip_ + - copysign + - copysign_ + - cos + - cos_ + - cosh + - cosh_ + - count_nonzero + - cummax + - cummin + - cumprod + - cumprod_ + - cumsum + - cumsum_ + - deg2rad + - deg2rad_ + - det + - diag + - diag_embed + - diagflat + - diagonal + - diff + - dist + - digamma + - digamma_ + - div + - div_ + - divide + - divide_ + - dot + - eig + - eq + - eq_ + - erf + - equal + - erf_ + - 
erfc + - erfc_ + - erfinv + - erfinv_ + - exp + - exp2 + - exp2_ + - expm1 + - exp_ + - expm1_ + - exponential_ + - fill_ + - fix + - fill_diagonal_ + - fix_ + - flip + - fliplr + - flatten + - flipud + - float_power + - float_power_ + - floor + - floor_ + - floor_divide + - floor_divide_ + - fmax + - fmin + - fmod + - fmod_ + - frac + - frac_ + - gather + - gcd + - gcd_ + - ge + - ge_ + - geometric_ + - geqrf + - ger + - greater + - greater_ + - gt + - gt_ + - greater_equal + - greater_equal_ + - hardshrink + - heaviside + - heaviside_ + - histc + - hypot + - hypot_ + - igamma + - igamma_ + - igammac + - igammac_ + - index_add + - index_add_ + - inverse + - index_copy + - index_copy_ + - index_fill + - index_fill_ + - index_put + - index_put_ + - inner + - index_select + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - isreal + - kron + - kthvalue + - lcm + - lcm_ + - ldexp + - ldexp_ + - le + - le_ + - lerp + - lerp_ + - where + - less + - less_ + - less_equal + - less_equal_ + - lgamma + - lgamma_ + - log + - log10 + - log10_ + - log1p + - log1p_ + - log2 + - log2_ + - log_ + - log_normal_ + - log_softmax + - logcumsumexp + - logdet + - logaddexp + - logaddexp2 + - logical_and + - logical_and_ + - logical_not + - logit + - logical_not_ + - logical_or + - logical_or_ + - logical_xor + - logical_xor_ + - logit_ + - logsumexp + - lstsq + - lt + - lt_ + - lu_solve + - map2_ + - map_ + - masked_fill + - matmul + - masked_fill_ + - masked_scatter + - masked_scatter_ + - masked_select + - matrix_exp + - max + - maximum + - mean + - matrix_power + - median + - min + - minimum + - mm + - mode + - msort + - mul + - mul_ + - multinomial + - multiply + - multiply_ + - mv + - mvlgamma + - mvlgamma_ + - nansum + - narrow + - narrow_copy + - ne + - ne_ + - neg + - neg_ + - negative + - negative_ + - nonzero + - normal_ + - not_equal + - not_equal_ + - permute + - pinverse + - polygamma + - pow + - pow_ + - polygamma_ + - prelu + - prod + - put_ + - 
rad2deg + - rad2deg_ + - ravel + - real + - reciprocal + - reciprocal_ + - relu + - relu_ + - remainder + - repeat_interleave + - reshape + - remainder_ + - renorm + - renorm_ + - repeat + - reshape_as + - resize_ + - resize_as_ + - roll + - rot90 + - round + - round_ + - rsqrt + - rsqrt_ + - scatter + - scatter_ + - scatter_add + - scatter_add_ + - select + - sgn + - sgn_ + - sigmoid + - sigmoid_ + - sign + - sign_ + - signbit + - sin + - sin_ + - sinc + - sinc_ + - sinh + - sinh_ + - slogdet + - smm + - softmax + - solve + - sort + - split_with_sizes + - sqrt + - sqrt_ + - square + - square_ + - squeeze + - squeeze_ + - sspaddmm + - std + - sub + - sub_ + - sum + - sum_to_size + - svd + - symeig + - t + - t_ + - take + - tan + - tan_ + - tanh + - tanh_ + - tensor_split + - tile + - topk + - transpose + - transpose_ + - triangular_solve + - tril + - tril_ + - triu + - true_divide + - triu_ + - true_divide_ + - trunc + - trunc_ + - type_as + - unbind + - unflatten + - unfold + - unsafe_chunk + - unsqueeze + - unsafe_split + - unsafe_split_with_sizes + - var + - vdot + - unsqueeze_ + - view_as + - xlogy + - xlogy_ + +torch: + - _adaptive_avg_pool2d + - _add_relu + - _add_relu_ + - _aminmax + - _batch_norm_impl_index + - _convolution + - abs + - abs_ + - absolute + - acos + - acos_ + - acosh + - acosh_ + - adaptive_avg_pool1d + - adaptive_max_pool1d + - add + - addbmm + - addcdiv + - addcmul + - addmm + - addmv + - addmv_ + - addr + - amax + - affine_grid_generator + - align_tensors + - all + - alpha_dropout + - amin + - alpha_dropout_ + - angle + - any + - arange + - arccos + - arccos_ + - arccosh + - arccosh_ + - arcsin + - arcsin_ + - arcsinh + - arcsinh_ + - arctan + - arctan_ + - arctanh + - arctanh_ + - argmax + - argmin + - argsort + - asin + - asin_ + - asinh + - asinh_ + - atan + - atan2 + - atan_ + - atanh + - atanh_ + - atleast_1d + - atleast_2d + - atleast_3d + - avg_pool1d + - baddbmm + - bartlett_window + - batch_norm_backward_elemt + - 
batch_norm_backward_reduce + - batch_norm_elemt + - batch_norm_gather_stats + - batch_norm_gather_stats_with_counts + - bernoulli + - batch_norm_stats + - batch_norm_update_stats + - bilinear + - bincount + - binomial + - binary_cross_entropy_with_logits + - bitwise_and + - bitwise_not + - bitwise_or + - bitwise_xor + - blackman_window + - block_diag + - bmm + - broadcast_tensors + - broadcast_to + - cartesian_prod + - cat + - cdist + - ceil + - ceil_ + - celu + - celu_ + - chain_matmul + - channel_shuffle + - cholesky + - cholesky_inverse + - cholesky_solve + - choose_qparams_optimized + - chunk + - clamp + - clamp_ + - clamp_max + - clamp_max_ + - clamp_min + - clamp_min_ + - clip + - clip_ + - clone + - column_stack + - combinations + - constant_pad_nd + - conv1d + - conv2d + - conv3d + - conv_tbc + - conv_transpose1d + - conv_transpose2d + - conv_transpose3d + - cos + - convolution + - copysign + - cos_ + - cosh + - cosh_ + - cosine_embedding_loss + - cosine_similarity + - count_nonzero + - cross + - ctc_loss + - cummax + - cummin + - cumprod + - cumsum + - deg2rad + - deg2rad_ + - det + - diag + - diag_embed + - diff + - diagflat + - diagonal + - digamma + - dist + - div + - divide + - dot + - dropout + - dropout_ + - dsmm + - dstack + - eig + - einsum + - embedding + - embedding_bag + - embedding_renorm_ + - eq + - equal + - erf + - erf_ + - erfc + - erfc_ + - erfinv + - exp + - exp2 + - exp2_ + - exp_ + - expm1 + - expm1_ + - eye + - feature_dropout + - feature_alpha_dropout + - feature_alpha_dropout_ + - feature_dropout_ + - fix + - fill_ + - fix_ + - flatten + - flip + - fliplr + - flipud + - float_power + - floor + - floor_ + - floor_divide + - fmax + - fmin + - fmod + - frac + - frac_ + - full + - frobenius_norm + - full_like + - gather + - gcd + - gcd_ + - ge + - geqrf + - ger + - greater + - greater_equal + - grid_sampler + - grid_sampler_2d + - group_norm + - grid_sampler_3d + - gru + - gru_cell + - gt + - hamming_window + - hann_window + - hardshrink 
+ - heaviside + - hinge_embedding_loss + - histc + - hsmm + - hspmm + - hstack + - hypot + - igamma + - igammac + - index_add + - index_copy + - inner + - index_fill + - index_put + - index_put_ + - index_select + - instance_norm + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - istft + - kaiser_window + - kl_div + - kron + - kthvalue + - layer_norm + - lcm + - lcm_ + - ldexp + - ldexp_ + - le + - lerp + - less + - less_equal + - lgamma + - linspace + - log + - log10 + - log10_ + - log1p + - log1p_ + - log2 + - log2_ + - log_softmax + - log_ + - logaddexp + - logaddexp2 + - logcumsumexp + - logdet + - logical_and + - logical_not + - logical_or + - logical_xor + - logit + - logit_ + - logspace + - logsumexp + - lstm + - lstm_cell + - lstsq + - lt + - lu_solve + - masked_fill + - margin_ranking_loss + - masked_scatter + - masked_select + - matrix_exp + - matmul + - matrix_power + - matrix_rank + - max + - max_pool1d + - max_pool2d + - max_pool1d_with_indices + - max_pool3d + - maximum + - mean + - median + - min + - minimum + - mm + - mode + - moveaxis + - movedim + - msort + - mul + - multinomial + - multiply + - mv + - mvlgamma + - nan_to_num + - nan_to_num_ + - nanmedian + - nansum + - narrow + - native_batch_norm + - native_group_norm + - narrow_copy + - native_layer_norm + - native_norm + - ne + - neg + - negative + - neg_ + - negative_ + - nextafter + - nonzero + - norm_except_dim + - normal + - not_equal + - nuclear_norm + - pairwise_distance + - pdist + - pinverse + - pixel_shuffle + - pixel_unshuffle + - poisson + - poisson_nll_loss + - polar + - polygamma + - pow + - prelu + - prod + - rad2deg + - promote_types + - rad2deg_ + - range + - ravel + - real + - reciprocal + - relu + - reciprocal_ + - relu_ + - remainder + - renorm + - repeat_interleave + - reshape + - resize_as_ + - roll + - rot90 + - round + - round_ + - rrelu + - rrelu_ + - rsqrt + - row_stack + - rsqrt_ + - rsub + - saddmm + - scalar_tensor + - scatter + - select + - 
scatter_add + - searchsorted + - selu + - selu_ + - sgn + - sigmoid + - sigmoid_ + - sign + - signbit + - sin + - sin_ + - sinc + - sinc_ + - sinh + - sinh_ + - slogdet + - smm + - softmax + - solve + - sort + - sparse_coo_tensor + - square + - split_with_sizes + - spmm + - sqrt + - sqrt_ + - square_ + - squeeze + - sspaddmm + - stack + - std + - std_mean + - sub + - subtract + - sum + - svd + - swapaxes + - swapdims + - symeig + - t + - take + - tan + - tan_ + - tanh + - tanh_ + - tensordot + - tensor_split + - threshold + - threshold_ + - tile + - topk + - transpose + - trapz + - triangular_solve + - tril + - tril_indices + - triplet_margin_loss + - triu + - triu_indices + - true_divide + - trunc + - trunc_ + - unique_consecutive + - xlogy + - unbind + - unique_dim + - unsafe_chunk + - unsafe_split + - vander + - var + - vdot + - unsafe_split_with_sizes + - unsqueeze + - var_mean + - vstack + - where + - xlogy_ diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/utils.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6641807f929babeed3af30cf14b043d1e4f7913c --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/utils.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os +import yaml + +from ...common.file_check import FileOpen + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + Ops = yaml.safe_load(f) + WrapFunctionalOps = Ops.get('functional') + WrapTensorOps = Ops.get('tensor') + WrapTorchOps = Ops.get('torch') \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_functional.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..056c1d047eb592f0006e3632eaa5597eba5630da --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_functional.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import torch + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard +from ..common.config import msCheckerConfig + +for f in dir(torch.nn.functional): + locals().update({f: getattr(torch.nn.functional, f)}) + + +def get_functional_ops(): + global WrapFunctionalOps + _all_functional_ops = dir(torch.nn.functional) + if msCheckerConfig.white_list: + return set(WrapFunctionalOps) & set(_all_functional_ops) & set(msCheckerConfig.white_list) + else: + return set(WrapFunctionalOps) & set(_all_functional_ops) + + +class HOOKFunctionalOP(object): + pass + + +class FunctionalOPTemplate(HOOKModule): + def __init__(self, op_name, hook, need_hook=True): + self.op_name_ = op_name + self.prefix_op_name_ = "Functional*" + str(op_name) + "*" + if need_hook: + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return eval(self.op_name_)(*args, **kwargs) + + +def wrap_functional_op(op_name, hook): + def functional_op_template(*args, **kwargs): + return FunctionalOPTemplate(op_name, hook)(*args, **kwargs) + + return functional_op_template + + +def wrap_functional_ops_and_bind(hook): + _functional_ops = get_functional_ops() + for op_name in _functional_ops: + setattr(HOOKFunctionalOP, "wrap_" + op_name, wrap_functional_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_tensor.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..f7791cdc9ac8e2084fc63d76e3819e137f4ea9d7 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_tensor.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import torch + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard +from ..common.config import msCheckerConfig +from ...common.utils import parameter_adapter + + +def get_tensor_ops(): + global WrapTensorOps + _tensor_ops = dir(torch._C._TensorBase) + if msCheckerConfig.white_list: + return set(WrapTensorOps) & set(_tensor_ops) & set(msCheckerConfig.white_list) + else: + return set(WrapTensorOps) & set(_tensor_ops) + + +class HOOKTensor(object): + pass + + +class TensorOPTemplate(HOOKModule): + + def __init__(self, op_name, hook, need_hook=True): + self.op_name_ = op_name + self.prefix_op_name_ = "Tensor*" + str(op_name) + "*" + if need_hook: + super().__init__(hook) + + @torch_device_guard + @parameter_adapter + def forward(self, *args, **kwargs): + return getattr(torch._C._TensorBase, str(self.op_name_))(*args, **kwargs) + + +def wrap_tensor_op(op_name, hook): + + def tensor_op_template(*args, **kwargs): + return TensorOPTemplate(op_name, hook)(*args, **kwargs) + + return tensor_op_template + + +def wrap_tensor_ops_and_bind(hook): + _tensor_ops = get_tensor_ops() + for op_name in _tensor_ops: + setattr(HOOKTensor, "wrap_" + str(op_name), wrap_tensor_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_torch.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/hook_module/wrap_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..aab245b5d21daff0e0ea44e4073333c6854f95ac --- /dev/null +++ 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import torch

from .hook_module import HOOKModule
from .utils import WrapTorchOps
from ..common.utils import torch_device_guard
from ..common.config import msCheckerConfig


def get_torch_ops():
    """Return the set of torch-level op names to wrap.

    Intersects the yaml-configured op list with the ops exposed by
    torch._C._VariableFunctionsClass, narrowed by the user white list
    when configured.
    """
    _torch_ops = dir(torch._C._VariableFunctionsClass)
    if msCheckerConfig.white_list:
        return set(WrapTorchOps) & set(_torch_ops) & set(msCheckerConfig.white_list)
    return set(WrapTorchOps) & set(_torch_ops)


class HOOKTorchOP(object):
    """Namespace object that receives the generated wrap_<op> attributes."""
    pass


class TorchOPTemplate(HOOKModule):
    """Hookable proxy around a single torch.* op."""

    def __init__(self, op_name, hook, need_hook=True):
        self.op_name_ = op_name
        self.prefix_op_name_ = "Torch*" + str(op_name) + "*"
        if need_hook:
            super().__init__(hook)

    def input_param_need_adapt(self):
        """Return True for ops that take a sequence of tensors as one argument."""
        special_op_list = ["broadcast_tensors", "block_diag"]
        for item in special_op_list:
            if item in self.op_name_:
                return True
        return False

    def einsum_adapt(self, *args):
        """Normalize einsum arguments to (equation, operands).

        Supports both the classic string form einsum(eq, op1, op2, ...) and
        the sublist form einsum(op1, subscripts1, op2, subscripts2, ...,
        [output_subscripts]), mirroring torch.einsum's argument handling.
        """
        if len(args) < 2:
            raise ValueError('einsum(): must specify the equation string and at least one operand, '
                             'or at least one operand and its subscripts list')
        equation = None
        operands = None
        if isinstance(args[0], torch.Tensor):
            # Sublist format: build the equation string from integer subscripts.
            def parse_subscript(n: int) -> str:
                if n == Ellipsis:
                    return '...'
                if n >= 0 and n < 26:
                    return chr(ord('A') + n)
                if n >= 26 and n < 52:
                    return chr(ord('a') + n - 26)
                raise ValueError('einsum(): subscript in subscript list is not within the valid range [0, 52]')
            equation = ','.join(''.join(parse_subscript(script) for script in arg) for arg in args[1::2])

            # An odd argument count means the last item is the output subscripts.
            if len(args) % 2 == 1:
                equation += '->' + ''.join(parse_subscript(script) for script in args[-1])
                operands = args[:-1:2]
            else:
                operands = args[::2]
        else:
            equation = args[0]
            operands = args[1:]

        # A single list/tuple operand is unpacked and re-processed.
        if len(operands) == 1 and isinstance(operands[0], (list, tuple)):
            _operands = operands[0]
            return self.einsum_adapt(equation, *_operands)
        return equation, operands

    @torch_device_guard
    def forward(self, *args, **kwargs):
        if self.input_param_need_adapt():
            # These ops expect the tensors packed into a single sequence arg.
            return getattr(torch._C._VariableFunctionsClass, str(self.op_name_))(args, **kwargs)
        else:
            if self.op_name_ == 'einsum':
                args = self.einsum_adapt(*args)
            return getattr(torch._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs)


def wrap_torch_op(op_name, hook):
    """Return a plain function that forwards to a hooked TorchOPTemplate."""

    def torch_op_template(*args, **kwargs):
        return TorchOPTemplate(op_name, hook)(*args, **kwargs)

    return torch_op_template


def wrap_torch_ops_and_bind(hook):
    """Attach a wrap_<op> wrapper to HOOKTorchOP for every selected op."""
    _torch_ops = get_torch_ops()
    for op_name in _torch_ops:
        setattr(HOOKTorchOP, "wrap_" + op_name, wrap_torch_op(op_name, hook))
file mode 100644 index 0000000000000000000000000000000000000000..aa0b29d8d057ff806d5f5e82a35c5ce085dee1f3 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/accuracy_checking_result.png differ diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_details.png b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_details.png new file mode 100644 index 0000000000000000000000000000000000000000..c3fd909a8d187fd6a725c7f3cc6798989d3fa0cf Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_details.png differ diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_result.png b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_result.png new file mode 100644 index 0000000000000000000000000000000000000000..2b95897031441408f6a88185e3cda36e4fea8049 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/img/api_precision_compare_result.png differ diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/.keep b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/data_generate.py new file mode 100644 index 0000000000000000000000000000000000000000..3e4b16ed3849211ac418633dfa1843c969ad337f --- /dev/null +++ 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import os
import math
import torch
import numpy

from ..common.utils import Const, check_file_or_directory_path, check_object_type, print_warn_log, \
    print_error_log, get_full_data_path, CompareException

TORCH_TYPE = ["torch.device", "torch.dtype"]
TENSOR_DATA_LIST = ["torch.Tensor", "torch.nn.parameter.Parameter"]
FLOAT_TYPE = ['torch.float32', 'torch.float', 'torch.float64', 'torch.double', 'torch.float16',
              'torch.half', 'torch.bfloat16']
NUMPY_TYPE = ["numpy.int8", "numpy.int16", "numpy.int32", "numpy.int64", "numpy.uint8", "numpy.uint16", "numpy.uint32",
              "numpy.uint64", "numpy.float16", "numpy.float32", "numpy.float64", "numpy.float128", "numpy.complex64",
              "numpy.complex128", "numpy.complex256", "numpy.bool_", "numpy.string_", "numpy.bytes_", "numpy.unicode_"]


def gen_data(info, need_grad, convert_type, real_data_path=None):
    """
    Function Description:
        Based on arg basic information, generate arg data
    Parameter:
        info: arg basic information. Dict
        need_grad: set Tensor grad for backward
        convert_type: convert ori_type to dist_type flag.
        real_data_path: the root directory for storing real data.
    """
    check_object_type(info, dict)
    data_type = info.get('type')
    data_path = info.get('datapath')
    data_path = get_full_data_path(data_path, real_data_path)
    if data_type in TENSOR_DATA_LIST:
        if data_path:
            data = gen_real_tensor(data_path, convert_type)
        else:
            data = gen_random_tensor(info, convert_type)
        if info.get('requires_grad') and need_grad:
            data.requires_grad_(True)
            # Multiply by 1 to create a non-leaf copy, then retain_grad so the
            # gradient is still observable after backward.
            temp_data = data * 1
            data = temp_data.type_as(data)
            data.retain_grad()
    elif data_type.startswith("numpy"):
        if data_type not in NUMPY_TYPE:
            raise Exception("{} is not supported now".format(data_type))
        data = info.get("value")
        try:
            # NOTE: eval() on a dtype string from the input json; it is bounded
            # by the NUMPY_TYPE whitelist check above.
            data = eval(data_type)(data)
        except Exception as err:
            print_error_log("Failed to convert the type to numpy: %s" % str(err))
    else:
        data = info.get('value')
        if info.get("type") == "slice":
            data = slice(*data)
    return data


def gen_real_tensor(data_path, convert_type):
    """
    Function Description:
        Based on API data path, generate input parameters real data
    Parameter:
        data_path: API data path
        convert_type: convert ori_type to dist_type flag.
    """
    data_path = os.path.realpath(data_path)
    check_file_or_directory_path(data_path)
    if not data_path.endswith('.pt') and not data_path.endswith('.npy'):
        error_info = f"The file: {data_path} is not a pt or numpy file."
        raise CompareException(CompareException.INVALID_FILE_ERROR, error_info)
    if data_path.endswith('.pt'):
        data = torch.load(data_path)
    else:
        data_np = numpy.load(data_path)
        data = torch.from_numpy(data_np)
    if convert_type:
        ori_dtype = Const.CONVERT.get(convert_type)[0]
        dist_dtype = Const.CONVERT.get(convert_type)[1]
        if str(data.dtype) == ori_dtype:
            # NOTE: eval() of a dtype string taken from the Const.CONVERT table
            # (trusted, tool-internal).
            data = data.type(eval(dist_dtype))
    return data


def gen_random_tensor(info, convert_type):
    """
    Function Description:
        Based on API MAX and MIN, generate input parameters random data
    Parameter:
        info: API data info
        convert_type: convert ori_type to dist_type flag.
    """
    check_object_type(info, dict)
    low, high = info.get('Min'), info.get('Max')
    low_origin, high_origin = info.get('Min_origin'), info.get('Max_origin')
    low_info = [low, low_origin]
    high_info = [high, high_origin]
    data_dtype = info.get('dtype')
    shape = tuple(info.get('shape'))
    if not isinstance(low, (int, float)) or not isinstance(high, (int, float)):
        error_info = f'Data info Min: {low} , Max: {high}, info type must be int or float.'
        raise CompareException(CompareException.INVALID_PARAM_ERROR, error_info)
    if data_dtype == "torch.bool":
        data = gen_bool_tensor(low, high, shape)
    else:
        data = gen_common_tensor(low_info, high_info, shape, data_dtype, convert_type)
    return data


def gen_common_tensor(low_info, high_info, shape, data_dtype, convert_type):
    """
    Function Description:
        Based on API basic information, generate int or float tensor
    Parameter:
        low_info: [low, low_origin], low is the minimum value in the tensor removed inf and nan,
            low_origin is the original minimum value in the tensor
        high_info: [high, high_origin], high is the maximum value in the tensor removed inf and nan,
            high_origin is the original maximum value in the tensor
        shape: The shape of Tensor
        data_dtype: The data type of Tensor
        convert_type: convert ori_type to dist_type flag.
    """
    if convert_type:
        ori_dtype = Const.CONVERT.get(convert_type)[0]
        if ori_dtype == data_dtype:
            data_dtype = Const.CONVERT.get(convert_type)[1]
    low, low_origin = low_info[0], low_info[1]
    high, high_origin = high_info[0], high_info[1]
    if data_dtype in FLOAT_TYPE:
        if math.isnan(high):
            tensor = torch._C._VariableFunctionsClass.full(shape, float('nan'), dtype=eval(data_dtype))
            return tensor
        # high_origin is only present in the new json format; when it is set
        # and high is +/-inf, the original tensor was entirely inf/-inf.
        if high_origin and high in [float('inf'), float('-inf')]:
            tensor = torch._C._VariableFunctionsClass.full(shape, high, dtype=eval(data_dtype))
            tensor[-1] = low
            return tensor
        low_scale, high_scale = low, high
        dtype_finfo = torch.finfo(eval(data_dtype))
        # Old json format: if high/low themselves are +/-inf, scale using the
        # dtype's max/min values instead.
        if high == float('inf'):
            high_scale = dtype_finfo.max
        elif high == float('-inf'):
            high_scale = dtype_finfo.min
        if low == float('inf'):
            low_scale = dtype_finfo.max
        elif low == float('-inf'):
            low_scale = dtype_finfo.min

        scale = high_scale - low_scale
        rand01 = torch.rand(shape, dtype=eval(data_dtype))
        tensor = rand01 * scale + low_scale
    elif 'int' in data_dtype or 'long' in data_dtype:
        low, high = int(low), int(high)
        tensor = torch.randint(low, high + 1, shape, dtype=eval(data_dtype))
    else:
        print_error_log('Dtype is not supported: ' + data_dtype)
        raise NotImplementedError()
    if tensor.nelement() == 0:
        return tensor
    # Pin the recorded extremes (and nan/inf markers) into the generated tensor
    # so the reproduced input covers the original value range.
    tmp_tensor = tensor.reshape(-1)
    if high_origin and math.isnan(high_origin):
        if tmp_tensor.numel() <= 2:
            tmp_tensor[0] = float('nan')
            tmp_tensor[-1] = high
        else:
            tmp_tensor[0] = low
            tmp_tensor[1] = float('nan')
            tmp_tensor[-1] = high
    else:
        tmp_tensor[0] = low
        tmp_tensor[-1] = high
        if high_origin in [float('inf'), float('-inf')]:
            tmp_tensor[-1] = high_origin
        if low_origin in [float('inf'), float('-inf')]:
            tmp_tensor[0] = low_origin
    data = tmp_tensor.reshape(shape)
    return data


def gen_bool_tensor(low, high, shape):
    """
    Function Description:
        Based on API basic information, generate bool tensor
    Parameter:
        low: The minimum value in Tensor
        high: The max value in Tensor
        shape: The shape of Tensor
    """
    low, high = int(low), int(high)
    if low > high:
        low, high = high, low
    tensor = torch.randint(low, high + 1, shape)
    data = torch.gt(tensor, 0)
    return data


def gen_args(args_info, need_grad=True, convert_type=None, real_data_path=None):
    """
    Function Description:
        Based on API basic information, generate input parameters: args, for API forward running
    Parameter:
        args_info: API basic information. List
        need_grad: set Tensor grad for backward
        convert_type: convert ori_type to dist_type flag.
        real_data_path: the root directory for storing real data.
    """
    check_object_type(args_info, list)
    args_result = []
    for arg in args_info:
        if isinstance(arg, (list, tuple)):
            data = gen_args(arg, need_grad, convert_type, real_data_path)
        elif isinstance(arg, dict):
            data = gen_data(arg, need_grad, convert_type, real_data_path)
        elif arg is None:
            data = None
        else:
            print_warn_log(f'Warning: {arg} is not supported')
            raise NotImplementedError()
        args_result.append(data)
    return args_result


def gen_kwargs(api_info, convert_type=None, real_data_path=None):
    """
    Function Description:
        Based on API basic information, generate input parameters: kwargs, for API forward running
    Parameter:
        api_info: API basic information. Dict
        convert_type: convert ori_type to dist_type flag.
        real_data_path: the root directory for storing real data.
    """
    check_object_type(api_info, dict)
    kwargs_params = api_info.get("input_kwargs")
    for key, value in kwargs_params.items():
        if isinstance(value, (list, tuple)):
            kwargs_params[key] = gen_list_kwargs(value, convert_type, real_data_path)
        elif value is None:
            kwargs_params[key] = None
        elif value.get('type') in TENSOR_DATA_LIST or value.get('type').startswith("numpy"):
            kwargs_params[key] = gen_data(value, True, convert_type, real_data_path)
        elif value.get('type') in TORCH_TYPE:
            gen_torch_kwargs(kwargs_params, key, value)
        else:
            kwargs_params[key] = value.get('value')
    return kwargs_params


def gen_torch_kwargs(kwargs_params, key, value):
    """Materialize a torch.dtype kwarg in place; torch.device entries are left untouched."""
    if value.get('type') != "torch.device":
        # NOTE: eval() of a dtype string from the input json, e.g. "torch.float32".
        kwargs_params[key] = eval(value.get('value'))


def gen_list_kwargs(kwargs_item_value, convert_type, real_data_path=None):
    """
    Function Description:
        When kwargs value is list, generate the list of kwargs result
    Parameter:
        kwargs_item_value: kwargs value before to generate. List
        convert_type: convert ori_type to dist_type flag.
        real_data_path: the root directory for storing real data.
    """
    kwargs_item_result = []
    for item in kwargs_item_value:
        if item.get('type') in TENSOR_DATA_LIST:
            item_value = gen_data(item, False, convert_type, real_data_path)
        else:
            item_value = item.get('value')
        kwargs_item_result.append(item_value)
    return kwargs_item_result


def gen_api_params(api_info, need_grad=True, convert_type=None, real_data_path=None):
    """
    Function Description:
        Based on API basic information, generate input parameters: args, kwargs, for API forward running
    Parameter:
        api_info: API basic information. Dict
        need_grad: set grad for backward
        convert_type: convert ori_type to dist_type flag.
        real_data_path: the root directory for storing real data.
    """
    check_object_type(api_info, dict)
    if convert_type and convert_type not in Const.CONVERT:
        error_info = f"convert_type params not support {convert_type}."
        raise CompareException(CompareException.INVALID_PARAM_ERROR, error_info)
    kwargs_params = gen_kwargs(api_info, convert_type, real_data_path)
    if api_info.get("input_args"):
        args_params = gen_args(api_info.get("input_args"), need_grad, convert_type, real_data_path)
    else:
        print_warn_log(f'Warning: No args in {api_info} ')
        args_params = []
    return args_params, kwargs_params
json.load(file) + input_data.pop("data") + + items = list(forward_data.items()) + total_items = len(items) + chunk_size = total_items // num_splits + split_files = [] + + for i in range(num_splits): + start = i * chunk_size + end = (i + 1) * chunk_size if i < num_splits - 1 else total_items + + split_forward_data = dict(items[start:end]) + temp_data = { + **input_data, + "data":{ + **split_forward_data, + **backward_data + } + } + split_filename = f"temp_part{i}.json" + with FileOpen(split_filename, 'w') as split_file: + json.dump(temp_data, split_file) + split_files.append(split_filename) + + return split_files, total_items + + +def signal_handler(signum, frame): + print_warn_log(f'Signal handler called with signal {signum}') + raise KeyboardInterrupt() + + +signal.signal(signal.SIGINT, signal_handler) +signal.signal(signal.SIGTERM, signal_handler) + + +ParallelUTConfig = namedtuple('ParallelUTConfig', ['api_files', 'out_path', 'num_splits', + 'save_error_data_flag', 'jit_compile_flag', 'device_id', + 'result_csv_path', 'total_items', 'real_data_path']) + + +def run_parallel_ut(config): + processes = [] + device_id_cycle = cycle(config.device_id) + if config.save_error_data_flag: + print_info_log("UT task error datas will be saved") + print_info_log(f"Starting parallel UT with {config.num_splits} processes") + progress_bar = tqdm(total=config.total_items, desc="Total items", unit="items") + + def create_cmd(api_info, dev_id): + dirname, filename = os.path.split(os.path.abspath(__file__)) + run_ut_path = os.path.join(dirname, "run_ut.py") + cmd = [ + sys.executable, run_ut_path, + '-api_info', api_info, + *(['-o', config.out_path] if config.out_path else []), + '-d', str(dev_id), + *(['-j'] if config.jit_compile_flag else []), + *(['-save_error_data'] if config.save_error_data_flag else []), + '-csv_path', config.result_csv_path, + *(['-real_data_path', config.real_data_path] if config.real_data_path else []) + ] + return cmd + + def read_process_output(process): + 
try: + while True: + if process.poll() is not None: + break + output = process.stdout.readline() + if output == '': + break + if '[ERROR]' in output: + print(output, end='') + sys.stdout.flush() + except ValueError as e: + print_warn_log(f"An error occurred while reading subprocess output: {e}") + + def update_progress_bar(progress_bar, result_csv_path): + while any(process.poll() is None for process in processes): + try: + with open(result_csv_path, 'r') as result_file: + completed_items = len(result_file.readlines()) - 1 + progress_bar.update(completed_items - progress_bar.n) + except FileNotFoundError: + print_warn_log(f"Result CSV file not found: {result_csv_path}.") + except Exception as e: + print_error_log(f"An unexpected error occurred while reading result CSV: {e}") + time.sleep(1) + + for api_info in config.api_files: + cmd = create_cmd(api_info, next(device_id_cycle)) + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1) + processes.append(process) + threading.Thread(target=read_process_output, args=(process,), daemon=True).start() + + progress_bar_thread = threading.Thread(target=update_progress_bar, args=(progress_bar, config.result_csv_path)) + progress_bar_thread.start() + + def clean_up(): + progress_bar.close() + for process in processes: + try: + process.terminate() + process.wait(timeout=1) + except subprocess.TimeoutExpired: + process.kill() + for file in config.api_files: + check_link(file) + try: + os.remove(file) + except FileNotFoundError: + print_warn_log(f"File not found and could not be deleted: {file}") + + try: + for process in processes: + process.communicate(timeout=None) + except KeyboardInterrupt: + print_warn_log("Interrupted by user, terminating processes and cleaning up...") + except Exception as e: + print_error_log(f"An unexpected error occurred: {e}") + finally: + if progress_bar.n < config.total_items: + print_warn_log("The UT task has not been completed. 
def prepare_config(args):
    """Validate CLI arguments, split the api_info json into chunks and build a ParallelUTConfig.

    Args:
        args: parsed argparse namespace (api_info_file, out_path, num_splits,
              filter_api, result_csv_path, save_error_data, jit_compile,
              device_id, real_data_path).

    Returns:
        ParallelUTConfig describing the parallel UT run.
    """
    check_link(args.api_info_file)
    api_info = os.path.realpath(args.api_info_file)
    check_file_suffix(api_info, FileCheckConst.JSON_SUFFIX)
    out_path = os.path.realpath(args.out_path) if args.out_path else "./"
    check_path_before_create(out_path)
    create_directory(out_path)
    out_path_checker = FileChecker(out_path, FileCheckConst.DIR, ability=FileCheckConst.WRITE_ABLE)
    out_path = out_path_checker.common_check()
    split_files, total_items = split_json_file(api_info, args.num_splits, args.filter_api)

    if args.result_csv_path:
        # Resume mode: reuse the existing result csv and derive the details csv from it.
        result_csv_path = get_validated_result_csv_path(args.result_csv_path, 'result')
        details_csv_path = get_validated_details_csv_path(result_csv_path)
    else:
        # Fresh run: take the timestamp ONCE so the result/details file names always
        # match (the original called time.strftime twice and could straddle a second).
        timestamp = time.strftime('%Y%m%d%H%M%S')
        result_csv_path = os.path.join(out_path, f"accuracy_checking_result_{timestamp}.csv")
        details_csv_path = os.path.join(out_path, f"accuracy_checking_details_{timestamp}.csv")
        # Instantiated for its side effect (presumably initializes the csv files —
        # matches the original, which never used the bound name).
        Comparator(result_csv_path, details_csv_path, False)
    print_info_log(f"UT task result will be saved in {result_csv_path}")
    print_info_log(f"UT task details will be saved in {details_csv_path}")
    return ParallelUTConfig(split_files, out_path, args.num_splits, args.save_error_data,
                            args.jit_compile, args.device_id, result_csv_path,
                            total_items, args.real_data_path)
def check_tensor_overflow(x):
    """Return True if x (a tensor or a python scalar) contains inf or nan.

    Boolean tensors, empty tensors and non-numeric objects never overflow.
    """
    if isinstance(x, torch.Tensor) and x.numel() != 0 and x.dtype != torch.bool:
        if len(x.shape) == 0:
            # 0-dim tensor: the single value is both max and min.
            tensor_max = x.cpu().detach().float().numpy().tolist()
            tensor_min = tensor_max
        else:
            # Use the public torch.max/torch.min API instead of the private
            # torch._C._VariableFunctionsClass accessor.
            tensor_max = torch.max(x).cpu().detach().float().numpy().tolist()
            tensor_min = torch.min(x).cpu().detach().float().numpy().tolist()
        # inf check; nan check relies on nan != nan.
        return (tensor_max == float('inf') or tensor_min == float('-inf')
                or tensor_max != tensor_max or tensor_min != tensor_min)
    if isinstance(x, (bool, int, float)):
        return x == float('inf') or x == float('-inf') or x != x
    return False


def check_data_overflow(x):
    """Recursively check nested lists/tuples (and their leaves) for inf/nan."""
    if isinstance(x, (tuple, list)) and x:
        return any(check_data_overflow(item) for item in x)
    return check_tensor_overflow(x)
def run_overflow_check(forward_file):
    """Replay every dumped forward API and report overflow diagnostics.

    Args:
        forward_file: path to the dumped api_info json file.

    Exceptions from individual APIs are caught and logged so one failing
    API does not abort the whole sweep.
    """
    print_info_log("start UT test")
    forward_content = get_json_contents(forward_file)
    for api_full_name, api_info_dict in tqdm(forward_content.items()):
        try:
            run_torch_api(api_full_name, api_info_dict)
        except Exception as err:
            # NOTE(review): this splits on '_' while run_torch_api splits the
            # same key on '.' — verify the dump key format before relying on it.
            api_name = api_full_name.split("_", 1)[1].rsplit("_", 2)[0]
            if "not implemented for 'Half'" in str(err):
                print_warn_log(f"API {api_name} not support half tensor in CPU, please add {api_name} to CONVERT_API "
                               f"'fp16_to_fp32' list in accuracy_tools/api_accuracy_check/common/utils.py file.")
            elif "expected scalar type Long" in str(err):
                print_warn_log(f"API {api_name} not support int32 tensor in CPU, please add {api_name} to CONVERT_API "
                               f"'int32_to_int64' list in accuracy_tools/api_accuracy_check/common/utils.py file.")
            else:
                # Pure f-string (the original mixed an f-string with %-formatting;
                # the rendered message is identical).
                print_error_log(f"Run {api_full_name} UT Error: {err}")
% api_full_name) + return + + +def _run_overflow_check_parser(parser): + parser.add_argument("-api_info", "--api_info_file", dest="api_info_file", default="", + help=" The api param tool result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-j", "--jit_compile", dest="jit_compile", help=" whether to turn on jit compile", + default=False, required=False) + parser.add_argument("-d", "--device", dest="device_id", type=int, help=" set NPU device id to run ut", + default=0, required=False) + + +def _run_overflow_check(parser=None): + if not parser: + parser = argparse.ArgumentParser() + _run_overflow_check_parser(parser) + args = parser.parse_args(sys.argv[1:]) + _run_overflow_check_command(args) + + +def _run_overflow_check_command(args): + torch.npu.set_compile_mode(jit_compile=args.jit_compile) + npu_device = "npu:" + str(args.device_id) + check_link(args.api_info_file) + api_info = os.path.realpath(args.api_info_file) + try: + torch.npu.set_device(npu_device) + except Exception as error: + print_error_log(f"Set NPU device id failed. 
device id is: {args.device_id}") + raise NotImplementedError from error + run_overflow_check(api_info) + + +if __name__ == '__main__': + _run_overflow_check() + print_info_log("UT task completed.") diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut.py new file mode 100644 index 0000000000000000000000000000000000000000..abd5d5bdfdb81c07d0fcf19f76c43573c081c56f --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/run_ut.py @@ -0,0 +1,455 @@ +import argparse +import os +import csv +import re +import sys +import time +import gc +from collections import namedtuple +try: + import torch_npu +except ImportError: + is_gpu = True + current_device = "cuda" +else: + is_gpu = False + current_device = "npu" +import torch +from tqdm import tqdm + +from atat.pytorch.api_accuracy_checker.run_ut.data_generate import gen_api_params, gen_args +from atat.pytorch.api_accuracy_checker.common.utils import print_info_log, print_warn_log, get_json_contents, \ + api_info_preprocess, print_error_log, initialize_save_path, Const, create_directory +from atat.pytorch.api_accuracy_checker.compare.compare import Comparator +from atat.pytorch.api_accuracy_checker.hook_module.wrap_tensor import TensorOPTemplate +from atat.pytorch.api_accuracy_checker.hook_module.wrap_functional import FunctionalOPTemplate +from atat.pytorch.api_accuracy_checker.hook_module.wrap_torch import TorchOPTemplate +from atat.pytorch.api_accuracy_checker.common.config import msCheckerConfig +from atat.pytorch.api_accuracy_checker.dump.api_info import APIInfo +from atat.pytorch.common.parse_json import parse_json_info_forward_backward +from atat.pytorch.common.file_check import check_path_before_create +from atat.pytorch.common.file_check import FileOpen, FileCheckConst, FileChecker, \ + change_mode, check_file_suffix, check_link + +current_time = time.strftime("%Y%m%d%H%M%S") 
def exec_api(api_type, api_name, args, kwargs):
    """Dispatch api_name to the matching OP template and run its forward pass.

    Args:
        api_type: one of "Functional", "Tensor", "Torch".
        api_name: name of the API inside that namespace.
        args/kwargs: positional and keyword arguments forwarded to the API.

    Returns:
        The forward output of the wrapped API.

    Raises:
        ValueError: for an unknown api_type. (The original fell through its
        if-chain and crashed with UnboundLocalError on 'out' instead.)
    """
    if api_type == "Functional":
        return FunctionalOPTemplate(api_name, str, False).forward(*args, **kwargs)
    if api_type == "Tensor":
        return TensorOPTemplate(api_name, str, False).forward(*args, **kwargs)
    if api_type == "Torch":
        return TorchOPTemplate(api_name, str, False).forward(*args, **kwargs)
    raise ValueError(f"Unsupported api_type: {api_type}")
arg_in) + elif isinstance(arg_in, torch.Tensor): + if need_backward and arg_in.requires_grad: + arg_in = deal_detach(arg_in.clone(), to_detach).to(current_device).requires_grad_() + temp_arg_in = arg_in * 1 + arg_in = temp_arg_in.type_as(arg_in) + arg_in.retain_grad() + return arg_in + else: + return deal_detach(arg_in.clone(), to_detach).to(current_device) + else: + return arg_in + + is_detach = api_name not in not_detach_set + device_args = recursive_arg_to_device(input_args, is_detach) + device_kwargs = \ + {key: recursive_arg_to_device(value, key != "out" and is_detach) for key, value in input_kwargs.items()} + return device_args, device_kwargs + + +def generate_cpu_params(input_args, input_kwargs, need_backward, api_name): + def recursive_arg_to_cpu(arg_in, to_detach, raise_dtype=None): + if isinstance(arg_in, (list, tuple)): + return type(arg_in)(recursive_arg_to_cpu(arg, to_detach, raise_dtype=raise_dtype) for arg in arg_in) + elif isinstance(arg_in, torch.Tensor): + if need_backward and arg_in.requires_grad: + arg_in = deal_detach(deal_dtype(arg_in.clone(), raise_dtype), to_detach).requires_grad_() + temp_arg_in = arg_in * 1 + arg_in = temp_arg_in.type_as(arg_in) + arg_in.retain_grad() + return arg_in + else: + return deal_detach(deal_dtype(arg_in.clone(), raise_dtype=raise_dtype), to_detach) + else: + return arg_in + + def is_tensor_with_raise_precision(arg_in, check_kwargs=False): + if arg_in.dtype in Const.RAISE_PRECISION: + return True + if check_kwargs and arg_in.dtype in [torch.half, torch.bfloat16]: + return True + return False + + def recursive_find_dtypes(arg_in, kwargs=None, check_kwargs=False): + if isinstance(arg_in, (list, tuple)): + return set().union(*tuple(recursive_find_dtypes(arg, kwargs, check_kwargs=check_kwargs) for arg in arg_in)) + elif isinstance(arg_in, torch.Tensor) and is_tensor_with_raise_precision(arg_in, check_kwargs): + return set([arg_in.dtype]) + elif isinstance(arg_in, dict) and check_kwargs: + return 
set().union(*tuple(recursive_find_dtypes(v, kwargs, check_kwargs=True) for v in arg_in.values())) + return set() + + raise_dtype = None + need_raise_dtypes = recursive_find_dtypes(input_args) + need_raise_dtypes.update(recursive_find_dtypes(input_kwargs, check_kwargs=True)) + if len(need_raise_dtypes) == 1: + raise_dtype = Const.RAISE_PRECISION.get(need_raise_dtypes.pop(), torch.float32) + elif len(need_raise_dtypes) >= 2: + raise_dtype = torch.float32 + + is_detach = api_name not in not_detach_set + cpu_args = recursive_arg_to_cpu(input_args, is_detach, raise_dtype=raise_dtype) + cpu_kwargs = {key: recursive_arg_to_cpu(value, key != "out" and is_detach, raise_dtype=raise_dtype) for key, value in input_kwargs.items()} + return cpu_args, cpu_kwargs + + +def run_ut(config): + print_info_log("start UT test") + print_info_log(f"UT task result will be saved in {config.result_csv_path}") + print_info_log(f"UT task details will be saved in {config.details_csv_path}") + if config.save_error_data: + error_data_path = os.path.abspath(os.path.join(msCheckerConfig.error_data_path, UT_ERROR_DATA_DIR)) + print_info_log(f"UT task error_datas will be saved in {error_data_path}") + compare = Comparator(config.result_csv_path, config.details_csv_path, config.is_continue_run_ut) + with FileOpen(config.result_csv_path, 'r') as file: + csv_reader = csv.reader(file) + next(csv_reader) + api_name_set = {row[0] for row in csv_reader} + for i, (api_full_name, api_info_dict) in enumerate(tqdm(config.forward_content.items(), **tqdm_params)): + if api_full_name in api_name_set: + continue + if is_npu_fusion_api(api_full_name): # TODO run_ut does not support to the npu fusion api + continue + try: + if msCheckerConfig.white_list: + [_, api_name, _] = api_full_name.split(Const.SEP) + if api_name not in set(msCheckerConfig.white_list): + continue + data_info = run_torch_api(api_full_name, config.real_data_path, config.backward_content, api_info_dict) + is_fwd_success, is_bwd_success = 
compare.compare_output(api_full_name, + data_info.bench_out, + data_info.device_out, + data_info.bench_grad_out, + data_info.device_grad_out) + if config.save_error_data: + do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success) + except Exception as err: + [_, api_name, _] = api_full_name.split(Const.SEP) + if "expected scalar type Long" in str(err): + print_warn_log(f"API {api_name} not support int32 tensor in CPU, please add {api_name} to CONVERT_API " + f"'int32_to_int64' list in accuracy_tools/api_accuracy_check/common/utils.py file.") + else: + print_error_log(f"Run {api_full_name} UT Error: %s" % str(err)) + compare.write_summary_csv((api_full_name, "SKIP", "SKIP", str(err))) + finally: + if is_gpu: + torch.cuda.empty_cache() + else: + torch.npu.empty_cache() + gc.collect() + change_mode(compare.save_path, FileCheckConst.DATA_FILE_AUTHORITY) + change_mode(compare.detail_save_path, FileCheckConst.DATA_FILE_AUTHORITY) + compare.print_pretest_result() + + +def is_npu_fusion_api(api_name): + return api_name.split(Const.SEP)[0] == Const.NPU + + +def do_save_error_data(api_full_name, data_info, is_fwd_success, is_bwd_success): + if not is_fwd_success or not is_bwd_success: + for element in data_info.in_fwd_data_list: + UtAPIInfo(api_full_name + '.forward.input', element) + UtAPIInfo(api_full_name + '.forward.output.bench', data_info.bench_out) + UtAPIInfo(api_full_name + '.forward.output.device', data_info.device_out) + UtAPIInfo(api_full_name + '.backward.input', data_info.grad_in) + UtAPIInfo(api_full_name + '.backward.output.bench', data_info.bench_grad_out) + UtAPIInfo(api_full_name + '.backward.output.device', data_info.device_grad_out) + + +def run_torch_api(api_full_name, real_data_path, backward_content, api_info_dict): + in_fwd_data_list = [] + [api_type, api_name, _] = api_full_name.split(Const.SEP) + args, kwargs, need_grad = get_api_info(api_info_dict, api_name, real_data_path) + in_fwd_data_list.append(args) + 
in_fwd_data_list.append(kwargs) + need_backward = api_full_name in backward_content + if not need_grad: + print_warn_log("%s function with out=... arguments don't support automatic differentiation, skip backward." + % api_full_name) + if api_name in not_backward_list: + need_grad = False + print_warn_log( + "%s function backward result is None, skip backward." % api_full_name) + need_backward = need_backward and need_grad + if kwargs.get("device"): + del kwargs["device"] + cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, need_backward, api_name) + device_args, device_kwargs = generate_device_params(args, kwargs, need_backward, api_name) + bench_grad_out, device_grad_out = None, None + out = exec_api(api_type, api_name, cpu_args, cpu_kwargs) + device_out = exec_api(api_type, api_name, device_args, device_kwargs) + current_path = os.path.dirname(os.path.realpath(__file__)) + ut_setting_path = os.path.join(current_path, "torch_ut_setting.json") + api_setting_dict = get_json_contents(ut_setting_path) + grad_input_index = api_setting_dict.get(api_name) + grad_index = None + grad, bench_grad = None, None + if grad_input_index is not None: + grad_index = grad_input_index.get('grad_index') + + if need_backward: + backward_args = backward_content[api_full_name].get("grad_output") + grad = gen_args(backward_args, real_data_path=real_data_path)[0] + bench_grad, _ = generate_cpu_params(grad, {}, False, api_name) + bench_grad_out = run_backward(cpu_args, bench_grad, grad_index, out) + device_grad = grad.clone().detach().to(current_device) + device_grad_out = run_backward(device_args, device_grad, grad_index, device_out) + + if grad_index is not None: + return UtDataInfo(bench_grad_out, device_grad_out, device_out[grad_index], out[grad_index], bench_grad, + in_fwd_data_list) + return UtDataInfo(bench_grad_out, device_grad_out, device_out, out, bench_grad, in_fwd_data_list) + + +def get_api_info(api_info_dict, api_name, real_data_path): + convert_type, api_info_dict = 
def run_backward(args, grad, grad_index, out):
    """Backpropagate `grad` through `out` and collect the input gradients.

    Args:
        args: the forward positional arguments; gradients are gathered for
              every torch.Tensor among them.
        grad: the upstream gradient tensor passed to backward().
        grad_index: for multi-output APIs, the index of the output to
                    backprop through; None for a single tensor output.
        out: the forward output (tensor, or indexable when grad_index is set).

    Returns:
        List of .grad values for the tensor arguments, in argument order.

    Raises:
        NotImplementedError: when out is a list/tuple and no grad_index is given.
    """
    if grad_index is not None:
        out[grad_index].backward(grad)
    elif isinstance(out, (list, tuple)):
        raise NotImplementedError("Multiple backward is not supported.")
    else:
        out.backward(grad)
    # Comprehension replaces the original append loop and its redundant
    # `grad_out = args_grad` alias.
    return [arg.grad for arg in args if isinstance(arg, torch.Tensor)]
details_csv_path = os.path.join(os.path.dirname(validated_result_csv_path), details_csv_name) + details_csv_path_checker = FileChecker(details_csv_path, FileCheckConst.FILE, + ability=FileCheckConst.READ_WRITE_ABLE, file_type=FileCheckConst.CSV_SUFFIX) + validated_details_csv_path = details_csv_path_checker.common_check() + return validated_details_csv_path + + +def _run_ut_parser(parser): + parser.add_argument("-api_info", "--api_info_file", dest="api_info_file", default="", type=str, + help=" The api param tool result file: generate from api param tool, " + "a json file.", + required=True) + parser.add_argument("-o", "--out_path", dest="out_path", default="", type=str, + help=" The ut task result out path.", + required=False) + parser.add_argument('-save_error_data', dest="save_error_data", action="store_true", + help=" Save compare failed api output.", required=False) + parser.add_argument("-j", "--jit_compile", dest="jit_compile", action="store_true", + help=" whether to turn on jit compile", required=False) + + class UniqueDeviceAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + unique_values = set(values) + if len(values) != len(unique_values): + parser.error("device id must be unique") + for device_id in values: + if not 0 <= device_id: + parser.error("device id must be greater than or equal to 0") + setattr(namespace, self.dest, values) + + parser.add_argument("-d", "--device", dest="device_id", nargs='+', type=int, + help=" set device id to run ut, must be unique and in range 0-7", + default=[0], required=False, action=UniqueDeviceAction) + parser.add_argument("-csv_path", "--result_csv_path", dest="result_csv_path", default="", type=str, + help=" The path of accuracy_checking_result_{timestamp}.csv, " + "when run ut is interrupted, enter the file path to continue run ut.", + required=False) + parser.add_argument("-real_data_path", dest="real_data_path", nargs="?", const="", default="", type=str, + help=" In 
def preprocess_forward_content(forward_content):
    """De-duplicate forward API entries that share a base key.

    Two entries with the same base key (the full key minus its trailing call
    index) are considered duplicates when their kwargs match and their dict
    args match after dropping the 'Max'/'Min' statistics fields.

    Args:
        forward_content: mapping of api key -> {'args': [...], 'kwargs': {...}}.

    Returns:
        A new dict containing only the first occurrence of each distinct call.
    """
    processed_content = {}
    base_keys_variants = {}
    for key, value in forward_content.items():
        base_key = key.rsplit(Const.SEP, 1)[0]
        new_args = value['args']
        new_kwargs = value['kwargs']
        filtered_new_args = [
            {k: v for k, v in arg.items() if k not in ('Max', 'Min')}
            for arg in new_args if isinstance(arg, dict)
        ]
        if base_key not in base_keys_variants:
            processed_content[key] = value
            base_keys_variants[base_key] = [key]
            continue
        is_duplicate = False
        for variant in base_keys_variants[base_key]:
            try:
                existing_args = processed_content[variant].get('args', [])
                existing_kwargs = processed_content[variant].get('kwargs', {})
            except KeyError as e:
                print_error_log(f"KeyError: {e} when processing {key}")
                # Bug fix: the original fell through after logging and compared
                # an unbound (first variant) or stale filtered_existing_args,
                # raising NameError or mis-detecting duplicates. Skip instead.
                continue
            filtered_existing_args = [
                {k: v for k, v in arg.items() if k not in ('Max', 'Min')}
                for arg in existing_args if isinstance(arg, dict)
            ]
            if filtered_existing_args == filtered_new_args and existing_kwargs == new_kwargs:
                is_duplicate = True
                break
        if not is_duplicate:
            processed_content[key] = value
            base_keys_variants[base_key].append(key)
    return processed_content
class UtDataInfo:
    """Value object bundling one UT run's forward/backward results.

    Holds the bench (CPU) and device (NPU/GPU) outputs of the forward pass,
    the corresponding backward gradients, the upstream gradient used, and
    the original forward inputs.
    """

    def __init__(self, bench_grad_out, device_grad_out, device_out, bench_out, grad_in, in_fwd_data_list):
        # Backward results on each side.
        self.bench_grad_out = bench_grad_out
        self.device_grad_out = device_grad_out
        # Forward results on each side.
        self.device_out = device_out
        self.bench_out = bench_out
        # Upstream gradient and the forward inputs that produced the outputs.
        self.grad_in = grad_in
        self.in_fwd_data_list = in_fwd_data_list
self.analyze_element(element) + + +if __name__ == '__main__': + _run_ut() + print_info_log("UT task completed.") diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json new file mode 100644 index 0000000000000000000000000000000000000000..d8df6098b1bab44d197b8e1a2b3e652456224e3f --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/run_ut/torch_ut_setting.json @@ -0,0 +1,5 @@ +{ + "topk": { + "grad_index": 0 + } +} \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/resources/forward.json b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/resources/forward.json new file mode 100644 index 0000000000000000000000000000000000000000..f938f352460a87222bdb5346873904cb420996cc --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/resources/forward.json @@ -0,0 +1,3 @@ +{ + "Functional*silu*0": {"args": [{"type": "torch.Tensor", "dtype": "torch.float32", "shape": [2, 2560, 24, 24], "Max": 5.7421875, "Max_origin": 5.7421875, "Min": -5.125, "Min_origin": -5.125, "requires_grad": true}], "kwargs" :{"inplace": {"type": "bool", "value": false}}} +} \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/run_test.sh b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/run_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..fdd00c6021c9827a68e005616b1b4d916e63e995 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/run_test.sh @@ -0,0 +1,31 @@ +#!/bin/bash +CUR_DIR=$(dirname 
def run_ut():
    """Run the pytest suite under ./ut with coverage, streaming its output.

    Recreates the report directory, launches pytest as a subprocess, echoes
    its (merged stdout/stderr) output line by line, and returns True when
    pytest exits with status 0.
    """
    cur_dir = os.path.realpath(os.path.dirname(__file__))
    top_dir = os.path.realpath(os.path.dirname(cur_dir))
    ut_path = os.path.join(cur_dir, "ut/")
    src_dir = top_dir
    report_dir = os.path.join(cur_dir, "report")

    # Start every run from a clean report directory.
    if os.path.exists(report_dir):
        shutil.rmtree(report_dir)
    os.makedirs(report_dir)

    cmd = ["python3", "-m", "pytest", ut_path, "--junitxml=" + report_dir + "/final.xml",
           "--cov=" + src_dir, "--cov-branch", "--cov-report=xml:" + report_dir + "/coverage.xml"]

    # text=True decodes the child's output (the original printed raw bytes as
    # b'...'), and iterating stdout until EOF — instead of the poll()+readline
    # loop — cannot drop lines still buffered when the process exits.
    result_ut = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT, text=True)
    for raw_line in result_ut.stdout:
        line = raw_line.strip()
        if line:
            print(line)
    result_ut.wait()

    if result_ut.returncode == 0:
        print("run ut successfully.")
        return True
    print("run ut failed.")
    return False
import unittest
import os
import numpy as np
import torch
from api_accuracy_checker.common.utils import *


class TestUtils(unittest.TestCase):
    """Exercise the misc helpers exported by api_accuracy_checker.common.utils."""

    def test_read_json(self):
        payload = {"key": "value"}
        with open('test.json', 'w') as fp:
            json.dump(payload, fp)
        self.assertEqual(read_json('test.json'), payload)
        os.remove('test.json')

    def test_write_csv(self):
        rows = [["name", "age"], ["Alice", "20"], ["Bob", "30"]]
        write_csv(rows, 'test.csv')
        # Read back with utf-8-sig so a BOM written by the helper is stripped.
        with open('test.csv', 'r', encoding='utf-8-sig') as fp:
            for idx, row in enumerate(csv.reader(fp)):
                self.assertEqual(row, rows[idx])
        os.remove('test.csv')

    def test_print_info_log(self):
        # The logger must never raise for a plain message.
        try:
            print_info_log("Test message")
        except Exception as e:
            self.fail(f"print_info_log raised exception {e}")

    def test_check_mode_valid(self):
        try:
            check_mode_valid(Const.ALL)
        except Exception as e:
            self.fail(f"check_mode_valid raised exception {e}")

    def test_check_object_type(self):
        try:
            check_object_type(123, int)
        except Exception as e:
            self.fail(f"check_object_type raised exception {e}")

    def test_check_file_or_directory_path(self):
        # This test file itself is a valid, existing path.
        try:
            check_file_or_directory_path(__file__)
        except Exception as e:
            self.fail(f"check_file_or_directory_path raised exception {e}")

    def test_get_dump_data_path(self):
        _, exist = get_dump_data_path(os.path.dirname(__file__))
        self.assertTrue(exist)

    def test_create_directory(self):
        create_directory('test_dir')
        self.assertTrue(os.path.exists('test_dir'))
        os.rmdir('test_dir')

    def test_execute_command(self):
        execute_command(['echo', 'Hello, World!'])

    def test_parse_arg_value(self):
        # Semicolons split groups, commas split values inside a group.
        self.assertEqual(parse_arg_value("1,2,3;4,5,6"), [[1, 2, 3], [4, 5, 6]])

    def test_parse_value_by_comma(self):
        self.assertEqual(parse_value_by_comma("1,2,3"), [1, 2, 3])

    def test_get_data_len_by_shape(self):
        # Length is the product of the dimensions: 2 * 3 * 4.
        self.assertEqual(get_data_len_by_shape([2, 3, 4]), 24)

    def test_add_time_as_suffix(self):
        suffixed = add_time_as_suffix("test")
        self.assertTrue(suffixed.startswith("test"))

    def test_get_time(self):
        self.assertTrue(isinstance(get_time(), str))

    def test_format_value(self):
        self.assertEqual(format_value(123.456789), '123.456789')

    def test_seed_all(self):
        # Only checks that seeding does not raise.
        seed_all(1234)

    def test_get_process_rank(self):
        rank, _ = get_process_rank(torch.nn.Linear(10, 10))
        self.assertEqual(rank, 0)

    def test_get_json_contents(self):
        payload = {"key": "value"}
        with open('test.json', 'w') as fp:
            json.dump(payload, fp)
        self.assertEqual(get_json_contents('test.json'), payload)
        os.remove('test.json')

    def test_get_file_content_bytes(self):
        with open('test.txt', 'w') as fp:
            fp.write("Hello, World!")
        self.assertEqual(get_file_content_bytes('test.txt'), b"Hello, World!")
        os.remove('test.txt')

    def test_islink(self):
        self.assertFalse(islink(__file__))

    def test_check_path_length_valid(self):
        self.assertTrue(check_path_length_valid(__file__))

    def test_check_path_pattern_valid(self):
        self.assertIsNone(check_path_pattern_valid(__file__))

    def test_check_input_file_valid(self):
        self.assertIsNone(check_input_file_valid(__file__))

    def test_check_need_convert(self):
        self.assertIsNone(check_need_convert("unknown_api"))
import unittest
import numpy as np
import torch
from api_accuracy_checker.compare import compare as cmp
from api_accuracy_checker.compare import algorithm as alg


class TestAlgorithmMethods(unittest.TestCase):
    """Checks for the numeric comparison primitives in compare.algorithm."""

    @staticmethod
    def _zero_err_inputs():
        # Identical benchmark / npu vectors give zero absolute error.
        bench = np.array([1.0, 2.0, 3.0])
        npu = np.array([1.0, 2.0, 3.0])
        return bench, np.abs(bench - npu)

    def test_get_max_abs_err(self):
        _, abs_err = self._zero_err_inputs()
        self.assertEqual(alg.get_max_abs_err(abs_err), (0.0, True))

    def test_get_rel_err_ratio_thousandth(self):
        bench, abs_err = self._zero_err_inputs()
        rel_err = alg.get_rel_err_origin(abs_err, bench)
        # With zero error, the fraction of elements within 1e-3 is 1.0.
        self.assertEqual(alg.get_rel_err_ratio(rel_err, 0.001), (1.0, True))

    def test_get_rel_err_ratio_ten_thousandth(self):
        bench, abs_err = self._zero_err_inputs()
        rel_err = alg.get_rel_err_origin(abs_err, bench)
        # Same vectors also satisfy the stricter 1e-4 threshold.
        self.assertEqual(alg.get_rel_err_ratio(rel_err, 0.0001), (1.0, True))

    def test_cosine_sim(self):
        vec = np.array([1.0, 2.0, 3.0])
        # A vector compared against itself has cosine similarity 1.0.
        self.assertEqual(alg.cosine_sim(vec, vec), (1.0, True, ''))
import csv
import os
import shutil
import time
import unittest

import numpy as np
import torch.nn.functional

from api_accuracy_checker.compare.compare import Comparator
from api_accuracy_checker.compare.compare_column import CompareColumn

# Result/detail CSV names are timestamped so concurrent runs do not collide.
current_time = time.strftime("%Y%m%d%H%M%S")
RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + ".csv"
DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + '.csv'
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


class TestCompare(unittest.TestCase):
    """Tests for Comparator's tensor/bool/builtin comparison entry points."""

    def setUp(self):
        # Fresh output directory per test; tearDown removes it again.
        self.output_path = os.path.join(base_dir, "../compare_result")
        os.mkdir(self.output_path, mode=0o750)
        self.result_csv_path = os.path.join(self.output_path, RESULT_FILE_NAME)
        self.details_csv_path = os.path.join(self.output_path, DETAILS_FILE_NAME)
        self.is_continue_run_ut = False
        self.compare = Comparator(self.result_csv_path, self.details_csv_path, self.is_continue_run_ut)

    def tearDown(self) -> None:
        if os.path.exists(self.output_path):
            shutil.rmtree(self.output_path)

    def test_compare_dropout(self):
        # Dropout outputs differ elementwise; _compare_dropout presumably
        # checks statistics rather than exact values — TODO confirm.
        dummmy_input = torch.randn(100, 100)
        bench_out = torch.nn.functional.dropout2d(dummmy_input, 0.3)
        npu_out = torch.nn.functional.dropout2d(dummmy_input, 0.3)
        self.assertTrue(self.compare._compare_dropout("api", bench_out, npu_out))

    def test_compare_core_wrapper(self):
        # Comparing a tensor against itself must pass every metric.
        dummy_input = torch.randn(100, 100)
        bench_out, npu_out = dummy_input, dummy_input
        test_final_success, detailed_result_total = self.compare._compare_core_wrapper("api", bench_out, npu_out)
        actual_cosine_similarity = detailed_result_total[0][3]
        # Use a small tolerance for the float comparison.
        tolerance = 1e-4
        # Check the actual cosine similarity is within tolerance of the expected value.
        self.assertTrue(np.isclose(actual_cosine_similarity, 1.0, atol=tolerance))
        # Compare the remaining fields to make sure they match expectations.
        detailed_result_total[0][3] = 1.0
        self.assertEqual(detailed_result_total, [['torch.float32', 'torch.float32', (100, 100), 1.0, 0.0, ' ', ' ', ' ',
                                                  ' ', 0.0, 0.0, 0, 0.0, 0.0, ' ', ' ', ' ', 'pass',
                                                  '\nMax abs error is less than 0.001, consider as pass, skip other check and set to SPACE.\n']])
        self.assertTrue(test_final_success)

        # List outputs are compared element by element: two detail rows.
        bench_out, npu_out = [dummy_input, dummy_input], [dummy_input, dummy_input]
        test_final_success, detailed_result_total = self.compare._compare_core_wrapper("api", bench_out, npu_out)
        actual_cosine_similarity = detailed_result_total[0][3]
        self.assertTrue(np.isclose(actual_cosine_similarity, 1.0, atol=tolerance))
        actual_cosine_similarity = detailed_result_total[1][3]
        self.assertTrue(np.isclose(actual_cosine_similarity, 1.0, atol=tolerance))
        detailed_result_total[0][3] = 1.0
        detailed_result_total[1][3] = 1.0
        self.assertTrue(test_final_success)
        self.assertEqual(detailed_result_total, [['torch.float32', 'torch.float32', (100, 100), 1.0, 0.0, ' ', ' ', ' ',
                                                  ' ', 0.0, 0.0, 0, 0.0, 0.0, ' ', ' ', ' ', 'pass',
                                                  '\nMax abs error is less than 0.001, consider as pass, skip other check and set to SPACE.\n'],
                                                 ['torch.float32', 'torch.float32', (100, 100), 1.0, 0.0, ' ', ' ', ' ', ' ', 0.0, 0.0, 0, 0.0, 0.0, ' ', ' ',
                                                  ' ', 'pass', '\nMax abs error is less than 0.001, consider as pass, skip other check and set to SPACE.\n']])

    def test_compare_output(self):
        # Two independent random tensors must fail both forward and backward.
        bench_out, npu_out = torch.randn(100, 100), torch.randn(100, 100)
        bench_grad, npu_grad = [torch.randn(100, 100)], [torch.randn(100, 100)]
        api_name = 'Functional*conv2d*0'
        is_fwd_success, is_bwd_success = self.compare.compare_output(api_name, bench_out, npu_out, bench_grad, npu_grad)
        self.assertFalse(is_fwd_success)
        self.assertFalse(is_bwd_success)

        # Identical outputs (and no grads supplied) must pass.
        dummy_input = torch.randn(100, 100)
        bench_out, npu_out = dummy_input, dummy_input
        is_fwd_success, is_bwd_success = self.compare.compare_output(api_name, bench_out, npu_out)
        self.assertTrue(is_fwd_success)
        self.assertTrue(is_bwd_success)

    def test_record_results(self):
        # A recorded failure must land in the details CSV with the
        # "<api>.forward.output.<i>" row-name convention.
        args = ('Functional*conv2d*0', False, 'N/A', [['torch.float64', 'torch.float32', (32, 64, 112, 112), 1.0,
                                                       0.012798667686, 'N/A', 0.81631212311, 0.159979121213, 'N/A',
                                                       'error', '\n']], None)
        self.compare.record_results(*args)
        with open(self.details_csv_path, 'r') as file:
            csv_reader = csv.reader(file)
            next(csv_reader)
            api_name_list = [row[0] for row in csv_reader]
            self.assertEqual(api_name_list[0], 'Functional*conv2d*0.forward.output.0')

    def test_compare_torch_tensor(self):
        cpu_output = torch.Tensor([1.0, 2.0, 3.0])
        npu_output = torch.Tensor([1.0, 2.0, 3.0])
        compare_column = CompareColumn()
        status, compare_column, message = self.compare._compare_torch_tensor("api", cpu_output, npu_output, compare_column)
        self.assertEqual(status, "pass")

    def test_compare_bool_tensor(self):
        cpu_output = np.array([True, False, True])
        npu_output = np.array([True, False, True])
        self.assertEqual(self.compare._compare_bool_tensor(cpu_output, npu_output), (0.0, 'pass', ''))

    def test_compare_builtin_type(self):
        compare_column = CompareColumn()
        bench_out = 1
        npu_out = 1
        status, compare_result, message = self.compare._compare_builtin_type(bench_out, npu_out, compare_column)
        self.assertEqual((status, compare_result.error_rate, message), ('pass', 0, ''))
5.0, 6.0], dtype=np.float32) + self.assertTrue(check_dtype_comparable(x, y)) + + x = np.array([True, False, True], dtype=np.bool_) + y = np.array([False, True, False], dtype=np.bool_) + self.assertTrue(check_dtype_comparable(x, y)) + + x = np.array([1, 2, 3], dtype=np.int32) + y = np.array([4.0, 5.0, 6.0], dtype=np.float32) + self.assertFalse(check_dtype_comparable(x, y)) + + x = np.array([1, 2, 3], dtype=np.int32) + y = np.array([True, False, True], dtype=np.bool_) + self.assertFalse(check_dtype_comparable(x, y)) diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/__init__.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_api_info.py b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_api_info.py new file mode 100644 index 0000000000000000000000000000000000000000..2c03d56e722decc424052367dfe9700ba3df94ce --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/test/ut/dump/test_api_info.py @@ -0,0 +1,131 @@ +import os +import shutil +import unittest +import torch +import numpy as np +from api_accuracy_checker.dump.api_info import APIInfo, ForwardAPIInfo, BackwardAPIInfo, transfer_types, \ + get_tensor_extremum, get_type_name, is_builtin_class, analyze_device_in_kwargs, analyze_dtype_in_kwargs +from api_accuracy_checker.common.config import msCheckerConfig + + +class TestAPIInfo(unittest.TestCase): + def setUp(self): + if os.path.exists('./step-1'): + shutil.rmtree('./step-1') + self.api = APIInfo("test_api", APIInfo.get_full_save_path("./", "forward_real_data", True), True) + + def test_analyze_element(self): + element = [1, 2, 3] + result = self.api.analyze_element(element) + self.assertEqual(result, + [{'type': 'int', 'value': 1}, {'type': 'int', 'value': 2}, 
{'type': 'int', 'value': 3}]) + + def test_analyze_tensor(self): + tensor = torch.tensor([1, 2, 3], dtype=torch.float32, requires_grad=True) + result = self.api._analyze_tensor(tensor) + self.assertEqual(result.get('type'), 'torch.Tensor') + self.assertTrue(result.get('requires_grad')) + datapath = result.get('datapath') + self.assertTrue(datapath.startswith('forward_real_data') or datapath.startswith('backward_real_data')) + + def test_analyze_builtin(self): + arg = slice(1, 10, 2) + result = self.api._analyze_builtin(arg) + self.assertEqual(result, {'type': 'slice', 'value': [1, 10, 2]}) + + def test_transfer_types(self): + data = 10 + dtype = 'int' + result = transfer_types(data, dtype) + self.assertEqual(result, 10) + + def test_is_builtin_class(self): + element = 10 + result = is_builtin_class(element) + self.assertTrue(result) + + def test_analyze_device_in_kwargs(self): + element = torch.device('cuda:0') + result = analyze_device_in_kwargs(element) + self.assertEqual(result, {'type': 'torch.device', 'value': 'cuda:0'}) + + def test_analyze_dtype_in_kwargs(self): + element = torch.float32 + result = analyze_dtype_in_kwargs(element) + self.assertEqual(result, {'type': 'torch.dtype', 'value': 'torch.float32'}) + + def test_get_tensor_extremum(self): + data = torch.tensor([1, 2, 3]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertEqual(result_max_origin, 3) + self.assertEqual(result_min_origin, 1) + + data = torch.tensor([1, float("inf"), 2, 3]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertEqual(result_max_origin, float("inf")) + self.assertEqual(result_min_origin, 1) + + data = torch.tensor([1, float("-inf"), 2, 3]) + result_max, 
result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertEqual(result_max_origin, 3) + self.assertEqual(result_min_origin, float("-inf")) + + data = torch.tensor([1, float("inf"), float("nan"), 3]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, 3) + self.assertEqual(result_min, 1) + self.assertTrue(np.isnan(result_max_origin)) + self.assertTrue(np.isnan(result_min_origin)) + + data = torch.tensor([float("inf"), float("nan")]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertEqual(result_max, float("inf")) + self.assertEqual(result_min, float("inf")) + self.assertTrue(np.isnan(result_max_origin)) + self.assertTrue(np.isnan(result_min_origin)) + + data = torch.tensor([float("nan"), float("nan")]) + result_max, result_max_origin = get_tensor_extremum(data, 'max') + result_min, result_min_origin = get_tensor_extremum(data, 'min') + self.assertTrue(np.isnan(result_max)) + self.assertTrue(np.isnan(result_min)) + self.assertTrue(np.isnan(result_max_origin)) + self.assertTrue(np.isnan(result_min_origin)) + + def test_get_type_name(self): + name = "" + result = get_type_name(name) + self.assertEqual(result, 'int') + + def test_ForwardAPIInfo(self): + forward_api_info = ForwardAPIInfo("test_forward_api", [1, 2, 3], {"a": 1, "b": 2}) + self.assertEqual(forward_api_info.api_name, "test_forward_api") + self.assertEqual(forward_api_info.save_path, + APIInfo.get_full_save_path(msCheckerConfig.dump_path, 'forward_real_data', True)) + self.assertEqual(forward_api_info.api_info_struct, {"test_forward_api": { + "args": [{'type': 'int', 'value': 1}, {'type': 'int', 'value': 2}, {'type': 'int', 'value': 3}, ], + "kwargs": {'a': 
import unittest
from api_accuracy_checker.dump.dump import *

class TestDumpUtil(unittest.TestCase):
    """Tests for DumpUtil's dump switch and iteration-count handling.

    These tests mutate module-level singletons (DumpUtil, msCheckerConfig);
    setUp snapshots that state and tearDown restores it, so configuration no
    longer leaks into other test modules (the original left target_iter,
    enable_dataloader and dump_switch permanently modified).
    """

    def setUp(self):
        # Snapshot global state touched by these tests. getattr with a
        # default keeps this robust if an attribute is not yet set.
        self._saved_state = (
            getattr(msCheckerConfig, 'target_iter', None),
            getattr(msCheckerConfig, 'enable_dataloader', None),
            getattr(DumpUtil, 'dump_switch', None),
            getattr(DumpUtil, 'call_num', None),
        )

    def tearDown(self):
        (msCheckerConfig.target_iter,
         msCheckerConfig.enable_dataloader,
         DumpUtil.dump_switch,
         DumpUtil.call_num) = self._saved_state

    def test_set_dump_switch(self):
        set_dump_switch("ON")
        self.assertEqual(DumpUtil.dump_switch, "ON")
        set_dump_switch("OFF")
        self.assertEqual(DumpUtil.dump_switch, "OFF")

    def test_get_dump_switch(self):
        DumpUtil.dump_switch = "ON"
        self.assertTrue(DumpUtil.get_dump_switch())
        DumpUtil.dump_switch = "OFF"
        self.assertFalse(DumpUtil.get_dump_switch())

    def test_incr_iter_num_maybe_exit(self):
        msCheckerConfig.target_iter = [5]
        msCheckerConfig.enable_dataloader = True

        # Past the target iteration with the dataloader hook enabled: raises.
        DumpUtil.call_num = 6
        with self.assertRaises(Exception):
            DumpUtil.incr_iter_num_maybe_exit()

        # Before the target iteration the switch must stay off.
        DumpUtil.call_num = 4
        DumpUtil.incr_iter_num_maybe_exit()
        self.assertEqual(DumpUtil.dump_switch, "OFF")

        # At the target iteration without the dataloader hook: dump turns on.
        msCheckerConfig.enable_dataloader = False
        DumpUtil.call_num = 5
        DumpUtil.incr_iter_num_maybe_exit()
        self.assertEqual(DumpUtil.dump_switch, "ON")
import unittest
from api_accuracy_checker.dump.dump_scope import iter_tracer
from api_accuracy_checker.dump.dump import DumpUtil


class TestDumpScope(unittest.TestCase):
    """iter_tracer must forward return values and leave the switch OFF."""

    def test_iter_tracer(self):
        DumpUtil.call_num = 0

        def text_producer():
            return "Hello, World!"

        def int_producer():
            return 123

        # The decorator must be transparent for any return type and must
        # leave the dump switch turned off after each traced call.
        for producer, expected in ((text_producer, "Hello, World!"),
                                   (int_producer, 123)):
            traced = iter_tracer(producer)
            outcome = traced()
            self.assertEqual(DumpUtil.dump_switch, "OFF")
            self.assertEqual(outcome, expected)
# coding=utf-8
import unittest
import torch
from api_accuracy_checker.hook_module import wrap_functional as wf


class TestWrapFunctional(unittest.TestCase):
    """Smoke tests for the torch.nn.functional wrapping helpers."""

    def test_get_functional_ops(self):
        discovered = wf.get_functional_ops()
        # A few well-known functional ops must always be discovered.
        for required in ('relu', 'sigmoid', 'softmax'):
            self.assertIn(required, discovered)

    def test_wrap_functional_ops_and_bind(self):
        wf.wrap_functional_ops_and_bind(None)
        # Binding must attach wrapped attributes onto the hook namespace.
        self.assertTrue(hasattr(wf.HOOKFunctionalOP, 'wrap_relu'))
# coding=utf-8
import unittest
import torch
import yaml
from api_accuracy_checker.hook_module.wrap_tensor import get_tensor_ops, HOOKTensor, TensorOPTemplate, wrap_tensor_op, wrap_tensor_ops_and_bind

class TestWrapTensor(unittest.TestCase):
    """Smoke tests for the torch.Tensor op wrapping machinery."""

    def hook(self, a, b):
        # Minimal no-op hook accepted by the wrapping helpers.
        return

    def test_get_tensor_ops(self):
        self.assertIsInstance(get_tensor_ops(), set)

    def test_HOOKTensor(self):
        self.assertIsInstance(HOOKTensor(), HOOKTensor)

    def test_TensorOPTemplate(self):
        template = TensorOPTemplate('add', self.hook)
        self.assertEqual(template.op_name_, 'add')

    def test_wrap_tensor_op(self):
        self.assertTrue(callable(wrap_tensor_op('add', self.hook)))

    def test_wrap_tensor_ops_and_bind(self):
        wrap_tensor_ops_and_bind(self.hook)
        # Binding must expose a wrapped attribute per tensor op.
        self.assertTrue(hasattr(HOOKTensor, 'wrap_add'))
# coding=utf-8
import unittest
import numpy as np
import os
import copy
from api_accuracy_checker.run_ut.data_generate import *
from api_accuracy_checker.common.utils import get_json_contents

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
forward_file = os.path.join(base_dir, "../resources/forward.json")
forward_content = get_json_contents(forward_file)
# The fixture holds exactly one API entry ("Functional*silu*0"); bind its
# name and info directly. The original iterated the dict with no-op
# assignments and relied on the loop variables leaking the last entry.
api_full_name, api_info_dict = next(iter(forward_content.items()))

# Extremes recorded in resources/forward.json for the silu input tensor.
max_value = 5.7421875
min_value = -5.125

class TestDataGenerateMethods(unittest.TestCase):
    """Tests for the tensor/arg synthesis helpers in run_ut.data_generate."""

    def test_gen_api_params(self):
        api_info = copy.deepcopy(api_info_dict)
        args_params, kwargs_params = gen_api_params(api_info, True, None, None)
        max_diff = abs(args_params[0].max() - max_value)
        min_diff = abs(args_params[0].min() - min_value)
        self.assertEqual(len(args_params), 1)
        self.assertEqual(args_params[0].dtype, torch.float32)
        # Generated data must hit the recorded extremes within tolerance.
        self.assertLessEqual(max_diff, 0.001)
        self.assertLessEqual(min_diff, 0.001)
        self.assertEqual(args_params[0].shape, torch.Size([2, 2560, 24, 24]))
        self.assertEqual(kwargs_params, {'inplace': False})

    def test_gen_args(self):
        args_result = gen_args(api_info_dict.get('args'), real_data_path=None)
        max_diff = abs(args_result[0].max() - max_value)
        min_diff = abs(args_result[0].min() - min_value)
        self.assertEqual(len(args_result), 1)
        self.assertEqual(args_result[0].dtype, torch.float32)
        self.assertLessEqual(max_diff, 0.001)
        self.assertLessEqual(min_diff, 0.001)
        self.assertEqual(args_result[0].shape, torch.Size([2, 2560, 24, 24]))

    def test_gen_data(self):
        # Second argument requests requires_grad on the generated tensor.
        data = gen_data(api_info_dict.get('args')[0], True, None, None)
        max_diff = abs(data.max() - max_value)
        min_diff = abs(data.min() - min_value)
        self.assertEqual(data.dtype, torch.float32)
        self.assertEqual(data.requires_grad, True)
        self.assertLessEqual(max_diff, 0.001)
        self.assertLessEqual(min_diff, 0.001)
        self.assertEqual(data.shape, torch.Size([2, 2560, 24, 24]))

    def test_gen_kwargs(self):
        api_info = copy.deepcopy(api_info_dict)
        kwargs_params = gen_kwargs(api_info, None)
        self.assertEqual(kwargs_params, {'inplace': False})

    def test_gen_kwargs_2(self):
        k_dict = {"inplace": {"type": "bool", "value": "False"}}
        # Iterate over a snapshot: gen_torch_kwargs mutates k_dict in place,
        # and the original iterated the live dict while it was being changed.
        for key, value in list(k_dict.items()):
            gen_torch_kwargs(k_dict, key, value)
        self.assertEqual(k_dict, {'inplace': False})

    def test_gen_random_tensor(self):
        data = gen_random_tensor(api_info_dict.get('args')[0], None)
        max_diff = abs(data.max() - max_value)
        min_diff = abs(data.min() - min_value)
        self.assertEqual(data.dtype, torch.float32)
        # Without the flag, generated tensors must not require grad.
        self.assertEqual(data.requires_grad, False)
        self.assertLessEqual(max_diff, 0.001)
        self.assertLessEqual(min_diff, 0.001)
        self.assertEqual(data.shape, torch.Size([2, 2560, 24, 24]))

    def test_gen_common_tensor(self):
        info = api_info_dict.get('args')[0]
        low, high = info.get('Min'), info.get('Max')
        low_origin, high_origin = info.get('Min_origin'), info.get('Max_origin')
        low_info = [low, low_origin]
        high_info = [high, high_origin]
        data_dtype = info.get('dtype')
        shape = tuple(info.get('shape'))
        data = gen_common_tensor(low_info, high_info, shape, data_dtype, None)
        max_diff = abs(data.max() - max_value)
        min_diff = abs(data.min() - min_value)
        self.assertEqual(data.dtype, torch.float32)
        self.assertEqual(data.requires_grad, False)
        self.assertLessEqual(max_diff, 0.001)
        self.assertLessEqual(min_diff, 0.001)
        self.assertEqual(data.shape, torch.Size([2, 2560, 24, 24]))

    def test_gen_bool_tensor(self):
        info = {"type": "torch.Tensor", "dtype": "torch.bool", "shape": [1, 1, 160, 256], \
                "Max": 1, "Min": 0, "requires_grad": False}
        low, high = info.get("Min"), info.get("Max")
        shape = tuple(info.get("shape"))
        data = gen_bool_tensor(low, high, shape)
        self.assertEqual(data.shape, torch.Size([1, 1, 160, 256]))
        self.assertEqual(data.dtype, torch.bool)
class TestMultiRunUT(unittest.TestCase):
    """Unit tests for multi_run_ut: JSON splitting, SIGINT handling, parallel UT
    execution and CLI config preparation, with all file I/O and subprocesses mocked."""

    def setUp(self):
        # Fake forward-dump content shared by the cases below.
        self.test_json_file = 'test_file.json'
        self.test_data = {'key1': 'TRUE', 'key2': 'TRUE', 'key3': 'TRUE'}
        self.test_json_content = json.dumps(self.test_data)
        self.forward_split_files_content = [
            {'key1': 'TRUE', 'key2': 'TRUE'},
            {'key3': 'TRUE', 'key4': 'TRUE'}
        ]

    @patch('api_accuracy_checker.run_ut.multi_run_ut.FileOpen')
    def test_split_json_file(self, mock_FileOpen):
        """split_json_file should produce num_splits chunk files covering all items."""
        # Route the module's safe FileOpen context manager to an in-memory file.
        mock_FileOpen.return_value.__enter__.return_value = mock_open(read_data=self.test_json_content).return_value
        num_splits = 2
        split_files, total_items = split_json_file(self.test_json_file, num_splits, False)
        self.assertEqual(len(split_files), num_splits)
        self.assertEqual(total_items, len(self.test_data))

    @patch('api_accuracy_checker.run_ut.multi_run_ut.print_warn_log')
    def test_signal_handler(self, mock_print_warn_log):
        """signal_handler should log a warning and surface SIGINT as KeyboardInterrupt."""
        with self.assertRaises(KeyboardInterrupt):
            signal_handler(signal.SIGINT, None)
        # NOTE(review): placed outside the assertRaises block — a statement after
        # the raising call inside the block would never execute; confirm this
        # matches the original intent.
        mock_print_warn_log.assert_called()

    @patch('subprocess.Popen')
    @patch('os.path.exists', return_value=True)
    @patch('builtins.open', new_callable=mock_open)
    @patch('json.load', side_effect=lambda f: {'key1': 'TRUE', 'key2': 'TRUE'})
    def test_run_parallel_ut(self, mock_json_load, mock_file, mock_exists, mock_popen):
        """run_parallel_ut should spawn worker processes and poll them to completion."""
        mock_process = MagicMock()
        # poll(): twice still running (None), then exit code 1 -> loop terminates.
        mock_process.poll.side_effect = [None, None, 1]
        mock_process.stdout.readline.side_effect = ['[ERROR] Test Error Message\n', '']
        mock_popen.return_value = mock_process

        config = ParallelUTConfig(
            forward_files=['forward_split1.json', 'forward_split2.json'],
            backward_files=[None, None],
            out_path='./',
            num_splits=2,
            save_error_data_flag=True,
            jit_compile_flag=False,
            device_id=[0, 1],
            result_csv_path='result.csv',
            total_items=2,
            real_data_path=None
        )

        # One in-memory split file per worker.
        mock_file.side_effect = [
            mock_open(read_data=json.dumps(self.forward_split_files_content[0])).return_value,
            mock_open(read_data=json.dumps(self.forward_split_files_content[1])).return_value
        ]

        run_parallel_ut(config)

        mock_popen.assert_called()
        mock_exists.assert_called()

    @patch('os.remove')
    @patch('os.path.realpath', side_effect=lambda x: x)
    @patch('api_accuracy_checker.run_ut.multi_run_ut.check_link')
    @patch('api_accuracy_checker.run_ut.multi_run_ut.check_file_suffix')
    @patch('api_accuracy_checker.run_ut.multi_run_ut.FileChecker')
    @patch('api_accuracy_checker.run_ut.multi_run_ut.split_json_file', return_value=(['forward_split1.json', 'forward_split2.json'], 2))
    def test_prepare_config(self, mock_split_json_file, mock_FileChecker, mock_check_file_suffix, mock_check_link, mock_realpath, mock_remove):
        """prepare_config should map CLI args onto a fully populated ParallelUTConfig."""
        mock_FileChecker_instance = MagicMock()
        mock_FileChecker_instance.common_check.return_value = './'
        mock_FileChecker.return_value = mock_FileChecker_instance
        args = MagicMock()
        args.forward_input_file = 'forward.json'
        args.backward_input_file = None
        args.out_path = './'
        args.num_splits = 2
        args.save_error_data = True
        args.jit_compile = False
        args.device_id = [0, 1]
        args.result_csv_path = None
        args.real_data_path = None

        config = prepare_config(args)

        self.assertEqual(config.num_splits, 2)
        self.assertTrue(config.save_error_data_flag)
        self.assertFalse(config.jit_compile_flag)
        self.assertEqual(config.device_id, [0, 1])
        self.assertEqual(len(config.forward_files), 2)
        self.assertEqual(config.total_items, 2)

    @patch('argparse.ArgumentParser.parse_args')
    @patch('api_accuracy_checker.run_ut.multi_run_ut.prepare_config')
    @patch('api_accuracy_checker.run_ut.multi_run_ut.run_parallel_ut')
    def test_main(self, mock_run_parallel_ut, mock_prepare_config, mock_parse_args):
        """main() should parse args, build the config and launch the parallel run."""
        main()
        mock_parse_args.assert_called()
        mock_prepare_config.assert_called()
        mock_run_parallel_ut.assert_called()
    def test_generate_cpu_params(self):
        """generate_cpu_params should rebuild the recorded args/kwargs as CPU tensors,
        promoting to float64 for the high-precision bench run and keeping grad enabled."""
        api_info = copy.deepcopy(api_info_dict)
        [api_type, api_name, _] = api_full_name.split("*")
        args, kwargs, need_grad = get_api_info(api_info, api_name, None)
        cpu_args, cpu_kwargs = generate_cpu_params(args, kwargs, True, '')
        self.assertEqual(len(cpu_args), 1)
        # float32 input is promoted to float64 on the CPU bench side.
        self.assertEqual(cpu_args[0].dtype, torch.float64)
        self.assertTrue(cpu_args[0].requires_grad)
        self.assertEqual(cpu_args[0].shape, torch.Size([2, 2560, 24, 24]))
        self.assertEqual(cpu_kwargs, {'inplace': False})

    def test_UtDataInfo(self):
        """UtDataInfo constructed with all-None arguments should expose None in every field."""
        data_info = UtDataInfo(None, None, None, None, None, None)
        self.assertIsNone(data_info.bench_grad_out)
        self.assertIsNone(data_info.device_grad_out)
        self.assertIsNone(data_info.device_out)
        self.assertIsNone(data_info.bench_out)
        self.assertIsNone(data_info.grad_in)
        self.assertIsNone(data_info.in_fwd_data_list)
class CodedException(Exception):
    """Base class for exceptions identified by an integer code.

    Subclasses define ``err_strs``, a mapping from code to a message prefix;
    the final message is ``err_strs[code] + error_info``.

    Args:
        code: one of the integer constants declared on the subclass.
        error_info: free-form detail appended to the code's prefix.
    """
    # Default so the base class itself (or an unknown code) never crashes.
    err_strs = {}

    def __init__(self, code, error_info=''):
        # BUGFIX: err_strs.get(code) returned None for unknown codes, making
        # `None + error_info` raise TypeError inside the constructor; default
        # to ''. Also call super().__init__ so Exception.args is populated.
        message = self.err_strs.get(code, '') + error_info
        super().__init__(message)
        self.code = code
        self.error_info = message

    def __str__(self):
        return self.error_info


class MsaccException(CodedException):
    """Generic msacc tool errors."""
    INVALID_PARAM_ERROR = 0
    OVERFLOW_NUMS_ERROR = 1

    err_strs = {
        INVALID_PARAM_ERROR: "[msacc] 无效参数: ",
        OVERFLOW_NUMS_ERROR: "[msacc] 超过预设溢出次数 当前溢出次数:"
    }


class FileCheckException(CodedException):
    """File validation failures (links, permissions, paths, size)."""
    INVALID_FILE_ERROR = 0
    FILE_PERMISSION_ERROR = 1
    SOFT_LINK_ERROR = 2
    ILLEGAL_PATH_ERROR = 3
    ILLEGAL_PARAM_ERROR = 4
    FILE_TOO_LARGE_ERROR = 5

    err_strs = {
        SOFT_LINK_ERROR: "[msacc] 检测到软链接: ",
        FILE_PERMISSION_ERROR: "[msacc] 文件权限错误: ",
        INVALID_FILE_ERROR: "[msacc] 无效文件: ",
        ILLEGAL_PATH_ERROR: "[msacc] 非法文件路径: ",
        ILLEGAL_PARAM_ERROR: "[msacc] 非法打开方式: ",
        FILE_TOO_LARGE_ERROR: "[msacc] 文件过大: "
    }


class ParseJsonException(CodedException):
    """Errors raised while parsing a dump json file."""
    UnexpectedNameStruct = 0
    InvalidDumpJson = 1

    err_strs = {
        UnexpectedNameStruct: "[msacc] Unexpected name in json: ",
        InvalidDumpJson: "[msacc] json格式不正确: ",
    }


class ScopeException(CodedException):
    """Invalid dump scope / api_list configuration."""
    InvalidApiStr = 0
    InvalidScope = 1
    ArgConflict = 2

    err_strs = {
        InvalidApiStr: "[msacc] Invalid api_list: ",
        InvalidScope: "[msacc] Invalid scope: ",
        ArgConflict: "[msacc] Scope and api_list conflict: ",
    }


class RepairException(CodedException):
    """Invalid repair-type configuration."""
    InvalidRepairType = 0

    err_strs = {
        InvalidRepairType: "[msacc] Invalid repair_type: "
    }


class StepException(CodedException):
    """Invalid step post-processing configuration."""
    InvalidPostProcess = 0

    err_strs = {
        InvalidPostProcess: "[msacc] 错误的step后处理配置: ",
    }


class FreeBenchmarkException(CodedException):
    """Unsupported type encountered by the free benchmark."""
    UnsupportedType = 0

    err_strs = {
        UnsupportedType: "[msacc] Free benchmark get unsupported type: "
    }
+""" +import os +import re + +from .log import print_error_log +from .exceptions import FileCheckException +from .utils import Const + + +class FileCheckConst: + """ + Class for file check const + """ + READ_ABLE = "read" + WRITE_ABLE = "write" + READ_WRITE_ABLE = "read and write" + DIRECTORY_LENGTH = 4096 + FILE_NAME_LENGTH = 255 + FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$" + PKL_SUFFIX = ".pkl" + NUMPY_SUFFIX = ".npy" + JSON_SUFFIX = ".json" + PT_SUFFIX = ".pt" + CSV_SUFFIX = ".csv" + YAML_SUFFIX = ".yaml" + MAX_PKL_SIZE = 1 * 1024 * 1024 * 1024 + MAX_NUMPY_SIZE = 10 * 1024 * 1024 * 1024 + MAX_JSON_SIZE = 1 * 1024 * 1024 * 1024 + MAX_PT_SIZE = 10 * 1024 * 1024 * 1024 + MAX_CSV_SIZE = 1 * 1024 * 1024 * 1024 + MAX_YAML_SIZE = 10 * 1024 * 1024 + DIR = "dir" + FILE = "file" + DATA_DIR_AUTHORITY = 0o750 + DATA_FILE_AUTHORITY = 0o640 + FILE_SIZE_DICT = { + PKL_SUFFIX: MAX_PKL_SIZE, + NUMPY_SUFFIX: MAX_NUMPY_SIZE, + JSON_SUFFIX: MAX_JSON_SIZE, + PT_SUFFIX: MAX_PT_SIZE, + CSV_SUFFIX: MAX_CSV_SIZE, + YAML_SUFFIX: MAX_YAML_SIZE + } + + +class FileChecker: + """ + The class for check file. + + Attributes: + file_path: The file or dictionary path to be verified. 
+ path_type: file or dictionary + ability(str): FileCheckConst.WRITE_ABLE or FileCheckConst.READ_ABLE to set file has writability or readability + file_type(str): The correct file type for file + """ + def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True): + self.file_path = file_path + self.path_type = self._check_path_type(path_type) + self.ability = ability + self.file_type = file_type + self.is_script = is_script + + @staticmethod + def _check_path_type(path_type): + if path_type not in [FileCheckConst.DIR, FileCheckConst.FILE]: + print_error_log(f'The path_type must be {FileCheckConst.DIR} or {FileCheckConst.FILE}.') + raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) + return path_type + + def common_check(self): + """ + 功能:用户校验基本文件权限:软连接、文件长度、是否存在、读写权限、文件属组、文件特殊字符 + 注意:文件后缀的合法性,非通用操作,可使用其他独立接口实现 + """ + check_path_exists(self.file_path) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + check_path_type(self.file_path, self.path_type) + self.check_path_ability() + if self.is_script: + check_path_owner_consistent(self.file_path) + check_path_pattern_vaild(self.file_path) + check_common_file_size(self.file_path) + check_file_suffix(self.file_path, self.file_type) + return self.file_path + + def check_path_ability(self): + if self.ability == FileCheckConst.WRITE_ABLE: + check_path_writability(self.file_path) + if self.ability == FileCheckConst.READ_ABLE: + check_path_readability(self.file_path) + if self.ability == FileCheckConst.READ_WRITE_ABLE: + check_path_readability(self.file_path) + check_path_writability(self.file_path) + + +class FileOpen: + """ + The class for open file by a safe way. + + Attributes: + file_path: The file or dictionary path to be opened. 
+ mode(str): The file open mode + """ + SUPPORT_READ_MODE = ["r", "rb"] + SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"] + SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"] + + def __init__(self, file_path, mode, encoding='utf-8'): + self.file_path = file_path + self.mode = mode + self.encoding = encoding + self._handle = None + + def __enter__(self): + self.check_file_path() + binary_mode = "b" + if binary_mode not in self.mode: + self._handle = open(self.file_path, self.mode, encoding=self.encoding) + else: + self._handle = open(self.file_path, self.mode) + return self._handle + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._handle: + self._handle.close() + + def check_file_path(self): + support_mode = self.SUPPORT_READ_MODE + self.SUPPORT_WRITE_MODE + self.SUPPORT_READ_WRITE_MODE + if self.mode not in support_mode: + print_error_log("File open not support %s mode" % self.mode) + raise FileCheckException(FileCheckException.ILLEGAL_PARAM_ERROR) + check_link(self.file_path) + self.file_path = os.path.realpath(self.file_path) + check_path_length(self.file_path) + self.check_ability_and_owner() + check_path_pattern_vaild(self.file_path) + if os.path.exists(self.file_path): + check_common_file_size(self.file_path) + + def check_ability_and_owner(self): + if self.mode in self.SUPPORT_READ_MODE: + check_path_exists(self.file_path) + check_path_readability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_WRITE_MODE and os.path.exists(self.file_path): + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + if self.mode in self.SUPPORT_READ_WRITE_MODE and os.path.exists(self.file_path): + check_path_readability(self.file_path) + check_path_writability(self.file_path) + check_path_owner_consistent(self.file_path) + + +def check_link(path): + abs_path = os.path.abspath(path) + if os.path.islink(abs_path): + print_error_log('The file path {} is a soft link.'.format(path)) + 
raise FileCheckException(FileCheckException.SOFT_LINK_ERROR) + + +def check_path_length(path, name_length=None): + file_max_name_length = name_length if name_length else FileCheckConst.FILE_NAME_LENGTH + if len(path) > FileCheckConst.DIRECTORY_LENGTH or \ + len(os.path.basename(path)) > file_max_name_length: + print_error_log('The file path length exceeds limit.') + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_path_exists(path): + if not os.path.exists(path): + print_error_log('The file path %s does not exist.' % path) + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_path_readability(path): + if not os.access(path, os.R_OK): + print_error_log('The file path %s is not readable.' % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_writability(path): + if not os.access(path, os.W_OK): + print_error_log('The file path %s is not writable.' % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_executable(path): + if not os.access(path, os.X_OK): + print_error_log('The file path %s is not executable.' % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_other_user_writable(path): + st = os.stat(path) + if st.st_mode & 0o002: + print_error_log('The file path %s may be insecure because other users have write permissions. ' % path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_owner_consistent(path): + file_owner = os.stat(path).st_uid + if file_owner != os.getuid(): + print_error_log('The file path %s may be insecure because is does not belong to you.' 
% path) + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR) + + +def check_path_pattern_vaild(path): + if not re.match(FileCheckConst.FILE_VALID_PATTERN, path): + print_error_log('The file path {} contains special characters.'.format(path)) + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR) + + +def check_file_size(file_path, max_size): + file_size = os.path.getsize(file_path) + if file_size >= max_size: + print_error_log(f'The size of file path {file_path} exceeds {max_size} bytes.') + raise FileCheckException(FileCheckException.FILE_TOO_LARGE_ERROR) + + +def check_common_file_size(file_path): + if os.path.isfile(file_path): + for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items(): + if file_path.endswith(suffix): + check_file_size(file_path, max_size) + break + + +def check_file_suffix(file_path, file_suffix): + if file_suffix: + if not file_path.endswith(file_suffix): + print_error_log(f"The {file_path} should be a {file_suffix} file!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + + +def check_path_type(file_path, file_type): + if file_type == FileCheckConst.FILE: + if not os.path.isfile(file_path): + print_error_log(f"The {file_path} should be a file!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + if file_type == FileCheckConst.DIR: + if not os.path.isdir(file_path): + print_error_log(f"The {file_path} should be a dictionary!") + raise FileCheckException(FileCheckException.INVALID_FILE_ERROR) + + +def create_directory(dir_path): + """ + Function Description: + creating a directory with specified permissions + Parameter: + dir_path: directory path + Exception Description: + when invalid data throw exception + """ + dir_path = os.path.realpath(dir_path) + try: + os.makedirs(dir_path, mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + except OSError as ex: + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR, + 'Failed to create {}. 
Please check the path permission or disk space .{}'.format(dir_path, str(ex))) from ex + + +def check_path_before_create(path): + if len(os.path.realpath(path)) > Const.DIRECTORY_LENGTH or len(os.path.basename(path)) > \ + Const.FILE_NAME_LENGTH: + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR, 'The file path length exceeds limit.') + + if not re.match(Const.FILE_PATTERN, os.path.realpath(path)): + raise FileCheckException(FileCheckException.ILLEGAL_PATH_ERROR, + 'The file path {} contains special characters.'.format(path)) + + +def change_mode(path, mode): + if not os.path.exists(path) or os.path.islink(path): + return + try: + os.chmod(path, mode) + except PermissionError as ex: + raise FileCheckException(FileCheckException.FILE_PERMISSION_ERROR, + 'Failed to change {} authority. {}'.format(path, str(ex))) from ex + diff --git a/debug/accuracy_tools/atat/pytorch/common/file_check_util.py b/debug/accuracy_tools/atat/pytorch/common/file_check_util.py new file mode 100644 index 0000000000000000000000000000000000000000..61fc4ddf94c8e295b08c395f21776ac0f05f5c61 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/file_check_util.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2022-2023. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +import os +import re + +from .log import print_warn_log, print_error_log + + +class FileCheckConst: + """ + Class for file check const + """ + READ_ABLE = "read" + WRITE_ABLE = "write" + READ_WRITE_ABLE = "read and write" + DIRECTORY_LENGTH = 4096 + FILE_NAME_LENGTH = 255 + FILE_VALID_PATTERN = r"^[a-zA-Z0-9_.:/-]+$" + PKL_SUFFIX = ".pkl" + NUMPY_SUFFIX = ".npy" + JSON_SUFFIX = ".json" + PT_SUFFIX = ".pt" + CSV_SUFFIX = ".csv" + YAML_SUFFIX = ".yaml" + MAX_PKL_SIZE = 1 * 1024 * 1024 * 1024 + MAX_NUMPY_SIZE = 10 * 1024 * 1024 * 1024 + MAX_JSON_SIZE = 1 * 1024 * 1024 * 1024 + MAX_PT_SIZE = 10 * 1024 * 1024 * 1024 + MAX_CSV_SIZE = 1 * 1024 * 1024 * 1024 + MAX_YAML_SIZE = 10 * 1024 * 1024 + DIR = "dir" + FILE = "file" + DATA_DIR_AUTHORITY = 0o750 + DATA_FILE_AUTHORITY = 0o640 + FILE_SIZE_DICT = { + PKL_SUFFIX: MAX_PKL_SIZE, + NUMPY_SUFFIX: MAX_NUMPY_SIZE, + JSON_SUFFIX: MAX_JSON_SIZE, + PT_SUFFIX: MAX_PT_SIZE, + CSV_SUFFIX: MAX_CSV_SIZE, + YAML_SUFFIX: MAX_YAML_SIZE + } + + +class FileCheckException(Exception): + """ + Class for File Check Exception + """ + NONE_ERROR = 0 + INVALID_PATH_ERROR = 1 + INVALID_FILE_TYPE_ERROR = 2 + INVALID_PARAM_ERROR = 3 + INVALID_PERMISSION_ERROR = 3 + + def __init__(self, code, error_info: str = ""): + super(FileCheckException, self).__init__() + self.code = code + self.error_info = error_info + + def __str__(self): + return self.error_info + + +class FileChecker: + """ + The class for check file. + + Attributes: + file_path: The file or dictionary path to be verified. 
    def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True):
        self.file_path = file_path
        # Validate path_type eagerly so misuse fails at construction time.
        self.path_type = self._check_path_type(path_type)
        self.ability = ability
        self.file_type = file_type
        self.is_script = is_script

    @staticmethod
    def _check_path_type(path_type):
        """Reject any path_type other than FileCheckConst.DIR / FileCheckConst.FILE."""
        if path_type not in [FileCheckConst.DIR, FileCheckConst.FILE]:
            print_error_log(f'The path_type must be {FileCheckConst.DIR} or {FileCheckConst.FILE}.')
            raise FileCheckException(FileCheckException.INVALID_PARAM_ERROR)
        return path_type

    def common_check(self):
        """
        Verify basic file constraints: soft links, path length, existence,
        read/write permission, file owner and special characters.
        Note: suffix validity is not a generic check; use the dedicated
        check_file_suffix interface where needed.
        """
        check_path_exists(self.file_path)
        check_link(self.file_path)
        # Resolve symlinks/.. before all subsequent checks.
        self.file_path = os.path.realpath(self.file_path)
        check_path_length(self.file_path)
        check_path_type(self.file_path, self.path_type)
        self.check_path_ability()
        if self.is_script:
            # Script files must belong to the current user.
            check_path_owner_consistent(self.file_path)
        check_path_pattern_vaild(self.file_path)
        check_common_file_size(self.file_path)
        check_file_suffix(self.file_path, self.file_type)
        return self.file_path

    def check_path_ability(self):
        # Translate the requested ability into concrete access checks.
        if self.ability == FileCheckConst.WRITE_ABLE:
            check_path_writability(self.file_path)
        if self.ability == FileCheckConst.READ_ABLE:
            check_path_readability(self.file_path)
        if self.ability == FileCheckConst.READ_WRITE_ABLE:
            check_path_readability(self.file_path)
            check_path_writability(self.file_path)
    SUPPORT_READ_MODE = ["r", "rb"]
    SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"]
    SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"]

    def __init__(self, file_path, mode, encoding='utf-8'):
        self.file_path = file_path
        self.mode = mode
        self.encoding = encoding
        self._handle = None

    def __enter__(self):
        # All safety checks run before the file is actually opened.
        self.check_file_path()
        binary_mode = "b"
        if binary_mode not in self.mode:
            self._handle = open(self.file_path, self.mode, encoding=self.encoding)
        else:
            # Binary modes must not pass an encoding.
            self._handle = open(self.file_path, self.mode)
        return self._handle

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self._handle:
            self._handle.close()

    def check_file_path(self):
        """Validate mode, link status, length, access and size before opening."""
        support_mode = self.SUPPORT_READ_MODE + self.SUPPORT_WRITE_MODE + self.SUPPORT_READ_WRITE_MODE
        if self.mode not in support_mode:
            print_error_log("File open not support %s mode" % self.mode)
            raise FileCheckException(FileCheckException.INVALID_PARAM_ERROR)
        check_link(self.file_path)
        self.file_path = os.path.realpath(self.file_path)
        check_path_length(self.file_path)
        self.check_ability_and_owner()
        check_path_pattern_vaild(self.file_path)
        if os.path.exists(self.file_path):
            check_common_file_size(self.file_path)

    def check_ability_and_owner(self):
        # Existence is only required for reading; write modes may create the file.
        if self.mode in self.SUPPORT_READ_MODE:
            check_path_exists(self.file_path)
            check_path_readability(self.file_path)
            check_path_owner_consistent(self.file_path)
        if self.mode in self.SUPPORT_WRITE_MODE and os.path.exists(self.file_path):
            check_path_writability(self.file_path)
            check_path_owner_consistent(self.file_path)
        if self.mode in self.SUPPORT_READ_WRITE_MODE and os.path.exists(self.file_path):
            check_path_readability(self.file_path)
            check_path_writability(self.file_path)
            check_path_owner_consistent(self.file_path)
def check_link(path):
    """Reject soft links to avoid symlink-redirection attacks."""
    abs_path = os.path.abspath(path)
    if os.path.islink(abs_path):
        print_error_log('The file path {} is a soft link.'.format(path))
        raise FileCheckException(FileCheckException.INVALID_PATH_ERROR)


def check_path_length(path, name_length=None):
    """Reject paths longer than the directory/basename limits."""
    file_max_name_length = name_length if name_length else FileCheckConst.FILE_NAME_LENGTH
    if len(path) > FileCheckConst.DIRECTORY_LENGTH or \
            len(os.path.basename(path)) > file_max_name_length:
        print_error_log('The file path length exceeds limit.')
        raise FileCheckException(FileCheckException.INVALID_PATH_ERROR)


def check_path_exists(path):
    """Raise if the path does not exist."""
    if not os.path.exists(path):
        print_error_log('The file path %s does not exist.' % path)
        raise FileCheckException(FileCheckException.INVALID_PATH_ERROR)


def check_path_readability(path):
    """Raise if the current process cannot read the path."""
    if not os.access(path, os.R_OK):
        print_error_log('The file path %s is not readable.' % path)
        raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR)


def check_path_writability(path):
    """Raise if the current process cannot write the path."""
    if not os.access(path, os.W_OK):
        print_error_log('The file path %s is not writable.' % path)
        raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR)


def check_path_executable(path):
    """Raise if the current process cannot execute the path."""
    if not os.access(path, os.X_OK):
        print_error_log('The file path %s is not executable.' % path)
        raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR)


def check_other_user_writable(path):
    """Ask the user to confirm when 'other' users have write permission.

    Unlike the strict variant in file_check.py, this one prompts instead of
    raising immediately.
    """
    st = os.stat(path)
    if st.st_mode & 0o002:
        _user_interactive_confirm(
            'The file path %s may be insecure because other users have write permissions. '
            'Do you want to continue?' % path)


def _user_interactive_confirm(message):
    """Prompt until the user enters 'c' (continue) or 'e' (exit -> raise)."""
    while True:
        check_message = input(message + " Enter 'c' to continue or enter 'e' to exit: ")
        if check_message == "c":
            break
        elif check_message == "e":
            print_warn_log("User canceled.")
            raise FileCheckException(FileCheckException.INVALID_PATH_ERROR)
        else:
            print("Input is error, please enter 'c' or 'e'.")


def check_path_owner_consistent(path):
    """Raise if the file is not owned by the current (effective) user."""
    file_owner = os.stat(path).st_uid
    if file_owner != os.getuid():
        print_error_log('The file path %s may be insecure because is does not belong to you.' % path)
        raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR)


def check_path_pattern_vaild(path):
    """Raise if the path contains characters outside FILE_VALID_PATTERN."""
    if not re.match(FileCheckConst.FILE_VALID_PATTERN, path):
        print_error_log('The file path {} contains special characters.'.format(path))
        raise FileCheckException(FileCheckException.INVALID_PATH_ERROR)


def check_file_size(file_path, max_size):
    """Ask for confirmation when the file is at least max_size bytes."""
    file_size = os.path.getsize(file_path)
    if file_size >= max_size:
        # NOTE(review): the two f-strings concatenate without a space
        # ("bytes.Do you want..."); confirm whether that is intended.
        _user_interactive_confirm(f'The size of file path {file_path} exceeds {max_size} bytes.'
                                  f'Do you want to continue?')


def check_common_file_size(file_path):
    """Apply the per-suffix size limit from FILE_SIZE_DICT, if any matches."""
    if os.path.isfile(file_path):
        for suffix, max_size in FileCheckConst.FILE_SIZE_DICT.items():
            if file_path.endswith(suffix):
                check_file_size(file_path, max_size)
                break


def check_file_suffix(file_path, file_suffix):
    """Raise unless file_path ends with file_suffix (no-op when suffix is falsy)."""
    if file_suffix:
        if not file_path.endswith(file_suffix):
            print_error_log(f"The {file_path} should be a {file_suffix} file!")
            raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR)


def check_path_type(file_path, file_type):
    """Raise unless the path matches the expected type (regular file or directory)."""
    if file_type == FileCheckConst.FILE:
        if not os.path.isfile(file_path):
            print_error_log(f"The {file_path} should be a file!")
            raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR)
    if file_type == FileCheckConst.DIR:
        if not os.path.isdir(file_path):
            print_error_log(f"The {file_path} should be a dictionary!")
            raise FileCheckException(FileCheckException.INVALID_FILE_TYPE_ERROR)


def create_directory(dir_path):
    """
    Function Description:
        creating a directory with specified permissions
    Parameter:
        dir_path: directory path
    Exception Description:
        when invalid data throw exception
    """
    dir_path = os.path.realpath(dir_path)
    try:
        os.makedirs(dir_path, mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True)
    except OSError as ex:
        print_error_log(
            'Failed to create {}.Please check the path permission or disk space .{}'.format(dir_path, str(ex)))
        raise FileCheckException(FileCheckException.INVALID_PATH_ERROR) from ex


def change_mode(path, mode):
    """chmod `path` to `mode`; silently skip missing paths and soft links."""
    if not os.path.exists(path) or os.path.islink(path):
        return
    try:
        os.chmod(path, mode)
    except PermissionError as ex:
        print_error_log('Failed to change {} authority. {}'.format(path, str(ex)))
        raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) from ex
{}'.format(path, str(ex))) + raise FileCheckException(FileCheckException.INVALID_PERMISSION_ERROR) from ex + diff --git a/debug/accuracy_tools/atat/pytorch/common/log.py b/debug/accuracy_tools/atat/pytorch/common/log.py new file mode 100644 index 0000000000000000000000000000000000000000..fab5aca45c08af7253dedf8ee13db10b271683da --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/log.py @@ -0,0 +1,59 @@ +import os +import time +import sys +from .utils import get_rank_if_initialized + + +def on_rank_0(func): + def func_rank_0(*args, **kwargs): + current_rank = get_rank_if_initialized() + if current_rank is None or current_rank == 0: + return func(*args, **kwargs) + + return func_rank_0 + + +def _print_log(level, msg, end='\n'): + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))) + pid = os.getpid() + full_msg = current_time + "(" + str(pid) + ")-[" + level + "]" + msg + current_rank = get_rank_if_initialized() + if current_rank is not None: + full_msg = f"[rank {current_rank}]-" + full_msg + print(full_msg, end=end) + sys.stdout.flush() + + +def print_info_log(info_msg, end='\n'): + """ + Function Description: + print info log. + Parameter: + info_msg: the info message. + """ + _print_log("INFO", info_msg, end=end) + + +def print_error_log(error_msg): + """ + Function Description: + print error log. + Parameter: + error_msg: the error message. + """ + _print_log("ERROR", error_msg) + + +def print_warn_log(warn_msg): + """ + Function Description: + print warn log. + Parameter: + warn_msg: the warning message. 
+ """ + _print_log("WARNING", warn_msg) + + +print_info_log_rank_0 = on_rank_0(print_info_log) +print_warn_log_rank_0 = on_rank_0(print_warn_log) +print_error_log_rank_0 = on_rank_0(print_error_log) diff --git a/debug/accuracy_tools/atat/pytorch/common/parse_json.py b/debug/accuracy_tools/atat/pytorch/common/parse_json.py new file mode 100644 index 0000000000000000000000000000000000000000..2dddb185c14abb7e3b6e560322aa6169708a122d --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/parse_json.py @@ -0,0 +1,37 @@ +import json +from .exceptions import ParseJsonException + + +def parse_json_info_forward_backward(json_path): + def parse_data_name_with_pattern(data_name, pattern): + name_struct = data_name.split('.') + if not name_struct[-1] == pattern: + raise ParseJsonException(ParseJsonException.UnexpectedNameStruct, + f"{data_name} in file {json_path}") + api_name = '.'.join(name_struct[:-1]) + return api_name + + with open(json_path, 'r') as f: + dump_json = json.load(f) + + real_data_path = dump_json.get("dump_path") + dump_data = dump_json.get("data") + if not dump_data: + raise ParseJsonException(ParseJsonException.InvalidDumpJson, "dump数据中没有data字段") + + forward_data = {} + backward_data = {} + for data_name, data_item in dump_data.items(): + if "Module" in data_name: + continue + if "forward" in data_name: + api_name = parse_data_name_with_pattern(data_name, "forward") + forward_data.update({api_name: data_item}) + elif "backward" in data_name: + api_name = parse_data_name_with_pattern(data_name, "backward") + backward_data.update({api_name: data_item}) + else: + raise ParseJsonException(ParseJsonException.UnexpectedNameStruct, + f"{data_name} in file {json_path}.") + + return forward_data, backward_data, real_data_path diff --git a/debug/accuracy_tools/atat/pytorch/common/recursive.py b/debug/accuracy_tools/atat/pytorch/common/recursive.py new file mode 100644 index 0000000000000000000000000000000000000000..3745a33f9eac6c1c7e8e5437ca375dc4e0f8f22a --- 
/dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/recursive.py @@ -0,0 +1,23 @@ +import torch + +_recursive_key_stack = [] +def recursive_apply_transform(args, transform): + global _recursive_key_stack + if isinstance(args, (list, tuple)): + transform_result = [] + for i, arg in enumerate(args): + _recursive_key_stack.append(str(i)) + transform_result.append(recursive_apply_transform(arg, transform)) + _recursive_key_stack.pop() + return type(args)(transform_result) + elif isinstance(args, dict): + transform_result = {} + for k, arg in args.items(): + _recursive_key_stack.append(str(k)) + transform_result[k] = recursive_apply_transform(arg, transform) + _recursive_key_stack.pop() + return transform_result + else: + arg_transform = transform(args, _recursive_key_stack) + return arg_transform + diff --git a/debug/accuracy_tools/atat/pytorch/common/utils.py b/debug/accuracy_tools/atat/pytorch/common/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3817217ef21c217bf8007d6bc1f115e0dd6bce43 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/utils.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +import os +import re +import random +import stat +import torch +import numpy as np +from functools import wraps +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + + +torch_without_guard_version_list = ['2.1'] +for version in torch_without_guard_version_list: + if torch.__version__.startswith(version): + torch_without_guard_version = True + break + else: + torch_without_guard_version = False + +if not is_gpu and not torch_without_guard_version: + from torch_npu.utils.device_guard import torch_device_guard as torch_npu_device_guard + +npu_distributed_api = ['isend', 'irecv'] + + +def parameter_adapter(func): + + @wraps(func) + def inner(self, *args, **kwargs): + if self.op_name_ == "__getitem__" and len(args) > 1 and isinstance(args[1], torch.Tensor): + input_tensor = args[0] + indices = args[1] + if indices.dtype == torch.uint8: + indices = indices.bool() + if indices.dtype == torch.bool: + if indices.shape == input_tensor.shape: + return getattr(torch._C._VariableFunctionsClass, "masked_select")(input_tensor, indices) + else: + indices = getattr(torch._C._VariableFunctionsClass, "nonzero")(indices, as_tuple=True) + return getattr(torch._C._TensorBase, "__getitem__")(input_tensor, indices) + elif indices.dtype != torch.bool: + if not indices.shape or len(indices.shape) == 1: + return func(self, input_tensor, indices.tolist()) + elif len(indices.shape) == 2: + result = [func(self, input_tensor, index) for index in indices.tolist()] + return getattr(torch._C._VariableFunctionsClass, "stack")(result, 0) + else: + res = [input_tensor[tensor_index] for tensor_index in indices] + return getattr(torch._C._VariableFunctionsClass, "stack")(res, 0) + if self.op_name_ == "__eq__" and args[1] is None: + return False + return func(self, *args, **kwargs) + return inner + + +def torch_device_guard(func): + if is_gpu or torch_without_guard_version: + return func + # Parse args/kwargs matched torch.device objects + + @torch_npu_device_guard 
+    def wrapper(*args, **kwargs):
+        return func(*args, **kwargs)
+    return wrapper
+
+
+def get_rank_if_initialized():
+    if torch.distributed.is_initialized():
+        return torch.distributed.get_rank()
+    return None
+
+
+def seed_all(seed=1234, mode=False):
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.use_deterministic_algorithms(mode)
+    if is_gpu:
+        torch.cuda.manual_seed_all(seed)
+        torch.cuda.manual_seed(seed)
+        torch.backends.cudnn.deterministic = True
+        # Fix: the cuDNN switch is 'enabled', not 'enable'; assigning
+        # 'enable' only creates an unused module attribute and leaves
+        # cuDNN active, defeating the determinism setup.
+        torch.backends.cudnn.enabled = False
+        torch.backends.cudnn.benchmark = False
+    else:
+        torch_npu.npu.manual_seed_all(seed)
+        torch_npu.npu.manual_seed(seed)
+
+
+class Const:
+    """
+    Class for const
+    """
+    SEP = "."
+    MODEL_TYPE = ['.onnx', '.pb', '.om']
+    DIM_PATTERN = r"^(-?[0-9]+)(,-?[0-9]+)*"
+    SEMICOLON = ";"
+    COLON = ":"
+    EQUAL = "="
+    COMMA = ","
+    DOT = "."
+    DUMP_RATIO_MAX = 100
+    SUMMERY_DATA_NUMS = 256
+    FLOAT_EPSILON = np.finfo(float).eps
+    SUPPORT_DUMP_MODE = ['api', 'acl']
+    ON = 'ON'
+    OFF = 'OFF'
+    KWARGS = 'kwargs'
+    INPUT = 'input'
+    OUTPUT = 'output'
+    BACKWARD = 'backward'
+    FORWARD = 'forward'
+    PRE_FORWARD = "pre_forward"
+    INPUT_ARGS = 'input_args'
+    INPUT_KWARGS = 'input_kwargs'
+    GRAD_INPUT = 'grad_input'
+    GRAD_OUTPUT = 'grad_output'
+    START = "start"
+    STOP = "stop"
+
+    # dump mode
+    ALL = "all"
+    LIST = "list"
+    RANGE = "range"
+    STACK = "stack"
+    ACL = "acl"
+    API_LIST = "api_list"
+    API_STACK = "api_stack"
+    DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK]
+    AUTO = "auto"
+    ONLINE_DUMP_MODE = [ALL, LIST, AUTO, OFF]
+    SUMMARY = "summary"
+    MD5 = "md5"
+    SUMMARY_MODE = [ALL, SUMMARY, MD5]
+
+    WRITE_FLAGS = os.O_WRONLY | os.O_CREAT
+    WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR
+
+    PKL_SUFFIX = ".pkl"
+    NUMPY_SUFFIX = ".npy"
+    ONE_GB = 1 * 1024 * 1024 * 1024
+    TEN_GB = 10 * 1024 * 1024 * 1024
+    FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$'
+    FILE_NAME_LENGTH = 255
+    DIRECTORY_LENGTH = 4096
+    
DISTRIBUTED_PREFIX_LENGTH = 60 + SUMMARY_COLUMN_NUM = 6 + STACK_COLUMN_NUM = 2 + # env dump path + ASCEND_WORK_PATH = "ASCEND_WORK_PATH" + DUMP_DIR = "dump_data" + + ENV_ENABLE = "1" + ENV_DISABLE = "0" + + MAX_SEED_VALUE = 2**32 - 1 + + INPLACE_LIST = ["broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter", + "_reduce_scatter_base", "_all_gather_base", "all_to_all_single"] \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/common/utils_compare.py b/debug/accuracy_tools/atat/pytorch/common/utils_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..374832b5d8dd7a665e23e4f271f1be2f99e1f711 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/common/utils_compare.py @@ -0,0 +1,833 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +import collections +import os +import random +import re +import shutil +import stat +import subprocess +import sys +import time +import zlib +import json +from json.decoder import JSONDecodeError +from datetime import datetime, timezone +from functools import wraps +from pathlib import Path +import numpy as np +import torch + +from .file_check_util import FileOpen, FileChecker, FileCheckConst + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +torch_without_guard_version_list = ['2.1'] +for version in torch_without_guard_version_list: + if torch.__version__.startswith(version): + torch_without_guard_version = True + break + else: + torch_without_guard_version = False + +if not is_gpu and not torch_without_guard_version: + from torch_npu.utils.device_guard import torch_device_guard as torch_npu_device_guard + +device = collections.namedtuple('device', ['type', 'index']) +prefixes = ['api_stack', 'list', 'range', 'acl'] +npu_distributed_api = ['isend', 'irecv'] + + +class Const: + """ + Class for const + """ + MODEL_TYPE = ['.onnx', '.pb', '.om'] + DIM_PATTERN = r"^(-?[0-9]+)(,-?[0-9]+)*" + SEMICOLON = ";" + COLON = ":" + EQUAL = "=" + COMMA = "," + DOT = "." 
+ DUMP_RATIO_MAX = 100 + SUMMERY_DATA_NUMS = 256 + FLOAT_EPSILON = np.finfo(float).eps + SUPPORT_DUMP_MODE = ['api', 'acl'] + ON = 'ON' + OFF = 'OFF' + BACKWARD = 'backward' + FORWARD = 'forward' + PRE_FORWARD = "pre_forward" + + # dump mode + ALL = "all" + LIST = "list" + RANGE = "range" + STACK = "stack" + ACL = "acl" + API_LIST = "api_list" + API_STACK = "api_stack" + DUMP_MODE = [ALL, LIST, RANGE, STACK, ACL, API_LIST, API_STACK] + AUTO = "auto" + ONLINE_DUMP_MODE = [ALL, LIST, AUTO, OFF] + SUMMARY = "summary" + MD5 = "md5" + SUMMARY_MODE = [ALL, SUMMARY, MD5] + + WRITE_FLAGS = os.O_WRONLY | os.O_CREAT + WRITE_MODES = stat.S_IWUSR | stat.S_IRUSR + + PKL_SUFFIX = ".pkl" + NUMPY_SUFFIX = ".npy" + ONE_GB = 1 * 1024 * 1024 * 1024 + TEN_GB = 10 * 1024 * 1024 * 1024 + FILE_PATTERN = r'^[a-zA-Z0-9_./-]+$' + FILE_NAME_LENGTH = 255 + DIRECTORY_LENGTH = 4096 + DISTRIBUTED_PREFIX_LENGTH = 60 + SUMMARY_COLUMN_NUM = 6 + STACK_COLUMN_NUM = 2 + # env dump path + ASCEND_WORK_PATH = "ASCEND_WORK_PATH" + DUMP_DIR = "dump_data" + + ENV_ENABLE = "1" + ENV_DISABLE = "0" + + MAX_SEED_VALUE = 2**32 - 1 + + INPLACE_LIST = ["broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter", + "_reduce_scatter_base", "_all_gather_base"] + + +class CompareConst: + """ + Class for compare module const + """ + # compare result column name + NPU_NAME = "NPU Name" + BENCH_NAME = "Bench Name" + NPU_DTYPE = "NPU Dtype" + BENCH_DTYPE = "Bench Dtype" + NPU_SHAPE = "NPU Tensor Shape" + BENCH_SHAPE = "Bench Tensor Shape" + NPU_MAX = "NPU max" + NPU_MIN = "NPU min" + NPU_MEAN = "NPU mean" + NPU_NORM = "NPU l2norm" + BENCH_MAX = "Bench max" + BENCH_MIN = "Bench min" + BENCH_MEAN = "Bench mean" + BENCH_NORM = "Bench l2norm" + MAX_RELATIVE_ERROR = "Max Relative Error" + MIN_RELATIVE_ERROR = "Min Relative Error" + MEAN_RELATIVE_ERROR = "Mean Relative Error" + L2NORM_RELATIVE_ERROR = "L2norm Relative Error" + MAX_DIFF = "Max diff" + MIN_DIFF = "Min diff" + MEAN_DIFF = "Mean diff" + 
NORM_DIFF = "L2norm diff" + COSINE = "Cosine" + MAX_ABS_ERR = "MaxAbsErr" + MAX_RELATIVE_ERR = "MaxRelativeErr" + ACCURACY = "Accuracy Reached or Not" + STACK = "NPU_Stack_Info" + DATA_NAME = "Data_name" + ERROR_MESSAGE = "Err_message" + ONE_THOUSANDTH_ERR_RATIO = "One Thousandth Err Ratio" + FIVE_THOUSANDTHS_ERR_RATIO = "Five Thousandths Err Ratio" + NPU_MD5 = "NPU MD5" + BENCH_MD5 = "BENCH MD5" + RESULT = "Result" + + COMPARE_RESULT_HEADER = [ + NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, COSINE, MAX_ABS_ERR, MAX_RELATIVE_ERR, + ONE_THOUSANDTH_ERR_RATIO, FIVE_THOUSANDTHS_ERR_RATIO, + NPU_MAX, NPU_MIN, NPU_MEAN, NPU_NORM, BENCH_MAX, BENCH_MIN, BENCH_MEAN, BENCH_NORM, ACCURACY, ERROR_MESSAGE + ] + + SUMMARY_COMPARE_RESULT_HEADER = [ + NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, MAX_DIFF, MIN_DIFF, MEAN_DIFF, NORM_DIFF, + MAX_RELATIVE_ERROR, MIN_RELATIVE_ERROR, MEAN_RELATIVE_ERROR, L2NORM_RELATIVE_ERROR, NPU_MAX, NPU_MIN, + NPU_MEAN, NPU_NORM, BENCH_MAX, BENCH_MIN, BENCH_MEAN, BENCH_NORM, RESULT, ERROR_MESSAGE + ] + + MD5_COMPARE_RESULT_HEADER = [ + NPU_NAME, BENCH_NAME, NPU_DTYPE, BENCH_DTYPE, NPU_SHAPE, BENCH_SHAPE, NPU_MD5, BENCH_MD5, RESULT + ] + + # compare result data + NAN = 'Nan' + NONE = 'None' + SHAPE_UNMATCH = 'shape unmatched' + DTYPE_UNMATCH = 'dtype unmatched' + PASS = 'Pass' + WARNING = 'Warning' + DIFF = 'Different' + + # accuracy standards + COS_THRESHOLD = 0.99 + MAX_ABS_ERR_THRESHOLD = 0.001 + COS_MAX_THRESHOLD = 0.9 + MAX_ABS_ERR_MAX_THRESHOLD = 1 + ACCURACY_CHECK_YES = "Yes" + ACCURACY_CHECK_NO = "No" + ACCURACY_CHECK_UNMATCH = "Unmatched" + + # error message + NO_BENCH = "No bench data matched." 
+ + # compare const + FLOAT_TYPE = [np.half, np.single, float, np.double, np.float64, np.longdouble] + + +class VersionCheck: + """ + Class for TorchVersion + """ + V1_8 = "1.8" + V1_11 = "1.11" + V2_0 = "2.0" + V2_1 = "2.1" + + @staticmethod + def check_torch_version(version): + torch_version = torch.__version__ + if torch_version.startswith(version): + return True + else: + return False + + +class CompareException(Exception): + """ + Class for Accuracy Compare Exception + """ + NONE_ERROR = 0 + INVALID_PATH_ERROR = 1 + OPEN_FILE_ERROR = 2 + CLOSE_FILE_ERROR = 3 + READ_FILE_ERROR = 4 + WRITE_FILE_ERROR = 5 + INVALID_FILE_ERROR = 6 + PERMISSION_ERROR = 7 + INDEX_OUT_OF_BOUNDS_ERROR = 8 + NO_DUMP_FILE_ERROR = 9 + INVALID_DATA_ERROR = 10 + INVALID_PARAM_ERROR = 11 + INVALID_DUMP_RATIO = 12 + INVALID_DUMP_FILE = 13 + UNKNOWN_ERROR = 14 + INVALID_DUMP_MODE = 15 + PARSE_FILE_ERROR = 16 + INVALID_COMPARE_MODE = 17 + OVER_SIZE_FILE_ERROR = 18 + INVALID_SUMMARY_MODE = 19 + INVALID_TASK_ERROR = 20 + + def __init__(self, code, error_info: str = ""): + super(CompareException, self).__init__() + self.code = code + self.error_info = error_info + + def __str__(self): + return self.error_info + + +class DumpException(CompareException): + pass + + +class OverflowConst: + """ + Class for Overflow + """ + OVERFLOW_DEBUG_MODE_ENABLE = "OVERFLOW_DEBUG_MODE_ENABLE" + OVERFLOW_ORIGINAL_MODE = 0 + OVERFLOW_DEBUG_MODE = 1 + + +def make_dump_path_if_not_exists(dump_path): + if not os.path.exists(dump_path): + try: + Path(dump_path).mkdir(mode=0o750, exist_ok=True, parents=True) + except OSError as ex: + print_error_log( + 'Failed to create {}.Please check the path permission or disk space .{}'.format(dump_path, str(ex))) + raise CompareException(CompareException.INVALID_PATH_ERROR) from ex + else: + if not os.path.isdir(dump_path): + print_error_log('{} already exists and is not a directory.'.format(dump_path)) + + +def _print_log(level, msg, end='\n'): + current_time = 
time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
+    # Fix: tag log lines with the *process* id (getpid), not the group id
+    # (getgid); this matches the sibling implementation in
+    # atat/pytorch/common/log.py.
+    pid = os.getpid()
+    print(current_time + "(" + str(pid) + ")-[" + level + "]" + msg, end=end)
+    sys.stdout.flush()
+
+
+def print_info_log(info_msg, end='\n'):
+    """
+    Function Description:
+        print info log.
+    Parameter:
+        info_msg: the info message.
+    """
+    _print_log("INFO", info_msg, end=end)
+
+
+def print_error_log(error_msg):
+    """
+    Function Description:
+        print error log.
+    Parameter:
+        error_msg: the error message.
+    """
+    _print_log("ERROR", error_msg)
+
+
+def print_warn_log(warn_msg):
+    """
+    Function Description:
+        print warn log.
+    Parameter:
+        warn_msg: the warning message.
+    """
+    _print_log("WARNING", warn_msg)
+
+
+def check_mode_valid(mode, scope=None, api_list=None):
+    if scope is None:
+        scope = []
+    if api_list is None:
+        api_list = []
+    if not isinstance(scope, list):
+        raise ValueError("scope param set invalid, it's must be a list.")
+    if not isinstance(api_list, list):
+        raise ValueError("api_list param set invalid, it's must be a list.")
+    mode_check = {
+        Const.ALL: lambda: None,
+        Const.RANGE: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end].") if len(scope) != 2 else None,
+        Const.LIST: lambda: ValueError("set_dump_switch, scope param set invalid, it's should not be an empty list.") if len(scope) == 0 else None,
+        Const.STACK: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end] or [].") if len(scope) > 2 else None,
+        Const.ACL: lambda: ValueError("set_dump_switch, scope param set invalid, only one api name is supported in acl mode.") if len(scope) != 1 else None,
+        Const.API_LIST: lambda: ValueError("Current dump mode is 'api_list', but the content of api_list parameter is empty or valid.") if len(api_list) < 1 else None,
+        Const.API_STACK: lambda: None,
+    }
+    if mode not in Const.DUMP_MODE:
+        msg = "Current mode '%s' is not supported. 
Please use the field in %s" % \ + (mode, Const.DUMP_MODE) + raise CompareException(CompareException.INVALID_DUMP_MODE, msg) + + if mode_check.get(mode)() is not None: + raise mode_check.get(mode)() + + +def check_switch_valid(switch): + if switch not in ["ON", "OFF"]: + print_error_log("Please set switch with 'ON' or 'OFF'.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + + +def check_dump_mode_valid(dump_mode): + if not isinstance(dump_mode, list): + print_warn_log("Please set dump_mode as a list.") + dump_mode = [dump_mode] + if not all(mode in ["all", "forward", "backward", "input", "output"] for mode in dump_mode): + raise ValueError("Please set dump_mode as a list containing one or more of the following: 'all', 'forward', 'backward', 'input', 'output'.") + if 'input' not in dump_mode and 'output' not in dump_mode: + dump_mode.extend(['input', 'output']) + if 'forward' not in dump_mode and 'backward' not in dump_mode: + dump_mode.extend(['forward', 'backward']) + if 'all' in dump_mode or set(["forward", "backward", "input", "output"]).issubset(set(dump_mode)): + return ["forward", "backward", "input", "output"] + return dump_mode + + +def check_summary_mode_valid(summary_mode): + if summary_mode not in Const.SUMMARY_MODE: + msg = "The summary_mode is not valid" + raise CompareException(CompareException.INVALID_SUMMARY_MODE, msg) + + +def check_summary_only_valid(summary_only): + if not isinstance(summary_only, bool): + print_error_log("Params summary_only only support True or False.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return summary_only + + +def check_compare_param(input_parma, output_path, stack_mode=False, summary_compare=False, md5_compare=False): + if not (isinstance(input_parma, dict) and isinstance(output_path, str)): + print_error_log("Invalid input parameters") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + check_file_or_directory_path(input_parma.get("npu_json_path"), False) + 
check_file_or_directory_path(input_parma.get("bench_json_path"), False)
+    check_file_or_directory_path(input_parma.get("stack_json_path"), False)
+    if not summary_compare and not md5_compare:
+        check_file_or_directory_path(input_parma.get("npu_dump_data_dir"), True)
+        check_file_or_directory_path(input_parma.get("bench_dump_data_dir"), True)
+    check_file_or_directory_path(output_path, True)
+    with FileOpen(input_parma.get("npu_json_path"), "r") as npu_json, \
+            FileOpen(input_parma.get("bench_json_path"), "r") as bench_json, \
+            FileOpen(input_parma.get("stack_json_path"), "r") as stack_json:
+        check_json_file(input_parma, npu_json, bench_json, stack_json)
+
+
+def md5_find(data):
+    for key_op in data:
+        for api_info in data[key_op]:
+            if isinstance(data[key_op][api_info], list):
+                for i in range(len(data[key_op][api_info])):
+                    # Fix: compare to None with 'is', not '==' (PEP 8; '=='
+                    # can invoke arbitrary __eq__ on dict-like entries).
+                    if data[key_op][api_info][i] is None:
+                        continue
+                    elif 'md5' in data[key_op][api_info][i]:
+                        return True
+            elif 'md5' in data[key_op][api_info]:
+                return True
+    return False
+
+
+def task_dumppath_get(input_param):
+    npu_json_path = input_param.get("npu_json_path", None)
+    bench_json_path = input_param.get("bench_json_path", None)
+    if not npu_json_path or not bench_json_path:
+        print_error_log(f"Please check the json path is valid.")
+        raise CompareException(CompareException.INVALID_PATH_ERROR)
+    with FileOpen(npu_json_path, 'r') as npu_f:
+        npu_json_data = json.load(npu_f)
+    with FileOpen(bench_json_path, 'r') as bench_f:
+        bench_json_data = json.load(bench_f)
+    if npu_json_data['task'] != bench_json_data['task']:
+        print_error_log(f"Please check the dump task is consistent.")
+        raise CompareException(CompareException.INVALID_TASK_ERROR)
+    else:
+        if npu_json_data['task'] == 'tensor':
+            summary_compare = False
+            md5_compare = False
+        elif npu_json_data['task'] == 'statistics':
+            md5_compare = md5_find(npu_json_data['data'])
+            if md5_compare:
+                summary_compare = False
+            else:
+                summary_compare = True
+        else:
+            print_error_log(f"Compare is 
not required for overflow_check.") + raise CompareException(CompareException.INVALID_TASK_ERROR) + input_param['npu_dump_data_dir'] = npu_json_data['dump_data_dir'] + input_param['bench_dump_data_dir'] = bench_json_data['dump_data_dir'] + return summary_compare, md5_compare + + +def check_configuration_param(stack_mode=False, auto_analyze=True, fuzzy_match=False): + if not (isinstance(stack_mode, bool) and isinstance(auto_analyze, bool) and isinstance(fuzzy_match, bool)): + print_error_log("Invalid input parameters which should be only bool type.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + + +def check_file_or_directory_path(path, isdir=False): + """ + Function Description: + check whether the path is valid + Parameter: + path: the path to check + isdir: the path is dir or file + Exception Description: + when invalid data throw exception + """ + if isdir: + path_checker = FileChecker(path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + else: + path_checker = FileChecker(path, FileCheckConst.FILE, FileCheckConst.READ_ABLE) + path_checker.common_check() + + +def _check_pkl(pkl_file_handle, file_name): + tensor_line = pkl_file_handle.readline() + if len(tensor_line) == 0: + print_error_log("dump file {} have empty line!".format(file_name)) + raise CompareException(CompareException.INVALID_DUMP_FILE) + pkl_file_handle.seek(0, 0) + + +def _check_json(json_file_handle, file_name): + tensor_line = json_file_handle.readline() + if len(tensor_line) == 0: + print_error_log("dump file {} have empty line!".format(file_name)) + raise CompareException(CompareException.INVALID_DUMP_FILE) + json_file_handle.seek(0, 0) + + +def is_starts_with(string, prefix_list): + return any(string.startswith(prefix) for prefix in prefix_list) + + +def check_stack_mode(pkl_fp): + api_prefix = "" + api_pattern = r'\[\"([0-9a-zA-Z_.]+_(for|back)ward)_(in|out)put(\.[0-9]+)?' 
+ is_stack_mode = False + for index, line in enumerate(pkl_fp): + if index == 0: + api_match = re.search(api_pattern, line) + api_prefix = api_match.group(1) + elif api_prefix and line.startswith(f'["{api_prefix}'): + if line.startswith(f'["{api_prefix}_stack_info'): + is_stack_mode = True + break + else: + break + pkl_fp.seek(0, 0) + return is_stack_mode + + +def check_pkl_file(input_param, npu_pkl, bench_pkl, stack_mode): + _check_pkl(npu_pkl, input_param.get("npu_pkl_path")) + _check_pkl(bench_pkl, input_param.get("bench_pkl_path")) + + npu_pkl_stack_mode = check_stack_mode(npu_pkl) + bench_pkl_stack_mode = check_stack_mode(bench_pkl) + + if not npu_pkl_stack_mode and not bench_pkl_stack_mode: + if stack_mode: + print_error_log("The current file does not contain stack information, please turn off the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + elif npu_pkl_stack_mode and bench_pkl_stack_mode: + if not stack_mode: + print_error_log("The current file contains stack information, please turn on the stack_mode") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + else: + print_error_log("The dump mode of the two files is not same, please check the dump files") + raise CompareException(CompareException.INVALID_COMPARE_MODE) + + +def check_json_file(input_param, npu_json, bench_json, stack_json): + _check_json(npu_json, input_param.get("npu_json_path")) + _check_json(bench_json, input_param.get("bench_json_path")) + _check_json(stack_json, input_param.get("stack_json_path")) + + +def check_file_size(input_file, max_size): + try: + file_size = os.path.getsize(input_file) + except OSError as os_error: + print_error_log('Failed to open "%s". %s' % (input_file, str(os_error))) + raise CompareException(CompareException.INVALID_FILE_ERROR) from os_error + if file_size > max_size: + print_error_log('The size (%d) of %s exceeds (%d) bytes, tools not support.' 
+ % (file_size, input_file, max_size)) + raise CompareException(CompareException.INVALID_FILE_ERROR) + + +def check_file_not_exists(file_path): + if os.path.exists(file_path) or os.path.islink(file_path): + remove_path(file_path) + + +def remove_path(path): + if not os.path.exists(path): + return + try: + if os.path.islink(path) or os.path.isfile(path): + os.remove(path) + else: + shutil.rmtree(path) + except PermissionError as err: + print_error_log("Failed to delete {}. Please check the permission.".format(path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) from err + + +def get_dump_data_path(dump_dir): + """ + Function Description: + traverse directories and obtain the absolute path of dump data + Parameter: + dump_dir: dump data directory + Return Value: + dump data path,file is exist or file is not exist + """ + dump_data_path = None + file_is_exist = False + + check_file_or_directory_path(dump_dir, True) + for dir_path, sub_paths, files in os.walk(dump_dir): + if len(files) != 0: + dump_data_path = dir_path + file_is_exist = True + break + dump_data_path = dir_path + return dump_data_path, file_is_exist + + +def modify_dump_path(dump_path, mode): + if mode == Const.ALL: + return dump_path + file_name = os.path.split(dump_path) + mode_file_name = mode + "_" + file_name[-1] + return os.path.join(file_name[0], mode_file_name) + + +def create_directory(dir_path): + """ + Function Description: + creating a directory with specified permissions + Parameter: + dir_path: directory path + Exception Description: + when invalid data throw exception + """ + if not os.path.exists(dir_path): + try: + os.makedirs(dir_path, mode=0o700) + except OSError as ex: + print_error_log( + 'Failed to create {}.Please check the path permission or disk space .{}'.format(dir_path, str(ex))) + raise CompareException(CompareException.INVALID_PATH_ERROR) from ex + + +def execute_command(cmd): + """ + Function Description: + run the following command + Parameter: + cmd: 
command + Exception Description: + when invalid command throw exception + """ + print_info_log('Execute command:%s' % cmd) + process = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + while process.poll() is None: + line = process.stdout.readline() + line = line.strip() + if line: + print(line) + if process.returncode != 0: + print_error_log('Failed to execute command:%s' % " ".join(cmd)) + raise CompareException(CompareException.INVALID_DATA_ERROR) + + +def save_numpy_data(file_path, data): + """ + save_numpy_data + """ + if not os.path.exists(os.path.dirname(file_path)): + os.makedirs(os.path.dirname(file_path)) + np.save(file_path, data) + + +def parse_value_by_comma(value): + """ + parse value by comma, like '1,2,4,8' + """ + value_list = [] + value_str_list = value.split(Const.COMMA) + for value_str in value_str_list: + value_str = value_str.strip() + if value_str.isdigit() or value_str == '-1': + value_list.append(int(value_str)) + else: + print_error_log("please check your input shape.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + return value_list + + +def get_data_len_by_shape(shape): + data_len = 1 + for item in shape: + if item == -1: + print_error_log("please check your input shape, one dim in shape is -1.") + return -1 + data_len = data_len * item + return data_len + + +def add_time_as_suffix(name): + return '{}_{}.csv'.format(name, time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))) + + +def get_time(): + return datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + + +def format_value(value): + return '{:.12f}'.format(value) + + +def torch_device_guard(func): + if is_gpu or torch_without_guard_version: + return func + # Parse args/kwargs matched torch.device objects + + @torch_npu_device_guard + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + return wrapper + + +def seed_all(seed=1234, mode=False): + check_seed_all(seed, mode) + random.seed(seed) + 
os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.use_deterministic_algorithms(mode)
+    if is_gpu:
+        torch.cuda.manual_seed_all(seed)
+        torch.cuda.manual_seed(seed)
+        torch.backends.cudnn.deterministic = True
+        # Fix: the cuDNN switch is 'enabled', not 'enable'; assigning
+        # 'enable' only creates an unused module attribute and leaves
+        # cuDNN active, defeating the determinism setup.
+        torch.backends.cudnn.enabled = False
+        torch.backends.cudnn.benchmark = False
+    else:
+        torch_npu.npu.manual_seed_all(seed)
+        torch_npu.npu.manual_seed(seed)
+
+
+def check_seed_all(seed, mode):
+    if isinstance(seed, int):
+        if seed < 0 or seed > Const.MAX_SEED_VALUE:
+            print_error_log(f"Seed must be between 0 and {Const.MAX_SEED_VALUE}.")
+            raise CompareException(CompareException.INVALID_PARAM_ERROR)
+    else:
+        print_error_log(f"Seed must be integer.")
+        raise CompareException(CompareException.INVALID_PARAM_ERROR)
+    if not isinstance(mode, bool):
+        print_error_log(f"seed_all mode must be bool.")
+        raise CompareException(CompareException.INVALID_PARAM_ERROR)
+
+
+def get_process_rank(model):
+    print_info_log("Rank id is not provided. Trying to get the rank id of the model.")
+    try:
+        local_device = next(model.parameters()).device
+    except StopIteration:
+        print_warn_log('There is no parameter in the model. Fail to get rank id.')
+        return 0, False
+    if local_device.type == 'cpu':
+        print_warn_log("Warning: the debugger is unable to get the rank id. "
+                       "This may cause the dumpped data to be corrupted in the "
+                       "case of distributed training. (You may ignore this if you are using only one card.) 
" + "Transfer the model to npu or gpu before register_hook() to avoid this warning.") + return 0, False + else: + return local_device.index, True + + +def parameter_adapter(func): + + @wraps(func) + def inner(self, *args, **kwargs): + if self.op_name_ == "__getitem__" and len(args) > 1 and isinstance(args[1], torch.Tensor): + input_tensor = args[0] + indices = args[1] + if indices.dtype == torch.uint8: + indices = indices.bool() + if indices.dtype == torch.bool: + if indices.shape == input_tensor.shape: + return getattr(torch._C._VariableFunctionsClass, "masked_select")(input_tensor, indices) + else: + indices = getattr(torch._C._VariableFunctionsClass, "nonzero")(indices, as_tuple=True) + return getattr(torch._C._TensorBase, "__getitem__")(input_tensor, indices) + elif indices.dtype != torch.bool: + if not indices.shape or len(indices.shape) == 1: + return func(self, input_tensor, indices.tolist()) + elif len(indices.shape) == 2: + result = [func(self, input_tensor, index) for index in indices.tolist()] + return getattr(torch._C._VariableFunctionsClass, "stack")(result, 0) + else: + res = [input_tensor[tensor_index] for tensor_index in indices] + return getattr(torch._C._VariableFunctionsClass, "stack")(res, 0) + if self.op_name_ == "__eq__" and args[1] is None: + return False + return func(self, *args, **kwargs) + return inner + + +def generate_compare_script(dump_path, pkl_file_path, dump_switch_mode): + template_path = os.path.join(os.path.dirname(__file__), "compare_script.template") + pkl_dir = os.path.dirname(pkl_file_path) + compare_script_path = os.path.join(pkl_dir, "compare_data.py") + is_api_stack = "True" if dump_switch_mode == Const.API_STACK else "False" + + try: + with FileOpen(template_path, 'r') as ftemp, \ + os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout: + code_temp = ftemp.read() + fout.write(code_temp % (pkl_file_path, dump_path, is_api_stack)) + except OSError: + print_error_log(f"Failed to open 
file. Please check file {template_path} or path {pkl_dir}.") + + print_info_log(f"Generate compare script successfully which is {compare_script_path}.") + + +def check_is_npu(): + return not is_gpu + + +def check_file_valid(file_path): + if os.path.islink(file_path): + print_error_log('The file path {} is a soft link.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if len(os.path.realpath(file_path)) > Const.DIRECTORY_LENGTH or len(os.path.basename(file_path)) > \ + Const.FILE_NAME_LENGTH: + print_error_log('The file path length exceeds limit.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not re.match(Const.FILE_PATTERN, os.path.realpath(file_path)): + print_error_log('The file path {} contains special characters.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if os.path.isfile(file_path): + file_size = os.path.getsize(file_path) + if file_path.endswith(Const.PKL_SUFFIX) and file_size > Const.ONE_GB: + print_error_log('The file {} size is greater than 1GB.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + if file_path.endswith(Const.NUMPY_SUFFIX) and file_size > Const.TEN_GB: + print_error_log('The file {} size is greater than 10GB.'.format(file_path)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def get_md5_for_tensor(x): + if x.dtype == torch.bfloat16: + x = x.float() + tensor_bytes = x.cpu().detach().numpy().tobytes() + crc32_hash = zlib.crc32(tensor_bytes) + return f"{crc32_hash:08x}" + + +def check_path_before_create(path): + if len(os.path.realpath(path)) > Const.DIRECTORY_LENGTH or len(os.path.basename(path)) > \ + Const.FILE_NAME_LENGTH: + print_error_log('The file path length exceeds limit.') + raise CompareException(CompareException.INVALID_PATH_ERROR) + + if not re.match(Const.FILE_PATTERN, os.path.realpath(path)): + print_error_log('The file path {} contains special characters.'.format(path)) + 
raise CompareException(CompareException.INVALID_PATH_ERROR) + + +def check_inplace_op(prefix): + if len(prefix) > Const.DISTRIBUTED_PREFIX_LENGTH: + return False + match_op = re.findall(r"Distributed_(.+?)_\d", prefix) + op_name = match_op[0] if match_op else None + return op_name in Const.INPLACE_LIST diff --git a/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py b/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..b67dee10d470f3d25bbabbdf12231556cfd1a823 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/compare/acc_compare.py @@ -0,0 +1,915 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import json +import multiprocessing +import os.path +import stat +import sys +import torch + +import numpy as np +import pandas as pd + +from .match import graph_mapping +from ..advisor.advisor import Advisor +from ..common.utils_compare import check_compare_param, add_time_as_suffix, \ + print_info_log, print_warn_log, print_error_log, CompareException, Const, \ + CompareConst, format_value, check_file_not_exists, check_configuration_param, \ + task_dumppath_get +from ..common.file_check_util import FileChecker, FileCheckConst, change_mode, FileOpen + + +def correct_data(result): + if result == CompareConst.NAN: + return result + if float(result) > 0.99999: + return '1.0' + return result + + +def cosine_similarity(n_value, b_value): + np.seterr(divide='ignore', invalid='ignore') + if len(n_value) == 1: + return "unsupported", "This tensor is scalar." + num = n_value.dot(b_value) + a_norm = np.linalg.norm(n_value) + b_norm = np.linalg.norm(b_value) + message = '' + if a_norm <= Const.FLOAT_EPSILON and b_norm <= Const.FLOAT_EPSILON: + result = '1.0' + elif a_norm <= Const.FLOAT_EPSILON: + message = 'Cannot compare by Cosine Similarity, All the data is Zero in npu dump data.' + result = CompareConst.NAN + elif b_norm <= Const.FLOAT_EPSILON: + message = 'Cannot compare by Cosine Similarity, All the data is Zero in Bench dump data.' + result = CompareConst.NAN + else: + cos = num / (a_norm * b_norm) + if np.isnan(cos): + message = 'Cannot compare by Cosine Similarity, the dump data has NaN.' 
+ result = CompareConst.NAN + else: + result = format_value(cos) + result = correct_data(result) + return result, message + + +def get_rmse(n_value, b_value): + if len(n_value) == 0 and len(b_value) == 0: + rmse = '0' + elif len(n_value) == 0: + rmse = CompareConst.NAN + elif len(b_value) == 0: + rmse = CompareConst.NAN + else: + rmse = np.linalg.norm(n_value - b_value) / np.sqrt(len(n_value)) + if np.isnan(rmse): + rmse = CompareConst.NAN + return rmse, "" + + +def get_mape(n_value, b_value): + if len(n_value) == 0 and len(b_value) == 0: + mape = '0' + elif len(n_value) == 0: + mape = CompareConst.NAN + elif len(b_value) == 0: + mape = CompareConst.NAN + elif not np.all(n_value) and not np.all(b_value): + mape = '0' + elif not np.all(b_value): + mape = CompareConst.NAN + else: + mape_val = np.sum(np.abs((n_value - b_value) / b_value)) / len(b_value) * 100 + mape = CompareConst.NAN if np.isnan(mape_val) else str(round(mape_val, 4)) + '%' + return mape, "" + + +def get_max_abs_err(n_value, b_value): + temp_res = n_value - b_value + max_value = np.max(np.abs(temp_res)) + return format_value(max_value), "" + + +def get_relative_err(n_value, b_value): + np.seterr(divide='ignore', invalid='ignore') + if b_value.dtype in CompareConst.FLOAT_TYPE: + zero_mask = (b_value == 0) + b_value[zero_mask] += np.finfo(b_value.dtype).eps + n_value[zero_mask] += np.finfo(b_value.dtype).eps + else: + n_value, b_value = n_value.astype(float), b_value.astype(float) + zero_mask = (b_value == 0) + b_value[zero_mask] += np.finfo(float).eps + n_value[zero_mask] += np.finfo(float).eps + relative_err = np.divide((n_value - b_value), b_value) + return np.abs(relative_err) + + +def get_max_relative_err(n_value, b_value, input_relative_err=None): + if input_relative_err is None: + relative_err = get_relative_err(n_value, b_value) + else: + relative_err = input_relative_err + max_relative_err = np.max(np.abs(relative_err)) + if np.isnan(max_relative_err): + message = 'Cannot compare by 
MaxRelativeError, the data contains nan in dump data.' + return CompareConst.NAN, message + return format_value(max_relative_err), "" + + +def rel_err_ratio(relative_err, threshold): + return format_value(np.sum(relative_err < threshold) / np.size(relative_err)) + + +def check_graph_mode(a_op_name, b_op_name): + if "Aten" in a_op_name and "Aten" not in b_op_name: + return True + if "Aten" not in a_op_name and "Aten" in b_op_name: + return True + return False + + +def check_op(npu_dict, bench_dict, fuzzy_match): + a_op_name = npu_dict["op_name"] + b_op_name = bench_dict["op_name"] + graph_mode = check_graph_mode(a_op_name[0], b_op_name[0]) + if graph_mode: + return graph_mapping.match(a_op_name[0], b_op_name[0]) + struct_match = check_struct_match(npu_dict, bench_dict) + if not fuzzy_match: + return a_op_name == b_op_name and struct_match + is_match = True + try: + is_match = fuzzy_check_op(a_op_name, b_op_name) + except Exception as err: + print_warn_log("%s and %s can not fuzzy match." % (a_op_name, b_op_name)) + is_match = False + return is_match and struct_match + + +def check_struct_match(npu_dict, bench_dict): + npu_struct_in = npu_dict.get("input_struct") + bench_struct_in = bench_dict.get("input_struct") + npu_struct_out = npu_dict.get("output_struct") + bench_struct_out = bench_dict.get("output_struct") + is_match = npu_struct_in == bench_struct_in and npu_struct_out == bench_struct_out + if not is_match: + if len(npu_struct_in) == 0 or len(bench_struct_in) == 0 or len(npu_struct_in) != len(bench_struct_in): + return False + struct_in_is_match = check_type_shape_match(npu_struct_in, bench_struct_in) + struct_out_is_match = check_type_shape_match(npu_struct_out, bench_struct_out) + is_match = struct_in_is_match and struct_out_is_match + return is_match + + +def check_type_shape_match(npu_struct, bench_struct): + shape_type_match = False + for npu_type_shape, bench_type_shape in zip(npu_struct, bench_struct): + npu_type = npu_type_shape[0] + npu_shape = 
npu_type_shape[1] + bench_type = bench_type_shape[0] + bench_shape = bench_type_shape[1] + shape_match = npu_shape == bench_shape + type_match = npu_type == bench_type + if not type_match: + if [npu_type, bench_type] in [["torch.float16", "torch.float32"], ["torch.float32", "torch.float16"], + ["torch.float16", "torch.bfloat16"], ["torch.bfloat16", "torch.float16"]]: + type_match = True + else: + type_match = False + shape_type_match = shape_match and type_match + if not shape_type_match: + return False + return shape_type_match + + +def fuzzy_check_op(npu_name_list, bench_name_list): + if len(npu_name_list) == 0 or len(bench_name_list) == 0 or len(npu_name_list) != len(bench_name_list): + return False + is_match = True + for npu_name, bench_name in zip(npu_name_list, bench_name_list): + is_match = fuzzy_check_name(npu_name, bench_name) + if not is_match: + break + return is_match + + +def fuzzy_check_name(npu_name, bench_name): + if "forward" in npu_name and "forward" in bench_name: + is_match = rename_api(npu_name, "forward") == rename_api(bench_name, "forward") + elif "backward" in npu_name and "backward" in bench_name: + is_match = rename_api(npu_name, "backward") == rename_api(bench_name, "backward") + else: + is_match = npu_name == bench_name + return is_match + + +def rename_api(npu_name, process): + npu_split = npu_name.split(process) + torch_func_index, in_out = npu_split[0], npu_split[1] + torch_func_split = torch_func_index.rsplit("_", 2) + torch_func = str(torch_func_split[0]) + str(in_out) + return torch_func + + +def merge_tensor(tensor_list, summary_compare, md5_compare): + op_dict = {} + op_dict["op_name"] = [] + op_dict["input_struct"] = [] + op_dict["kwargs_struct"] = [] + op_dict["output_struct"] = [] + op_dict["summery"] = [] + op_dict["stack_info"] = [] + + all_mode_bool = summary_compare == False and md5_compare == False + if all_mode_bool: + op_dict["data_name"] = [] + + for tensor in tensor_list: + if len(tensor) == 2: + 
op_dict['stack_info'].append(tensor['full_info']) + break + op_dict["op_name"].append(tensor['full_op_name']) + if not md5_compare: + if tensor['full_op_name'].find("input") != -1: + op_dict["input_struct"].append((tensor['dtype'], tensor['shape'])) + elif tensor['full_op_name'].find("kwarg") != -1: + op_dict["kwargs_struct"].append((tensor['dtype'], tensor['shape'])) + elif tensor['full_op_name'].find("output") != -1: + op_dict["output_struct"].append((tensor['dtype'], tensor['shape'])) + else: + if tensor['full_op_name'].find("input") != -1: + op_dict["input_struct"].append((tensor['dtype'], tensor['shape'], tensor['md5'])) + elif tensor['full_op_name'].find("kwarg") != -1: + op_dict["kwargs_struct"].append((tensor['dtype'], tensor['shape'], tensor['md5'])) + elif tensor['full_op_name'].find("output") != -1: + op_dict["output_struct"].append((tensor['dtype'], tensor['shape'], tensor['md5'])) + + op_dict["summery"].append([tensor['Max'], tensor['Min'], tensor['Mean'], tensor['Norm']]) + + if all_mode_bool: + op_dict["data_name"].append(tensor['data_name']) + + if not op_dict["kwargs_struct"]: + del op_dict["kwargs_struct"] + return op_dict + + +def match_op(npu_queue, bench_queue, fuzzy_match): + for b_index, b_op in enumerate(bench_queue[0: -1]): + if check_op(npu_queue[-1], b_op, fuzzy_match): + return len(npu_queue) - 1, b_index + if check_op(npu_queue[-1], bench_queue[-1], fuzzy_match): + return len(npu_queue) - 1, len(bench_queue) - 1 + for n_index, n_op in enumerate(npu_queue[0: -1]): + if check_op(n_op, bench_queue[-1], fuzzy_match): + return n_index, len(bench_queue) - 1 + return -1, -1 + + +def get_accuracy(result, n_dict, b_dict, summary_compare=False, md5_compare=False): + def get_accuracy_core(n_start, n_len, b_start, b_len, key): + min_len = min(n_len, b_len) + npu_stack_info = n_dict.get("stack_info", None) + bench_stack_info = b_dict.get("stack_info", None) + has_stack = npu_stack_info and bench_stack_info + + all_mode_bool = summary_compare == 
False and md5_compare == False + if all_mode_bool: + npu_data_name = n_dict.get("data_name", None) + bench_data_name = b_dict.get("data_name", None) + has_data_name = False + + for index in range(min_len): + if all_mode_bool: + has_data_name = npu_data_name[n_start + index] and bench_data_name[b_start + index] + + n_name = n_dict['op_name'][n_start + index] + b_name = b_dict['op_name'][b_start + index] + n_struct = n_dict[key][index] + b_struct = b_dict[key][index] + err_msg = "" + if md5_compare: + result_item = [n_name, b_name, n_struct[0], b_struct[0], n_struct[1], b_struct[1], + n_struct[2], b_struct[2], CompareConst.PASS if n_struct[2] == b_struct[2] else CompareConst.DIFF] + if has_stack and index == 0 and key == "input_struct": + result_item.extend(npu_stack_info) + else: + result_item.append(CompareConst.NONE) + if all_mode_bool and has_data_name: + result_item.append(npu_data_name[n_start + index]) + result.append(result_item) + continue + + if summary_compare: + result_item = [n_name, b_name, n_struct[0], b_struct[0], n_struct[1], b_struct[1], + " ", " ", " ", " ", " ", " ", " ", " "] + else: + result_item = [n_name, b_name, n_struct[0], b_struct[0], n_struct[1], b_struct[1], + " ", " ", " ", " ", " "] + + npu_summery_data = n_dict.get("summery")[n_start + index] + result_item.extend(npu_summery_data) + bench_summery_data = b_dict.get("summery")[b_start + index] + result_item.extend(bench_summery_data) + + if summary_compare: + start_idx = CompareConst.SUMMARY_COMPARE_RESULT_HEADER.index(CompareConst.MAX_DIFF) + warning_flag = False + for i, (npu_val, bench_val) in enumerate(zip(npu_summery_data, bench_summery_data)): + if isinstance(npu_val, (float, int)) and isinstance(bench_val, (float, int)): + diff = npu_val - bench_val + if bench_val != 0: + relative = str(abs((diff/bench_val) * 100)) + '%' + else: + relative = "N/A" + result_item[start_idx + i] = diff + result_item[start_idx + i + 4] = relative + magnitude_diff = abs(diff) / (max(abs(npu_val), 
abs(bench_val)) + 1e-10) + if magnitude_diff > 0.5: + warning_flag = True + else: + result_item[start_idx + i] = CompareConst.NONE + accuracy_check = CompareConst.WARNING if warning_flag else "" + err_msg += "Need double check api accuracy." if warning_flag else "" + result_item[start_idx:] = [f'{str(x)}\t' if str(x) in ('inf', '-inf', 'nan') else x for x in result_item[start_idx:]] + + result_item.append(accuracy_check if summary_compare else CompareConst.ACCURACY_CHECK_YES) + result_item.append(err_msg) + if has_stack and index == 0 and key == "input_struct": + result_item.extend(npu_stack_info) + else: + result_item.append(CompareConst.NONE) + if all_mode_bool and has_data_name: + result_item.append(npu_data_name[n_start + index]) + + result.append(result_item) + + if n_len > b_len: + for index in range(b_len, n_len): + if all_mode_bool: + has_data_name = npu_data_name[n_start + index] and bench_data_name[b_start + index] + + n_name = n_dict['op_name'][n_start + index] + n_struct = n_dict[key][index] + if md5_compare: + result_item = [n_name, CompareConst.NAN, n_struct[0], CompareConst.NAN, + n_struct[1], CompareConst.NAN, n_struct[2], CompareConst.NAN, CompareConst.NAN] + result.append(result_item) + continue + result_item = [n_name, CompareConst.NAN, n_struct[0], CompareConst.NAN, + n_struct[1], CompareConst.NAN, " ", " ", " ", " ", " "] + summery_data = n_dict.get("summery")[n_start + index] + result_item.extend(summery_data) + summery_data = [CompareConst.NAN for _ in range(len(n_dict.get("summery")[0]))] + result_item.extend(summery_data) + + err_msg = "" + result_item.append(CompareConst.ACCURACY_CHECK_YES) + result_item.append(err_msg) + + if has_stack and index == 0 and key == "input_struct": + result_item.extend(npu_stack_info) + else: + result_item.append(CompareConst.NONE) + if all_mode_bool and has_data_name: + result_item.append(npu_data_name[n_start + index]) + + result.append(result_item) + + n_num = len(n_dict['op_name']) + b_num = 
len(b_dict['op_name'])
+    n_num_input = len([name for name in n_dict['op_name'] if 'input' in name])
+    b_num_input = len([name for name in b_dict['op_name'] if 'input' in name])
+    n_num_kwarg = len([name for name in n_dict['op_name'] if 'kwarg' in name])
+    b_num_kwarg = len([name for name in b_dict['op_name'] if 'kwarg' in name])
+    n_num_output = n_num - n_num_input - n_num_kwarg
+    b_num_output = b_num - b_num_input - b_num_kwarg
+    get_accuracy_core(0, n_num_input, 0, b_num_input, 'input_struct')
+    get_accuracy_core(n_num_input, n_num_kwarg, b_num_input, b_num_kwarg, "kwargs_struct")
+    get_accuracy_core(n_num_input + n_num_kwarg, n_num_output, b_num_input + b_num_kwarg, b_num_output, 'output_struct')
+
+
+def _do_multi_process(input_parma, result_path):
+    try:
+        _handle_multi_process(compare_ops, input_parma, result_path, multiprocessing.Manager().RLock())
+    except FileNotFoundError as error:
+        print("File not found. compare failed!")
+        return
+    except IOError as error:
+        print("IOError. 
compare failed!") + return + + +def read_dump_path(result_path): + try: + csv_pd = pd.read_csv(result_path) + npu_dump_name_list = csv_pd.iloc[0:, 0].tolist() + npu_dump_tensor_list = csv_pd.iloc[0:, -1].tolist() + # bench_dump_name_list = csv_pd.iloc[0:, 1].tolist() + op_name_mapping_dict = {} + for index, _ in enumerate(npu_dump_name_list): + npu_dump_name = npu_dump_name_list[index] + npu_dump_tensor = npu_dump_tensor_list[index] + # bench_dump_name = bench_dump_name_list[index] + op_name_mapping_dict[npu_dump_name] = [npu_dump_tensor, npu_dump_tensor] + return op_name_mapping_dict + except FileNotFoundError as e: + print_error_log('{} file is not found.'.format(result_path)) + raise CompareException(CompareException.OPEN_FILE_ERROR) from e + except IOError as e: + print_error_log('{} read csv failed.'.format(result_path)) + raise CompareException(CompareException.READ_FILE_ERROR) from e + + +def _handle_multi_process(func, input_parma, result_path, lock): + process_num = int((multiprocessing.cpu_count() + 1) / 2) + op_name_mapping_dict = read_dump_path(result_path) + op_names = [] + for _ in range(process_num): + op_names.append([]) + all_op_names = list(op_name_mapping_dict.keys()) + for i, op_name in enumerate(all_op_names): + op_names[i % process_num].append(op_name) + all_tasks = [] + pool = multiprocessing.Pool(process_num) + + def err_call(args): + print_error_log('multiprocess compare failed! 
Reason: {}'.format(args)) + try: + pool.terminate() + if os.path.exists(result_path): + os.remove(result_path) + except OSError as e: + print_error_log("pool terminate failed") + + for process_idx, fusion_op_names in enumerate(op_names): + idx = [process_num, process_idx] + task = pool.apply_async(func, + args=(idx, fusion_op_names, op_name_mapping_dict, result_path, lock, input_parma), + error_callback=err_call) + all_tasks.append(task) + pool.close() + pool.join() + + +def compare_ops(idx, fusion_op_names, dump_path_dict, result_path, lock, input_parma): + cos_result = [] + max_err_result = [] + max_relative_err_result = [] + err_mess = [] + one_thousand_err_ratio_result = [] + five_thousand_err_ratio_result = [] + is_print_compare_log = input_parma.get("is_print_compare_log") + for i, op_name in enumerate(fusion_op_names): + if is_print_compare_log: + print("start compare: {}".format(op_name)) + + if op_name == '-1': + cos_sim = max_abs_err = max_relative_err = err_msg = one_thousand_err_ratio = five_thousand_err_ratio = CompareConst.NONE + else: + cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio = compare_by_op(op_name, dump_path_dict, input_parma) + + if is_print_compare_log: + print("[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, one_thousand_err_ratio {}, five_thousand_err_ratio {}".format(op_name, cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio)) + cos_result.append(cos_sim) + max_err_result.append(max_abs_err) + max_relative_err_result.append(max_relative_err) + err_mess.append(err_msg) + one_thousand_err_ratio_result.append(one_thousand_err_ratio) + five_thousand_err_ratio_result.append(five_thousand_err_ratio) + _save_cmp_result(idx, cos_result, max_err_result, max_relative_err_result, err_mess, one_thousand_err_ratio_result, + five_thousand_err_ratio_result, result_path, lock) + + +def _save_cmp_result(idx, cos_result, 
max_err_result, max_relative_err_result, err_msg, one_thousand_err_ratio_result, five_thousand_err_ratio_result, result_path, lock): + lock.acquire() + try: + csv_pd = pd.read_csv(result_path, dtype=str) + process_num = idx[0] + process_idx = idx[1] + for i, _ in enumerate(cos_result): + process_index = i * process_num + process_idx + csv_pd.loc[process_index, CompareConst.COSINE] = cos_result[i] + csv_pd.loc[process_index, CompareConst.MAX_ABS_ERR] = max_err_result[i] + csv_pd.loc[process_index, CompareConst.MAX_RELATIVE_ERR] = max_relative_err_result[i] + csv_pd.loc[process_index, CompareConst.ERROR_MESSAGE] = err_msg[i] + csv_pd.loc[process_index, CompareConst.ACCURACY] = check_accuracy(cos_result[i], max_err_result[i]) + csv_pd.loc[process_index, CompareConst.ONE_THOUSANDTH_ERR_RATIO] = one_thousand_err_ratio_result[i] + csv_pd.loc[process_index, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO] = five_thousand_err_ratio_result[i] + csv_pd.to_csv(result_path, index=False) + except FileNotFoundError as e: + print_error_log('{} file is not found.'.format(result_path)) + raise CompareException(CompareException.OPEN_FILE_ERROR) from e + except IOError as e: + print_error_log('{} read csv failed.'.format(result_path)) + raise CompareException(CompareException.READ_FILE_ERROR) from e + finally: + lock.release() + + +def check_accuracy(cos, max_abs_err): + if cos == CompareConst.SHAPE_UNMATCH: + return CompareConst.ACCURACY_CHECK_UNMATCH + if cos == CompareConst.NONE or max_abs_err == CompareConst.NONE: + return CompareConst.NONE + if cos == "N/A" or max_abs_err == "N/A": + return CompareConst.ACCURACY_CHECK_NO + try: + cos, max_abs_err = float(cos), float(max_abs_err) + except ValueError: + print_warn_log("Cosine or MaxAbsErr can not get float value.") + return CompareConst.NONE + if cos < CompareConst.COS_THRESHOLD and max_abs_err > CompareConst.MAX_ABS_ERR_THRESHOLD: + return CompareConst.ACCURACY_CHECK_NO + if cos < CompareConst.COS_MAX_THRESHOLD or max_abs_err > 
CompareConst.MAX_ABS_ERR_MAX_THRESHOLD: + return CompareConst.ACCURACY_CHECK_NO + return CompareConst.ACCURACY_CHECK_YES + + +def compare_by_op(op_name, op_name_mapping_dict, input_parma): + npu_bench_name_list = op_name_mapping_dict[op_name] + data_name = npu_bench_name_list[1] + if data_name == '-1' or data_name == -1: + return CompareConst.NONE, CompareConst.NONE, CompareConst.NONE, CompareConst.NO_BENCH, CompareConst.NONE, CompareConst.NONE + try: + n_path = os.path.join(input_parma.get("npu_dump_data_dir"), npu_bench_name_list[0]) + b_path = os.path.join(input_parma.get("bench_dump_data_dir"), npu_bench_name_list[1]) + n_path_checker = FileChecker(n_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + b_path_checker = FileChecker(b_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + n_path = n_path_checker.common_check() + b_path = b_path_checker.common_check() + n_value = torch.load(n_path, map_location=torch.device('cpu')).detach().numpy() + b_value = torch.load(b_path, map_location=torch.device('cpu')).detach().numpy() + except IOError as error: + return CompareConst.NAN, CompareConst.NAN, CompareConst.NAN, "Dump file: {} not found.".format(error.filename), CompareConst.NAN, CompareConst.NAN + relative_err = get_relative_err(n_value, b_value) + if len(n_value.shape) == 0: + if n_value.dtype == bool: + n_value = n_value.astype(float) + b_value = b_value.astype(float) + max_abs_err, _ = get_max_abs_err(n_value, b_value) + max_relative_err, _ = get_max_relative_err(n_value, b_value, input_relative_err=relative_err) + return "unsupported", max_abs_err, max_relative_err, "This is type of scalar data, can not compare.", CompareConst.NAN, CompareConst.NAN + if n_value.size == 0: + return "unsupported", 0, 0, "This is empty data, can not compare.", 0, 0 + if n_value.shape != b_value.shape: + return CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH, "Shape of 
NPU and bench Tensor do not match. Skipped.", CompareConst.SHAPE_UNMATCH, CompareConst.SHAPE_UNMATCH + if n_value.dtype != b_value.dtype: + print_warn_log("Dtype of NPU and bench Tensor do not match: {}".format(op_name)) + err_msg = " Dtype of NPU and bench Tensor do not match." + else: + err_msg = "" + + n_value, b_value = handle_inf_nan(n_value, b_value) + if n_value is CompareConst.NAN or b_value is CompareConst.NAN: + return "N/A", "N/A", "N/A", "The position of inf or nan in NPU and bench Tensor do not match.", "N/A", "N/A" + + n_value = n_value.reshape(-1).astype(float) + b_value = b_value.reshape(-1).astype(float) + err_msg = "" + cos_sim, message = cosine_similarity(n_value, b_value) + + abs_err = np.abs(n_value - b_value) + max_abs_err = format_value(np.max(abs_err)) + max_relative_err, message = get_max_relative_err(n_value, b_value, input_relative_err=relative_err) + one_thousand_err_ratio = rel_err_ratio(relative_err, 0.001) + five_thousand_err_ratio = rel_err_ratio(relative_err, 0.005) + + if not err_msg: + err_msg += message + else: + err_msg = err_msg + ' ' + message + + if npu_bench_name_list[0] != npu_bench_name_list[1]: + err_msg += " Fuzzy matching data, the comparison accuracy may be affected." 
+ return cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio + + +def handle_inf_nan(n_value, b_value): + n_inf = np.isinf(n_value) + b_inf = np.isinf(b_value) + n_nan = np.isnan(n_value) + b_nan = np.isnan(b_value) + if np.any(n_inf) or np.any(b_inf) or np.any(n_nan) or np.any(b_nan): + if np.array_equal(n_inf, b_inf) and np.array_equal(n_nan, b_nan): + n_value[n_inf] = 0 + b_value[b_inf] = 0 + n_value[n_nan] = 0 + b_value[b_nan] = 0 + else: + return CompareConst.NAN, CompareConst.NAN + return n_value, b_value + + +def compare(input_parma, output_path, stack_mode=False, auto_analyze=True, + fuzzy_match=False): + try: + summary_compare, md5_compare = task_dumppath_get(input_parma) + check_configuration_param(stack_mode, auto_analyze, fuzzy_match) + check_compare_param(input_parma, output_path, stack_mode, summary_compare, md5_compare) + except CompareException as error: + print_error_log('Compare failed. Please check the arguments and do it again!') + sys.exit(error.code) + compare_core(input_parma, output_path, stack_mode=stack_mode, + auto_analyze=auto_analyze, fuzzy_match=fuzzy_match, summary_compare=summary_compare, + md5_compare=md5_compare) + + +def compare_core(input_parma, output_path, stack_mode=False, auto_analyze=True, + suffix='', fuzzy_match=False, summary_compare=False, md5_compare=False): + print_info_log("Please check whether the input data belongs to you. 
If not, there may be security risks.") + file_name = add_time_as_suffix("compare_result" + suffix) + file_path = os.path.join(os.path.realpath(output_path), file_name) + check_file_not_exists(file_path) + + with FileOpen(input_parma.get("npu_json_path"), "r") as npu_json, \ + FileOpen(input_parma.get("bench_json_path"), "r") as bench_json, \ + FileOpen(input_parma.get("stack_json_path"), "r") as stack_json, \ + os.fdopen(os.open(file_path, os.O_RDWR | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP), 'w+') \ + as fout: + compare_process([npu_json, bench_json, stack_json, fout], stack_mode, fuzzy_match, summary_compare, md5_compare) + if summary_compare: + print_info_log(f"Summary compare result is {file_path}") + + if not md5_compare and not summary_compare: + _do_multi_process(input_parma, file_path) + change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY) + if auto_analyze: + advisor = Advisor(file_path, output_path) + advisor.analysis() + + +def parse(pkl_file, module_name_prefix): + if not isinstance(module_name_prefix, str): + print_error_log("The parameter:module_name_prefix is not a string.") + raise CompareException(CompareException.INVALID_PARAM_ERROR) + with FileOpen(pkl_file, "r") as f: + done = False + title_printed = False + while not done: + pkl_line = f.readline() + if pkl_line == '\n': + continue + if len(pkl_line) == 0: + done = True + break + + msg = json.loads(pkl_line) + info_prefix = msg[0] + if not info_prefix.startswith(module_name_prefix): + continue + + if info_prefix.find("stack_info") != -1: + print("\nTrace back({}):".format(msg[0])) + for item in reversed(msg[1]): + print(" File \"{}\", line {}, in {}".format(item[0], item[1], item[2])) + print(" {}".format(item[3])) + continue + if len(msg) > 5: + summery_info = " [{}][dtype: {}][shape: {}][max: {}][min: {}][mean: {}]" \ + .format(msg[0], msg[3], msg[4], msg[5][0], msg[5][1], msg[5][2]) + if not title_printed: + print("\nStatistic Info:") + title_printed = True + 
print(summery_info) + + +def op_item_parse(item, op_name, index, item_list=[], top_bool=True): + if item == None or (isinstance(item, dict) and len(item) == 0): + if not top_bool: + tmp = {'full_op_name': op_name + '.' + str(index), 'Max': None, 'Min': None, 'Mean': None, 'Norm': None, 'dtype': None, 'shape': None, 'md5': None, 'data_name': '-1'} + else: + tmp = {'full_op_name': op_name + '.0', 'Max': None, 'Min': None, 'Mean': None, 'Norm': None, 'dtype': None, 'shape': None, 'md5': None, 'data_name': '-1'} + item_list.append(tmp) + return item_list + if index == None: + if isinstance(item, dict): + full_op_name = op_name + '.0' + else: + full_op_name = op_name + else: + full_op_name = op_name + '.' + str(index) + if isinstance(item, dict): + if 'dtype' in item: + parsed_item = item + parsed_item['full_op_name'] = full_op_name + item_list.append(parsed_item) + else: + parsed_item = {} + if item['type'] == 'slice': + parsed_item['full_op_name'] = full_op_name + parsed_item['dtype'] = 'slice' + parsed_item['shape'] = str(np.shape(np.array(item['value']))) + parsed_item['md5'] = None + parsed_item['Max'] = None + parsed_item['Min'] = None + parsed_item['Mean'] = None + parsed_item['Norm'] = None + parsed_item['data_name'] = '-1' + item_list.append(parsed_item) + else: + parsed_item['full_op_name'] = full_op_name + parsed_item['dtype'] = str(type(item['value'])) + parsed_item['shape'] = '[]' + parsed_item['md5'] = None + parsed_item['Max'] = item['value'] + parsed_item['Min'] = item['value'] + parsed_item['Mean'] = item['value'] + parsed_item['Norm'] = item['value'] + parsed_item['data_name'] = '-1' + item_list.append(parsed_item) + else: + for j in range(len(item)): + op_item_parse(item[j], full_op_name, j, top_bool=False) + return item_list + + +def read_op(op_data, op_name): + op_parsed_list = [] + if 'forward' in op_name: + if 'input_args' in op_data: + input_item = op_data['input_args'] + input_parsed_list = op_item_parse(input_item, op_name + '_input', None) + 
def compare_process(file_handles, stack_mode, fuzzy_match, summary_compare=False, md5_compare=False):
    """Stream NPU and bench dump json files, match ops pairwise and write the
    comparison table to the open CSV handle.

    Args:
        file_handles: ``[npu_json, bench_json, stack_json, output_csv]`` open
            file objects.
        stack_mode: include the stack-info column in the output.
        fuzzy_match: allow inexact op-name matching (handled by ``match_op``).
        summary_compare: compare summary statistics only.
        md5_compare: compare md5 digests only.
    """
    npu_json_handle, bench_json_handle, stack_json_handle, output_csv_handle = file_handles
    npu_json_data = json.load(npu_json_handle)
    bench_json_data = json.load(bench_json_handle)
    stack_json_data = json.load(stack_json_handle)

    if fuzzy_match:
        print_warn_log("This task uses fuzzy matching, which may affect the accuracy of the comparison.")

    npu_ops_queue = []
    bench_ops_queue = []
    result = []

    ops_npu_iter = iter(npu_json_data['data'])
    ops_bench_iter = iter(bench_json_data['data'])
    read_err_npu = True
    read_err_bench = True

    while True:
        # Stop once either iterator was exhausted on the previous pass.
        # NOTE(review): when the NPU side exhausts first, remaining bench ops
        # are dropped without output rows — preserved from the original logic.
        if not read_err_npu or not read_err_bench:
            break
        try:
            op_name_npu = next(ops_npu_iter)
            read_err_npu = True

            npu_op_data = npu_json_data['data'][op_name_npu]
            npu_op_parsed_list = read_op(npu_op_data, op_name_npu)
            if op_name_npu in stack_json_data:
                npu_op_parsed_list.append({'full_op_name': op_name_npu, 'full_info': stack_json_data[op_name_npu]})
            else:
                npu_op_parsed_list.append({'full_op_name': op_name_npu, 'full_info': None})

            npu_ops_queue.append(merge_tensor(npu_op_parsed_list, summary_compare, md5_compare))
        except StopIteration:
            read_err_npu = False
            continue
        try:
            op_name_bench = next(ops_bench_iter)
            read_err_bench = True

            bench_op_data = bench_json_data['data'][op_name_bench]
            bench_op_parsed_list = read_op(bench_op_data, op_name_bench)
            if op_name_bench in stack_json_data:
                bench_op_parsed_list.append(
                    {'full_op_name': op_name_bench, 'full_info': stack_json_data[op_name_bench]})
            else:
                bench_op_parsed_list.append({'full_op_name': op_name_bench, 'full_info': None})

            bench_ops_queue.append(merge_tensor(bench_op_parsed_list, summary_compare, md5_compare))
        except StopIteration:
            read_err_bench = False
            continue

        if not npu_ops_queue or not bench_ops_queue:
            break

        n_match_point, b_match_point = match_op(npu_ops_queue, bench_ops_queue, fuzzy_match)
        if n_match_point == -1 and b_match_point == -1:
            continue
        n_match_data = npu_ops_queue[n_match_point]
        b_match_data = bench_ops_queue[b_match_point]
        # NPU ops queued before the match point have no bench counterpart.
        un_match_data = npu_ops_queue[0: n_match_point]
        for npu_data in un_match_data:
            get_un_match_accuracy(result, npu_data, md5_compare, summary_compare)
        get_accuracy(result, n_match_data, b_match_data, summary_compare, md5_compare)
        del npu_ops_queue[0: n_match_point + 1]
        del bench_ops_queue[0: b_match_point + 1]
    # Flush any NPU ops that never found a bench match.
    if npu_ops_queue:
        for npu_data in npu_ops_queue:
            get_un_match_accuracy(result, npu_data, md5_compare, summary_compare)

    if md5_compare:
        header = CompareConst.MD5_COMPARE_RESULT_HEADER[:]
    elif summary_compare:
        header = CompareConst.SUMMARY_COMPARE_RESULT_HEADER[:]
    else:
        header = CompareConst.COMPARE_RESULT_HEADER[:]

    # "All mode" = full tensor compare (neither summary nor md5); it carries an
    # extra data-name column, and stack info is appended last when requested.
    all_mode_bool = not summary_compare and not md5_compare
    if stack_mode:
        header.append(CompareConst.STACK)
        if all_mode_bool:
            header.append(CompareConst.DATA_NAME)
    else:
        if all_mode_bool:
            # Drop the stack column from each row but keep the data-name column.
            for row in result:
                del row[-2]
            header.append(CompareConst.DATA_NAME)
        else:
            for row in result:
                del row[-1]

    result_df = pd.DataFrame(result, columns=header)
    result_df.to_csv(output_csv_handle, index=False)


def get_un_match_accuracy(result, n_dict, md5_compare, summary_compare):
    """Append placeholder comparison rows for an NPU op with no bench match.

    Bench-side columns are filled with NaN and the error message records that
    bench data is missing.
    """
    index_out = 0
    npu_stack_info = n_dict.get("stack_info", None)
    bench_name, bench_type, bench_shape = CompareConst.NAN, CompareConst.NAN, CompareConst.NAN
    err_msg = CompareConst.NO_BENCH
    accuracy_check_res = CompareConst.NAN
    for index, n_name in enumerate(n_dict["op_name"]):
        if "input" in n_name:
            n_struct = n_dict["input_struct"][index]
        else:
            n_struct = n_dict["output_struct"][index_out]
            index_out += 1

        result_item = [n_name, bench_name, n_struct[0], bench_type, n_struct[1], bench_shape]
        if md5_compare:
            result_item.extend([CompareConst.NAN] * 3)
            if npu_stack_info and index == 0:
                result_item.extend(npu_stack_info)
            result.append(result_item)
            continue
        if summary_compare:
            result_item.extend([CompareConst.NAN] * 4)
        else:
            result_item.extend([CompareConst.NAN] * 5)
        # NPU-side summary statistics, then NaN placeholders for the bench side.
        summery_data = n_dict.get("summery")[index]
        result_item.extend(summery_data)
        result_item.extend([CompareConst.NAN] * 4)
        result_item.append(accuracy_check_res)
        result_item.append(err_msg)
        if npu_stack_info and index == 0:
            result_item.extend(npu_stack_info)
        result.append(result_item)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import sys
import re
from ..common.utils_compare import print_error_log, CompareException, check_compare_param, \
    check_file_or_directory_path, check_configuration_param, task_dumppath_get
from .acc_compare import compare_core


def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs):
    """Compare NPU and bench dump data rank by rank.

    Rank directories from the two dump roots are sorted and matched by order;
    each pair is compared with ``compare_core`` using a ``_{npu}-{bench}``
    output suffix.

    Args:
        npu_dump_dir: dump root containing ``rank<N>`` subdirectories (NPU run).
        bench_dump_dir: dump root containing ``rank<N>`` subdirectories (bench run).
        output_path: directory for comparison results.
        **kwargs: forwarded to ``compare_core`` (``stack_mode``, ``auto_analyze``,
            ``fuzzy_match``); ``suffix`` is rejected because the per-rank suffix
            is generated here.

    Raises:
        CompareException: on unexpected directory contents, mismatched rank
            counts, or an explicit ``suffix`` argument.
    """
    def check_and_return_dir_contents(dump_dir, prefix):
        # Validate that every entry is named '<prefix>' or '<prefix><rank_id>'.
        check_file_or_directory_path(dump_dir, True)
        contents = os.listdir(dump_dir)
        # Bug fix: the original pattern '(?:0|[0-9][1-9]*)' rejected any
        # multi-digit rank id containing a zero after the first digit
        # (e.g. 'rank10'). '(?:0|[1-9][0-9]*)' accepts 0 and any decimal
        # number without leading zeros.
        pattern = re.compile(rf'^{prefix}(?:0|[1-9][0-9]*)?$')
        for name in contents:
            if pattern.match(name) is None:
                msg = (f"dump_dir contains '{name}'. Expected '{prefix}'. This name is not in the format of dump output. "
                       f"Please check and delete irrelevant files in {dump_dir} and try again.")
                print_error_log(msg)
                raise CompareException(CompareException.INVALID_PATH_ERROR)
        return contents

    def extract_json(dirname, stack_json=False):
        # Pick the stack json (stack_json=True) or the data json (stack_json=False)
        # from a rank directory; any '.json' file is a fallback candidate.
        json_path = ''
        for fname in os.listdir(dirname):
            full_path = os.path.join(dirname, fname)
            if full_path.endswith('.json'):
                json_path = full_path
                if not stack_json and 'stack' not in json_path:
                    break
                if stack_json and 'stack' in json_path:
                    break

        # Provide robustness on invalid directory inputs
        if not json_path:
            print_error_log(f'No file is found in dump dir {dirname}. ')
            raise CompareException(CompareException.NO_DUMP_FILE_ERROR)
        return json_path

    if kwargs.get('suffix'):
        print_error_log("Argument 'suffix' is not supported for compare_distributed.")
        raise CompareException(CompareException.INVALID_PARAM_ERROR)
    stack_mode = kwargs.get('stack_mode', False)
    auto_analyze = kwargs.get('auto_analyze', True)
    fuzzy_match = kwargs.get('fuzzy_match', False)
    # get the ranks and match by order
    npu_ranks = sorted(check_and_return_dir_contents(npu_dump_dir, 'rank'))
    bench_ranks = sorted(check_and_return_dir_contents(bench_dump_dir, 'rank'))
    if len(npu_ranks) != len(bench_ranks):
        print_error_log('The number of ranks in the two runs are different. '
                        'Unable to match the ranks. Please use another folder to compare '
                        'or use compare() api and manually match the ranks.')
        raise CompareException(CompareException.INVALID_PATH_ERROR)
    for nr, br in zip(npu_ranks, bench_ranks):
        n_dir = os.path.join(npu_dump_dir, nr)
        b_dir = os.path.join(bench_dump_dir, br)
        s_dir = b_dir  # stack info lives alongside the bench dump
        npu_json_path = extract_json(n_dir, stack_json=False)
        bench_json_path = extract_json(b_dir, stack_json=False)
        stack_json_path = extract_json(s_dir, stack_json=True)

        dump_result_param = {
            'npu_json_path': npu_json_path,
            'bench_json_path': bench_json_path,
            'stack_json_path': stack_json_path,
            'is_print_compare_log': True
        }
        try:
            summary_compare, md5_compare = task_dumppath_get(dump_result_param)
            check_configuration_param(stack_mode, auto_analyze, fuzzy_match)
            check_compare_param(dump_result_param, output_path, stack_mode=stack_mode, summary_compare=summary_compare)
        except CompareException as error:
            print_error_log('Compare failed. Please check the arguments and do it again!')
            sys.exit(error.code)
        compare_core(dump_result_param, output_path, suffix=f'_{nr}-{br}', summary_compare=summary_compare,
                     md5_compare=md5_compare, **kwargs)
_foreach_copy_ +_foreach_lerp_: _foreach_lerp_ +_foreach_maximum: _foreach_maximum +_foreach_mul: _foreach_mul +_foreach_neg_: _foreach_neg_ +_foreach_pow: _foreach_pow +_foreach_reciprocal_: _foreach_reciprocal_ +_foreach_sign: _foreach_sign +_foreach_sqrt: _foreach_sqrt +_foreach_sqrt_: _foreach_sqrt_ +_foreach_sub: _foreach_sub +_fused_adam: FusedAdam +_linalg_det: det +_linalg_eigh: eigh +_linalg_slogdet: slogdet +_linalg_svd: svd +_list_to_tensor: as_tensor +_log_softmax: log_softmax +_native_batch_norm_legit: batch_norm +_nested_tensor_from_tensor_list: _nested_tensor_from_tensor_list +_pdist_forward: pdist +_pin_memory: pin_memory +_reshape_alias: reshape +_resize_output_: resize_ +_softmax: softmax +_to_copy: to +abs: abs +abs_: abs_ +absolute: abs +absolute_: abs_ +acos: acos +acos_: acos_ +acosh: acosh +acosh_: acosh_ +adaptive_max_pool2d: adaptive_max_pool2d +adaptive_max_pool3d: adaptive_max_pool3d +add: add +add_: add_ +addbmm: addbmm +addbmm_: addbmm_ +addcdiv: addcdiv +addcdiv_: addcdiv_ +addcmul: addcmul +addcmul_: addcmul_ +addmm: addmm +addmm_: addmm_ +addmv: addmv +addmv_: addmv_ +addr: addr +affine_grid_generator: affine_grid +alias: alias +all: all +alpha_dropout: AlphaDropout +amax: amax +amin: amin +aminmax: aminmax +angle: angle +any: any +arange: arange +arccos: acos +arccos_: arccos_ +arccosh: arccosh +arccosh_: arccosh_ +arcsin: asin +arcsin_: arcsin_ +arcsinh: asinh +arcsinh_: arcsinh_ +arctan: atan +arctan2: atan2 +arctan2_: arctan2_ +arctan_: arctan_ +arctanh: arctanh +arctanh_: arctanh_ +argmax: argmax +argmin: argmin +argsort: argsort +as_strided: as_strided +asin: asin +asin_: asin_ +asinh: asinh +asinh_: asinh_ +atan: atan +atan2: atan2 +atan2_: atan2_ +atan_: atan_ +atanh: atanh +atanh_: atanh_ +avg_pool2d: avg_pool2d +avg_pool3d: avg_pool3d +baddbmm: baddbmm +baddbmm_: baddbmm_ +bernoulli: bernoulli +bernoulli_: bernoulli_ +binary_cross_entropy: BCELoss +binary_cross_entropy_with_logits: binary_cross_entropy_with_logits 
+bitwise_and: bitwise_and +bitwise_and_: bitwise_and_ +bitwise_left_shift: __lshift__ +bitwise_left_shift_: bitwise_left_shift_ +bitwise_not: bitwise_not +bitwise_not_: bitwise_not_ +bitwise_or: bitwise_or +bitwise_or_: bitwise_or_ +bitwise_right_shift: __rshift__ +bitwise_right_shift_: bitwise_right_shift_ +bitwise_xor: bitwise_xor +bitwise_xor_: bitwise_xor_ +bmm: bmm +broadcast_tensors: broadcast_tensors +bucketize: bucketize +cat: cat +cauchy: Cauchy +cauchy_: cauchy_ +ceil: ceil +ceil_: ceil_ +celu: celu +celu_: celu_ +cholesky: cholesky +cholesky_inverse: cholesky_inverse +cholesky_solve: cholesky_solve +clamp: clamp +clamp_: clamp_ +clamp_max: clamp_max +clamp_max_: clamp_max_ +clamp_min: clamp_min +clamp_min_: clamp_min_ +clip: clip +clip_: clip_ +clone: clone +col2im: col2im +complex: complex +conj_physical: conj +conj_physical_: conj_ +constant_pad_nd: pad +convolution: Conv2d +copy: copy_ +copy_: copy_ +copysign: copysign +copysign_: copysign_ +cos: cos +cos_: cos_ +cosh: cosh +cosh_: cosh_ +count_nonzero: count_nonzero +cudnn_batch_norm: BatchNorm2d +cummax: cummax +cummin: cummin +cumprod: cumprod +cumprod_: cumprod_ +cumsum: cumsum +cumsum_: cumsum_ +deg2rad: deg2rad +deg2rad_: deg2rad_ +detach: detach +diag: diag +diag_embed: diag_embed +diagonal: diagonal +diagonal_copy: diagonal +diagonal_scatter: diagonal +digamma: digamma +digamma_: digamma_ +dist: dist +div: div +div_: div_ +divide: div +divide_: divide_ +dot: dot +dropout: dropout +elu: ELU +elu_: elu_ +embedding: embedding +empty_like: empty_like +empty_strided: empty_strided +eq: eq +eq_: eq_ +erf: erf +erf_: erf_ +erfc: erfc +erfc_: erfc_ +erfinv: erfinv +erfinv_: erfinv_ +exp: exp +exp2: exp2 +exp2_: exp2_ +exp_: exp_ +expand: expand +expm1: expm1 +expm1_: expm1_ +exponential: Exponential +exponential_: exponential_ +eye: eye +fft_fft: fft +fft_fft2: fft2 +fft_fftn: fftn +fft_fftshift: fftshift +fft_hfft: hfft +fft_hfft2: hfft2 +fft_hfftn: hfftn +fft_ifft: ifft +fft_ifft2: ifft2 +fft_ifftn: 
ifftn +fft_ifftshift: ifftshift +fft_ihfft: ihfft +fft_ihfft2: ihfft2 +fft_ihfftn: ifftn +fft_irfft: irfft +fft_irfft2: irfft2 +fft_irfftn: irfftn +fft_rfft: rfft +fft_rfft2: rfft2 +fft_rfftn: rfftn +fill: fill_ +fill_: fill_ +fix: fix +fix_: fix_ +flip: flip +float_power_: float_power_ +floor: floor +floor_: floor_ +floor_divide: floor_divide +floor_divide_: floor_divide_ +fmax: fmax +fmin: fmin +fmod: fmod +fmod_: fmod_ +frac: frac +frac_: frac_ +full: full +full_like: full_like +gather: gather +gcd: gcd +gcd_: gcd_ +ge: ge +ge_: ge_ +gelu: GELU +gelu_: gelu_ +geometric: Geometric +geometric_: geometric_ +glu: glu +greater: gt +greater_: ge_ +greater_equal: ge +greater_equal_: ge_ +grid_sampler_2d: grid_sample +grid_sampler_3d: grid_sample +gru: GRU +gt: gt +gt_: gt_ +hardshrink: Hardshrink +hardsigmoid: hardsigmoid +hardsigmoid_: hardsigmoid_ +hardswish: hardswish +hardswish_: hardswish_ +hardtanh: hardtanh +hardtanh_: hardtanh_ +heaviside: heaviside +heaviside_: heaviside_ +hinge_embedding_loss: HingeEmbeddingLoss +huber_loss: huber_loss +hypot: hypot +hypot_: hypot_ +i0: i0 +i0_: i0_ +igamma: igamma +igamma_: igamma_ +igammac: igammac +igammac_: igammac_ +index: __getitem__ +index_add: index_add +index_add_: index_add_ +index_copy: index_copy_ +index_copy_: index_copy_ +index_fill: index_fill_ +index_fill_: index_fill_ +index_put: index_put_ +index_put_: index_put_ +index_reduce: index_select +index_select: index_select +is_pinned: is_pinned +is_same_size: is_same_size +isinf: isinf +isnan: isnan +isneginf: isneginf +isposinf: isposinf +istft: istft +item: item +lcm: lcm +lcm_: lcm_ +le: le +le_: le_ +leaky_relu: LeakyReLU +leaky_relu_: leaky_relu_ +lerp: lerp +lerp_: lerp_ +less: less +less_: less_ +less_equal: le +less_equal_: less_equal_ +lgamma: lgamma +lgamma_: lgamma_ +linalg_cholesky_ex: cholesky +linalg_cross: cross +linalg_householder_product: householder_product +linalg_inv_ex: inv +linalg_ldl_factor_ex: ldl +linalg_ldl_solve: ldl_solve +linalg_lu: 
lu +linalg_lu_factor_ex: lu_factor +linalg_lu_solve: lu_solve +linalg_matrix_exp: matrix_exp +linalg_qr: qr +linalg_solve_triangular: solve +linalg_vector_norm: norm +linspace: linspace +log: log +log10: log10 +log10_: log10_ +log1p: log1p +log1p_: log1p_ +log2: log2 +log2_: log2_ +log_: log_ +log_normal: LogNormal +log_sigmoid_forward: log_sigmoid +logaddexp: logaddexp +logaddexp2: logaddexp2 +_native_batch_norm_legit_functional: batch_norm +logcumsumexp: logcumsumexp +logical_and: logical_and +logical_and_: logical_and_ +logical_not: logical_not +logical_not_: logical_not_ +logical_or: logical_or +logical_or_: logical_or_ +logical_xor: logical_xor +logical_xor_: logical_xor_ +logit: logit +logit_: logit_ +logspace: logspace +logsumexp: logsumexp +lstm: LSTM +lt: lt +lt_: lt_ +lu_unpack: lu_unpack +margin_ranking_loss: margin_ranking_loss +masked_fill: masked_fill +masked_fill_: masked_fill_ +matmul: matmul +max: max +max_pool2d_with_indices: MaxPool2d +max_pool3d_with_indices: MaxPool3d +max_unpool2d: MaxUnpool2d +max_unpool3d: max_unpool3d +maximum: maximum +mean: mean +median: median +meshgrid: meshgrid +min: min +minimum: minimum +mish: Mish +mish_: mish_ +mm: mm +mode: mode +mse_loss: mse_loss +mul: mul +mul_: mul_ +multi_margin_loss: MultiMarginLoss +multilabel_margin_loss_forward: multilabel_margin_loss +multinomial: multinomial +multiply: multiply +multiply_: mul_ +mv: mv +mvlgamma: mvlgamma +mvlgamma_: mvlgamma_ +name: name +nan_to_num: nan_to_num +nan_to_num_: nan_to_num_ +nanmedian: nanmedian +nansum: nansum +narrow_copy: narrow +native_batch_norm: BatchNorm2d +native_dropout: dropout +native_group_norm: group_norm +native_layer_norm: LayerNorm +ne: ne +ne_: ne_ +neg: neg +neg_: neg_ +negative: neg +negative_: neg_ +new_empty: new_empty +new_empty_strided: new_empty_strided +new_full: new_full +new_ones: new_ones +new_zeros: new_zeros +nextafter: nextafter +nextafter_: nextafter_ +nll_loss: nll_loss +nll_loss2d_forward: NLLLoss2d +nll_loss_forward: 
NLLLoss +nonzero_static: nonzero +norm: norm +normal: normal +normal_: normal_ +not_equal: ne +not_equal_: ne_ +ones: ones +ones_like: ones_like +ormqr: ormqr +pairwise_distance: pairwise_distance +pdist: pdist +permute: permute +pin_memory: pin_memory +pixel_shuffle: PixelShuffle +polar: polar +polygamma: polygamma +positive: positive +pow: pow +pow_: pow_ +prelu: prelu +prod: prod +quantized_gru: GRU +quantized_lstm: LSTM +rad2deg: rad2deg +rad2deg_: rad2deg_ +rand: rand +rand_like: rand_like +randint: randint +randint_like: randint_like +randn: randn +randn_like: randn_like +randperm: randperm +reciprocal: reciprocal +reciprocal_: reciprocal_ +reflection_pad1d: reflection_pad1d +reflection_pad2d: reflection_pad2d +reflection_pad3d: ReflectionPad3d +relu: relu +relu6: relu6 +relu_: relu_ +remainder: remainder +remainder_: remainder_ +renorm: renorm +renorm_: renorm_ +repeat: repeat +repeat_interleave: repeat_interleave +replication_pad1d: ReplicationPad1d +replication_pad2d: replication_pad2d +replication_pad3d: replication_pad3d +resize_as_: resize_as_ +rnn_relu: RNN +rnn_tanh: RNN +roll: roll +rot90: rot90 +round: round +round_: round_ +rrelu_with_noise: RReLU +rrelu_with_noise_: rrelu_with_noise +rsqrt: rsqrt +rsqrt_: rsqrt_ +rsub: rsub +scalar_tensor: scalar_tensor +scatter: scatter_ +scatter_: scatter_ +scatter_add: scatter_add +scatter_add_: scatter_add_ +searchsorted: searchsorted +select: select +selu: selu +selu_: selu_ +sgn: sgn +sgn_: sgn_ +sigmoid: sigmoid +sigmoid_: sigmoid_ +sign: sign +sign_: sign_ +signbit: signbit +silu: silu +silu_: silu_ +sin: sin +sin_: sin_ +sinc: sinc +sinc_: sinc_ +sinh: sinh +sinh_: sinh_ +slice: slice +smooth_l1_loss: smooth_l1_loss +soft_margin_loss: soft_margin_loss +softplus: softplus +softshrink: softshrink +sort: sort +special_airy_ai: airy_ai +special_bessel_j0: j0 +special_bessel_j1: j1 +special_bessel_y0: y0 +special_bessel_y1: y1 +special_chebyshev_polynomial_t: chebyshev_t +special_chebyshev_polynomial_u: 
chebyshev_u +special_entr: entr +special_erfcx: erfcx +special_hermite_polynomial_h: hermite +special_hermite_polynomial_he: he +special_i0: i0 +special_i0e: i0e +special_i1: i1 +special_i1e: i1e +special_laguerre_polynomial_l: laguerre_l +special_log_ndtr: log_ndtr +special_modified_bessel_i0: i0 +special_modified_bessel_i1: i1 +special_modified_bessel_k0: k0 +special_modified_bessel_k1: i1 +special_ndtr: ndtr +special_ndtri: ndtri +special_scaled_modified_bessel_k0: i0e +special_scaled_modified_bessel_k1: scaled_modified_bessel_k1 +special_spherical_bessel_j0: spherical_jn +special_xlog1py: xlog1py +special_zeta: zeta +split: split +split_with_sizes: split +sqrt: sqrt +sqrt_: sqrt_ +square: square +square_: square_ +squeeze: squeeze +stack: stack +std: std +std_mean: std_mean +stft: stft +sub: sub +sub_: sub_ +subtract: sub +subtract_: subtract_ +sum: sum +t: t +t_: t_ +take: take +tan: tan +tan_: tan_ +tanh: tanh +tanh_: tanh_ +threshold: threshold +threshold_: threshold_ +to: to +topk: topk +trace: trace +transpose: transpose +transpose_: transpose_ +triangular_solve: triangular_solve +tril: tril +tril_: tril_ +tril_indices: tril_indices +triu: triu +triu_: triu_ +triu_indices: triu_indices +true_divide: true_divide +true_divide_: true_divide_ +trunc: trunc +trunc_: trunc_ +unbind: unbind +unfold: unfold +uniform: Uniform +uniform_: uniform_ +unsafe_chunk: unsafe_chunk +unsafe_split: split +unsafe_split_with_sizes: split_with_sizes +unsqueeze: unsqueeze +unsqueeze_: unsqueeze_ +upsample_bicubic2d: interpolate +upsample_bilinear2d: upsample_bilinear +upsample_nearest1d: interpolate +upsample_nearest2d: interpolate +upsample_nearest3d: interpolate +var: var +var_mean: var_mean +vdot: vdot +view: view +where: where +xlogy: xlogy +xlogy_: xlogy_ +zero: zeros +zero_: zero_ +zeros: zeros +zeros_like: zeros_like + + + diff --git a/debug/accuracy_tools/atat/pytorch/compare/match.py b/debug/accuracy_tools/atat/pytorch/compare/match.py new file mode 100644 index 
import os
import yaml
from ..common.file_check_util import FileOpen
from ..common.utils_compare import CompareException, print_error_log


class AtenIrMapping():
    """Matches aten-level dump op names against torch-level op names using the
    aten->torch table in mapping.yaml (located next to this module)."""

    def __init__(self):
        cur_path = os.path.dirname(os.path.realpath(__file__))
        yaml_path = os.path.join(cur_path, "mapping.yaml")
        with FileOpen(yaml_path, 'r') as f:
            self.aten_mapping = yaml.safe_load(f)

    def match(self, op1, op2):
        """Return True when the two dump op names refer to the same operator.

        Normalizes argument order so match_op always receives (aten_op, torch_op).
        """
        if "Aten" in op1 and "Aten" not in op2:
            return self.match_op(op1, op2)
        else:
            return self.match_op(op2, op1)

    def match_op(self, aten_op, torch_op):
        """Map the aten op's raw name through mapping.yaml and compare it
        (case-insensitively) with the torch op's raw name.

        Raises:
            CompareException: INVALID_DATA_ERROR when either name does not
                follow the '<framework>_<raw_name>_<id>_<direction>_<io>'
                dump format.
        """
        try:
            # Strip the framework prefix and the trailing three fields, then
            # drop any '.overload' suffix from the aten raw name.
            aten_op_raw_name_overload = '_'.join(aten_op.split("_")[1:-3])
            aten_op_raw_name = aten_op_raw_name_overload.split('.')[0]
            torch_op_raw_name = '_'.join(torch_op.split("_")[1:-3]).lower()
        except IndexError as e:
            err_msg = f"Dump op name format error: {aten_op}, {torch_op}. Your dump data may be corrupted."
            # Bug fix: INVALID_DATA_ERROR is an error code, not a callable —
            # the original `raise CompareException.INVALID_DATA_ERROR(err_msg)`
            # would itself fail with TypeError. Log and raise properly instead.
            print_error_log(err_msg)
            raise CompareException(CompareException.INVALID_DATA_ERROR) from e
        matching_op = self.aten_mapping.get(aten_op_raw_name)
        if matching_op is None:
            return False
        if matching_op.lower() == torch_op_raw_name:
            return True
        return False


graph_mapping = AtenIrMapping()
None + self.on_step_end = None + self.repair_type = None + + if self.task == "free_benchmark": + self.fuzz_device = task_config.fuzz_device if task_config.fuzz_device else 'npu' + self.handler_type = task_config.handler_type if task_config.handler_type else 'check' + self.pert_mode = task_config.pert_mode if task_config.pert_mode else 'improve_precision' + self.fuzz_level = task_config.fuzz_level if task_config.fuzz_level else 'L1' + self.fuzz_stage = task_config.fuzz_stage if task_config.fuzz_stage else 'forward' + self.preheat_config = { + "if_preheat": task_config.if_preheat if task_config.if_preheat is not None else True, + "preheat_step": task_config.preheat_step if task_config.preheat_step else 15, + "max_sample": task_config.max_sample if task_config.max_sample else 20, + } + + self.check() + if self.step: + self.step.sort() + seed_all(self.seed, self.is_deterministic) + + def check_kwargs(self): + if self.task and self.task not in Const.TASK_LIST: + raise Exception("task is invalid") + if self.level and self.level not in Const.LEVEL_LIST: + raise Exception("level is invalid") + if not self.dump_path: + raise Exception("Invalid dump path, please check your config") + + def check(self): + self.check_kwargs() + self._check_rank() + self._check_step() + return True + + def _check_rank(self): + if self.rank: + for rank_id in self.rank: + if not isinstance(rank_id, int) or rank_id < 0: + raise ValueError(f"rank {self.rank} must be an integer and greater than or equal to 0.") + else: + print_warn_log_rank_0(f"Rank argument is provided. 
Only rank {self.rank} data will be dumpped.") + + def _check_step(self): + if self.step: + for s in self.step: + if not isinstance(s, int) or s < 0: + raise ValueError(f"step element {s} must be an integer and greater than or equal to 0.") + + def check_model(self, model): + if self.level in ["L0", "mix"] and not model: + raise Exception( + f"For level {self.level}, PrecisionDebugger must receive a model argument.", + ) \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/debugger/precision_debugger.py b/debug/accuracy_tools/atat/pytorch/debugger/precision_debugger.py new file mode 100644 index 0000000000000000000000000000000000000000..03d11988271adb226f24a65d2bcbe5f3b4ed3cc5 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/debugger/precision_debugger.py @@ -0,0 +1,72 @@ +import torch +from .debugger_config import DebuggerConfig +from ..service import Service +from ..common import print_warn_log_rank_0 +from ..pt_config import parse_json_config +from ..common.exceptions import MsaccException + + +class PrecisionDebugger: + _instance = None + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super(PrecisionDebugger, cls).__new__(cls) + cls._instance.config = None + cls._instance.enable_dataloader = False + return cls._instance + + def __init__( + self, + config_path=None, + task=None, + dump_path=None, + level=None, + model=None, + step=None, + ): + if not hasattr(self, "initialized"): + self.initialized = True + self.model = self.check_model_valid(model) + common_config, task_config = parse_json_config(config_path, task) + if step: + common_config.step = step + self.config = DebuggerConfig( + common_config, task_config, task, dump_path, level + ) + self.config.check_model(self.model) + self.service = Service(self.config) + + @classmethod + def start(cls): + instance = cls._instance + if not instance: + raise Exception("No instance of PrecisionDebugger found.") + if instance.enable_dataloader: + 
print_warn_log_rank_0("DataLoader is enabled, start() skipped.") + else: + instance.service.start(instance.model) + + @classmethod + def stop(cls): + instance = cls._instance + if not instance: + raise Exception("PrecisionDebugger instance is not created.") + if instance.enable_dataloader: + print_warn_log_rank_0("DataLoader is enabled, stop() skipped.") + else: + instance.service.stop() + + @classmethod + def step(cls): + if not cls._instance: + raise Exception("PrecisionDebugger instance is not created.") + cls._instance.service.step() + + @staticmethod + def check_model_valid(model): + if not model or isinstance(model, torch.nn.Module): + return model + raise MsaccException( + MsaccException.INVALID_PARAM_ERROR, "model 参数必须是torch.nn.Module类型。" + ) diff --git a/debug/accuracy_tools/atat/pytorch/doc/FAQ.md b/debug/accuracy_tools/atat/pytorch/doc/FAQ.md new file mode 100644 index 0000000000000000000000000000000000000000..daaa79abd956f7a585b6d76a45812c4e7b4fc6ae --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/FAQ.md @@ -0,0 +1,202 @@ +# 精度预检工具 + +1. 预检工具在dump和run_ut的过程中,是否需要同时开启或关闭jit编译(jit_compile)? + + 答:是。 + +2. 预检工具对于type_as这类涉及数据类型转换操作的API,是否具有参考性? + + 由于这类API在CPU侧存在精度先提升后下降的操作,因此这类API的有效性的参考价值有限。 + +3. run ut过程中出现报错:ERROR:Got unsupported ScalarType BFloat16 + + 答:请使用最新版本的工具。 + +4. Dropout算子,CPU和NPU的随机应该不一样,为什么结果比对是一致的? + + 答:这个结果是正常的,工具对该算子有特殊处理,只判定位置为0的位置比例大约和设定p值相当。 + +5. 为什么浮点型数据bench和CPU的dtype不一致? + + 答:对于fp16的数据,CPU会上升一个精度fp32去计算,这是和算子那边对齐的精度结论,CPU用更高精度去计算会更接近真实值。 + +6. 添加预检工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。 + + 答:注释工具目录api_accuracy_checker/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。 + +7. 添加预检工具后F.gelu触发ValueError报错:`activation_func must be F.gelu`等。 + + 答:注释工具目录api_accuracy_checker/hook_module/support_wrap_ops.yaml文件中functional:下的的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置API也可以考虑根据报错堆栈信息注释引发报错的类型检查。 + +8. 
添加预检工具后触发AsStrided算子相关的报错,或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。 + + 答:注释工具目录api_accuracy_checker/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。 + +9. Tensor 魔法函数具体对应什么操作? + + 答: + + | Tensor魔法函数 | 具体操作 | + | --------------- | ---------------- | + | `__add__` | + | + | `__and__` | & | + | `__bool__` | 返回Tensor布尔值 | + | `__div__` | / | + | `__eq__` | == | + | `__ge__` | >= | + | `__gt__` | > | + | `__iadd__` | += | + | `__iand__` | &= | + | `__idiv__` | /= | + | `__ifloordiv__` | //= | + | `__ilshift__` | <<= | + | `__imod__` | %= | + | `__imul__` | *= | + | `__ior__` | \|= | + | `__irshift__` | >>= | + | `__isub__` | -= | + | `__ixor__` | ^= | + | `__lshift__` | << | + | `__matmul__` | 矩阵乘法 | + | `__mod__` | % | + | `__mul__` | * | + | `__nonzero__` | 同`__bool__` | + | `__or__` | \| | + | `__radd__` | +(反向) | + | `__rmul__` | *(反向) | + | `__rshift__` | >> | + | `__sub__` | - | + | `__truediv__` | 同`__div__` | + | `__xor__` | ^ | + +# 精度比对工具 + +## 工具使用 + +### dump指定融合算子 + +dump指定操作当前支持dump指定融合算子的输入输出,需要在att/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/hook_module/support_wrap_ops.yaml中添加,比如以下代码段调用的softmax融合算子 + +``` +def npu_forward_fused_softmax(self, input_, mask): + resl = torch_npu.npu_scaled_masked_softmax(input_, mask, self.scale, False) + return resl +``` + +如果需要dump其中调用的npu_scaled_masked_softmax算子的输入输出信息,需要在support_wrap_ops.yaml中的torch_npu: 中自行添加该融合算子即可: + +``` +- npu_scaled_masked_softmax +``` + +(npu_scaled_masked_softmax融合算子工具已支持dump,本例仅供参考) + +## 常见问题 + +### 1. 在同一个目录多次执行dump会冲突吗? + +会,同一个目录多次dump,会覆盖上一次结果,可以使用dump_tag参数修改dump目录名称。 + +### 2. 如何dump算子级的数据? + +需要配置level为L2模式。 + +### 3. 工具比对发现NPU和标杆数据的API无法完全对齐? + +torch版本和硬件差异属于正常情况。 + +## 异常情况 + +### 2. HCCL 报错: error code: EI0006 + +**故障现象** + +使用atat工具时,报错: error code: EI0006。 + +**故障原因** + +CANN软件版本较低导致不兼容。 + +**故障处理** + +升级新版CANN软件版本。 + +### 3. 
torch_npu._C._clear_overflow_npu() RuntimeError NPU error,error code is 107002 + +如果运行溢出检测功能遇到这个报错,采取以下解决方法: +如果是单卡运行,添加如下代码,0是卡号,选择自己空闲的卡号。 + +``` +torch.npu.set_device('npu:0') +``` + +如果多卡运行,请在代码中修改对应卡号,比如进程使用卡号为{rank}时可以添加如下代码: + +``` +torch.npu.set_device(f'npu:{rank}') +``` + +如果运行精度比对功能遇到这个报错,尝试安装最新版本的atat。 + +### 4. 运行compare.py时报错:json.decoder.JSONDecodeError: Extra data: line 1 column 37(char 36) + +遇到这种情况,先更新工具版本为最新版本,再重新运行训练代码dump数据,再用新的dump数据进行精度比对,如果最新版本未能解决问题,请联系atat工具开发人员。 + +### 5. AssertionError: assert set(WrapTensorOps) <= set(_tensor_ops) + +遇到这种情况,先检查安装的torch版本,建议先更新工具版本为2.2以上,版本2.2的工具支持torch1.8、1.11和2.0 + +### 6. dump得到的VF_lstm_99_forward_input.1.0.npy、VF_lstm_99_forward_input.1.1.npy类似的数据是否正常? + +带1.0/1.1/1.2后缀的npy是正常现象,例如当输入数据为[[tensor1, tensor2, tensor3]]会生成这样的后缀。 + +### 8. 进行compare报错:The current file contains stack information, please turn on the stack_mode + +在比对脚本中,设置stack_mode=True,例如: + +``` +from ptdbg_ascend import * +dump_result_param={ +"npu_pkl_path": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl", +"bench_pkl_path": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump.pkl", +"npu_dump_data_dir": "./npu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump", +"bench_dump_data_dir": "./gpu_dump/ptdbg_dump_v2.0/rank0/api_stack_dump", +"is_print_compare_log": True +} +compare(dump_result_param, "./output", stack_mode=True) +``` + +### 9. dump指定反向API的kernel级别的数据报错:NameError:name 'torch_npu' is not defined + +- 如果是npu环境,请安装torch_npu; +- 如果是gpu环境,暂不支持dump指定API的ACL级别的数据 + +### 10. 配置dump_path后,使用工具报错:[ERROR]The file path /home/xxx/dump contains special characters + +- 请检查你设置的dump绝对路径是否包含特殊字符,确保路径名只包含大小写字母、数字、下划线、斜杠、点和短横线 +- 注意,如果你执行脚本的路径为/home/abc++/,你设置的dump_path="./dump",工具实际校验的路径为绝对路径/home/abc++/dump,++为特殊字符,会引发本条报错 + +### 11. 
无法dump matmul权重的反向梯度数据 + +- matmul期望的输入是二维,当输入不是二维时,会将输入通过view操作展成二维,再进行matmul运算,因此在反向求导时,backward_hook能拿到的是UnsafeViewBackward这步操作里面数据的梯度信息,取不到MmBackward这步操作里面数据的梯度信息,即权重的反向梯度数据。 +- 典型的例子有,当linear的输入不是二维,且无bias时,会调用output = input.matmul(weight.t()),因此拿不到linear层的weight的反向梯度数据。 + +### 12. dump.json文件中的某些api的dtype类型为float16,但是读取此api的npy文件显示的dtype类型为float32 + +- atat工具在dump数据时需要将原始数据从npu to cpu上再转换为numpy类型,npu to cpu的逻辑和gpu to cpu是保持一致的,都存在dtype可能从float16变为float32类型的情况,如果出现dtype不一致的问题,最终dump数据的dtype以pkl文件为准。 + +### 13. 使用dataloader后raise异常Exception: ptdbg: exit after iteration [x, x, x] + +- 正常现象,dataloader通过raise结束程序,堆栈信息可忽略。 + +### 14. 添加atat工具后截取操作报错:`IndexError: too many indices for tensor of dimension x` 或 `TypeError: len() of a 0-d tensor`。 + +- 注释工具目录atat/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- __getitem__`,工具会跳过dump该API。如果是需要dump的关键位置api也可以考虑根据报错堆栈信息注释引发报错的类型检查。 + +### 15. 添加atat工具后F.gelu触发ValueError报错:`activation_func must be F.gelu`等。 + +- 注释工具目录atat/hook_module/support_wrap_ops.yaml文件中functional:下的的`- gelu`,工具会跳过dump该API。如果是需要dump的关键位置api也可以考虑根据报错堆栈信息注释引发报错的类型检查。 + +### 16. 
添加atat工具后触发AsStrided算子相关的报错,或者编译相关的报错,如:`Failed to compile Op [AsStrided]`。 + +- 注释工具目录atat/hook_module/support_wrap_ops.yaml文件中Tensor:下的`- t`和`- transpose`。 diff --git a/debug/accuracy_tools/atat/pytorch/doc/api_accuracy_checker.md b/debug/accuracy_tools/atat/pytorch/doc/api_accuracy_checker.md new file mode 100644 index 0000000000000000000000000000000000000000..e827ac266afe8b7f5110bd4d03a885f4bd30669b --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/api_accuracy_checker.md @@ -0,0 +1,278 @@ +# **精度预检工具** + +## 简介 + +精度预检工具通过扫描昇腾NPU上用户训练模型中所有API,输出精度情况的诊断和分析。工具通过dump模型中所有的API前反向信息;构造相应的API单元测试,将NPU输出与标杆(CPU高精度)比对,从而计算对应的精度指标,该过程称为run_ut;将NPU环境下dump的预检数据拷贝至GPU环境,同样执行run_ut;最后通过**新精度标准比对法**将NPU和GPU的预检结果进行比对,从而找出NPU中存在精度问题的API。 + +**新精度标准比对法**:依据新精度标准,对不同的API采取不同的比对算法进行比对(包括绝对阈值法,标杆比对法、二进制一致法、ULP误差比对法和双千指标法),最终给定预检判定结果。 + +**真实数据模式**:精度预检工具支持随机生成模式和真实数据模式,即在预检dump时可以选择由工具构造随机数进行输入获得dump数据或选择获取真实输入数据进行预检dump操作;随机生成模式执行效率高,可以快速获得结果,但数据精度低,只能大致判断精度问题;真实数据模式执行效率略低于随机生成模式,但是数据精度高,可以准确判断精度问题。 + +**工具支持PyTorch版本**:1.11.0/2.0/2.1/2.2。 + +**工具特性** + +- 落盘数据小。 +- 支持随机生成模式和真实数据模式。 +- 单API测试,排除整网中的累计误差问题。 + +## 预检流程 + +精度预检操作流程如下: + +1. 在NPU和GPU环境下分别安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 +2. 在NPU训练脚本内添加atat工具dump接口PrecisionDebugger采集待预检数据。详见《[精度数据采集](./dump.md)》。 +3. 将NPU环境下dump的预检数据拷贝至GPU环境。 +4. 在NPU和GPU环境下分别执行run_ut,生成结果用于最终api_precision_compare操作的输入。详见“**run_ut预检操作**”。 +5. 将NPU和GPU执行run_ut生成的`accuracy_checking_details_{timestamp}.csv`结果文件拷贝至同一环境下。 +6. 运行api_precision_compare.py,输出结果为预检操作的最终结果。详见“**预检结果比对**”。 + +## 预检操作 + +### run_ut预检操作 + +完成待预检数据采集后,仅仅获取了API的输入数据,为了得到NPU vs CPU高精度(标杆)的预检比对结果和GPU vs CPU高精度(标杆)的预检比对结果,还需要进行run_ut操作。 + +run_ut预检操作包括如下场景: + +- 使用run_ut.py执行预检:run_ut.py适用于数据量较小的单卡场景。 +- 使用multi_run_ut.py执行多线程预检:multi_run_ut.py适用于数据量较大的大模型场景。 + +#### 使用run_ut.py执行预检 + +1. 
将API信息输入给run_ut模块运行精度检测并比对,运行如下命令: + + ```bash + atat -f pytorch run_ut -api_info ./dump.json + ``` + + 某些场景下(如推理),可以不指定backward_info_0.json,不影响预检功能。 + + | 参数名称 | 说明 | 是否必选 | + | ---------------------------- | ------------------------------------------------------------ | ---------------------------------- | + | -api_info或--api_info_file | 指定API信息文件dump.json。 | 是 | + | -save_error_data | 保存精度未达标的API输入输出数据。 | 否 | + | -o或--out_path | 指定run_ut执行结果存盘路径,默认“./”(相对于run_ut的路径)。 | 否 | + | -j或--jit_compile | 开启jit编译。 | 否 | + | -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0。 | 否 | + | -csv_path或--result_csv_path | 指定本次运行中断时生成的`accuracy_checking_result_{timestamp}.csv`文件路径,执行run_ut中断时,若想从中断处继续执行,配置此参数即可。需要指定为上次中断的`accuracy_checking_result_{timestamp}.csv`文件。详见“**断点续检**”。 | run_ut操作中断后继续执行场景下必选 | + | -f或--filter_api | 过滤模型中除最大值和最小值以外其他参数和结构相同的API。适用于模型较大且重复API较多的场景。 | 否 | + + run_ut执行结果包括`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`两个文件。`accuracy_checking_result_{timestamp}.csv`是API粒度的,标明每个API是否通过测试。建议用户先查看`accuracy_checking_result_{timestamp}.csv`文件,对于其中没有通过测试的或者特定感兴趣的API,根据其API name字段在`accuracy_checking_details_{timestamp}.csv`中查询其各个输出的达标情况以及比较指标。详细介绍请参见“**预检结果**”。 + +2. 
(可选)如果需要保存比对不达标的输入和输出数据,可以在run_ut执行命令结尾添加-save_error_data,例如: + + ```bash + atat -f pytorch run_ut -api_info ./dump.json -save_error_data + ``` + + 数据默认会存盘到'./ut_error_data{timestamp}'路径下(相对于启动run_ut的路径),有需要的话,用户可以通过修改att/debug/accuracy_tools/api_accuracy_checker目录下,config.yaml文件的error_data_path参数来配置保存路径,详见“config.yaml文件说明”。 + +#### 使用multi_run_ut.py执行多线程预检 + +multi_run_ut.py脚本,可以并行执行多个run_ut操作,从而降低预检耗时。 + +命令示例如下: + +```bash +atat -f pytorch multi_run-ut -api_info ./dump.json -n 32 -d 0 1 2 3 +``` + +某些场景下(如推理),可以不指定backward_info_0.json,不影响预检功能。 + +| 参数名称 | 说明 | 是否必选 | +| ---------------------------- | ------------------------------------------------------------ | ---------------------------------- | +| -api_info或--api_info_file | 指定API信息文件dump.json。 | 是 | +| -save_error_data | 保存精度未达标的API输入输出数据。 | 否 | +| -o或--out_path | 指定run_ut执行结果存盘路径,默认“./”(相对于run_ut的路径)。 | 否 | +| -j或--jit_compile | 开启jit编译。 | 否 | +| -n | 同时执行run_ut线程的数量,默认为8,最大支持64,但每个Device最大支持8个线程,当指定多个线程和多个Device时,则线程数在每张卡上均分。 | 否 | +| -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0,支持同时指定0~7,共8个Device。 | 否 | +| -csv_path或--result_csv_path | 指定本次运行中断时生成的`accuracy_checking_result_{timestamp}.csv`文件路径,执行run_ut中断时,若想从中断处继续执行,配置此参数即可。需要指定为上次中断的`accuracy_checking_result_{timestamp}.csv`文件。详见“**断点续检**”。 | run_ut操作中断后继续执行场景下必选 | +| -f或--filter_api | 过滤模型中除最大值和最小值以外其他参数和结构相同的API。适用于模型较大且重复API较多的场景。 | 否 | + +#### 断点续检 + +精度预检run_ut过程中,若因环境、数据量过大等原因导致预检进程中断,那么当用户解决这些问题后,重新执行run_ut操作,可以通过断点续检操作继续前面未完成的预检,会在-csv_path指定的`accuracy_checking_result_{timestamp}.csv`文件以及对应的`accuracy_checking_details_{timestamp}.csv`文件中继续写入后续的结果,不会重新创建结果文件。 + +须指定为上次预检中断的`accuracy_checking_result_{timestamp}.csv`文件。请勿修改`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`文件名,包括时间戳,否则断点续检会因无法识别到文件名而失败。 + +断点续检操作通过如下命令执行: + +```bash +atat -f pytorch run_ut -api_info ./dump.json -csv_path /home/xxx/ut/accuracy_checking_result_{timestamp}.csv +``` + +#### API预检白名单 + +run_ut过程支持API预检白名单,操作方式如下: + 
+修改att/debug/accuracy_tools/api_accuracy_checker目录下config.yaml文件的white_list参数,配置需要预检的API名称,详见“config.yaml文件说明”。 + +### config.yaml文件说明 + +config.yaml文件可以通过配置参数来控制dump和run_ut操作的真实数据模式以及白名单等功能。 + +文件路径为:att/debug/accuracy_tools/atat/pytorch/api_accuracy_checker/config.yaml + +| 参数名称 | 说明 | 是否必选 | +| ----------------- | ------------------------------------------------------------ | -------- | +| dump_path | 设置dump路径,默认为当前目录。若指定目录不存在,则自动创建。 | 否 | +| real_data | 真实数据模式,可取值True或False,默认为False,表示随机数据模式,配置为True后开启真实数据模式,dump信息增加forward_real_data和backward_real_data目录,目录下保存每个API输入的具体数值。 | 否 | +| enable_dataloader | 自动dump数据开关,可取值True(开启)、False(关闭),默认关闭。 | 否 | +| target_iter | 指定dump某个step的数据,默认为[1],须指定为训练脚本中存在的step。target_iter为list格式,可配置逐个step,例如:target_iter=[0,1,2];也可以配置step范围,例如:target_iter=list(range(0,9)),表示dump第0到第8个step。 | 否 | +| white_list | API dump白名单,指定dump具体API数据,也可以直接配置预检的API白名单,详细请参见“**API预检白名单**”。参数示例:white_list=["conv1d", "conv2d"]。默认未配置白名单,即dump全量API数据。 | 否 | +| error_data_path | 配置保存精度未达标的API输入输出数据路径。 | 否 | +| jit_compile | 开启jit编译。 | 否 | +| precision | 浮点数表示位数,默认取小数点后14位。 | 否 | + +## 预检结果 + +精度预检生成的`accuracy_checking_result_{timestamp}.csv`和`accuracy_checking_details_{timestamp}.csv`文件示例如下: + +可以通过先查看`accuracy_checking_result_{timestamp}.csv`文件的Forward Test Success和Backward Test Success,判断是否存在未通过测试的API,再查看`accuracy_checking_details_{timestamp}.csv`文件的API详细达标情况,API达标情况介绍请参见“**API预检指标**”。 + +`accuracy_checking_result_{timestamp}.csv` + +![accuracy_checking_result](img/accuracy_checking_result.png) + +| 字段 | 含义 | +| --------------------- | ------------------------------------------------------------ | +| API name | API名称。 | +| Forward Test Success | 前向API是否通过测试,pass为通过,warning为待观察,error为错误。 | +| Backward Test Success | 反向API是否通过测试,pass为通过,warning为待观察,error为错误,如果是空白的话代表该API没有反向输出。 | +| Message | 提示信息。 | + +该结果为中间结果,仅作为参考,建议完成“**预检结果比对**”后查看比对结果。该结果后续将会删除。 + +Forward Test Success和Backward Test 
Success是否通过测试是由`accuracy_checking_details_{timestamp}.csv`中的余弦相似度、最大绝对误差、双百双千双万指标判定结果决定的。 + +需要注意的是`accuracy_checking_details_{timestamp}.csv`中可能存在一个API的前向(反向)有多个输出,那么每个输出记录一行,而在`accuracy_checking_result_{timestamp}.csv`中的结果需要该API的所有结果均为pass才能标记为pass,只要存在一个error则标记error,仅存在waring和pass且不存在error标记waring。 + +`accuracy_checking_details_{timestamp}.csv` + +![accuracy_checking_details](img/accuracy_checking_details.png) + +| 字段 | 含义 | +| ------------------- | ------------------------------------------------------------ | +| API name | NPU或GPU下的API名称。 | +| Bench Dtype | 标杆数据的API数据类型。 | +| DEVICE Dtype | NPU或GPU数据的API数据类型。 | +| Shape | API的Shape信息。 | +| 余弦相似度 | NPU或GPU数据与标杆数据的余弦相似度。 | +| 最大绝对误差 | NPU或GPU数据与标杆数据的最大绝对误差。 | +| 双百指标 | 双百精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于百分之一的个数占总元素个数的比例。测试通过标准为相对误差大于百分之一的个数占总元素个数的比例小于百分之一。 | +| 双千指标 | 双千精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于千分之一的个数占总元素个数的比例。测试通过标准为相对误差大于千分之一的个数占总元素个数的比例小于千分之一。 | +| 双万指标 | 双万精度指标。是指NPU或GPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于万分之一的个数占总元素个数的比例。测试通过标准为相对误差大于万分之一的个数占总元素个数的比例小于万分之一。 | +| 二进制一致错误率 | NPU或GPU数据中每个Tensor精度不一致的数值的数量与Tensor中数值数量的比值。只有数据是builtin类型(bool、int、float、str)、torch.bool和torch的int类型或者在新精度标准中使用二进制一致算法进行比对的API才会展示。 | +| 误差均衡性 | NPU或GPU数据与标杆数据精度差的上下浮动情况。 | +| 均方根误差 | NPU或GPU数据与标杆数据的均方根误差。 | +| 小值域错误占比 | NPU或GPU Tensor中与标杆的绝对误差大于错误阈值的小值在小值域(小值的总数量)中的占比。判断为小值以及绝对误差的错误阈值见“**小值域阈值**”。 | +| 相对误差最大值 | NPU或GPU数据与标杆数据相对误差的最大值。 | +| 相对误差平均值 | NPU或GPU数据与标杆数据相对误差的平均值。 | +| inf/nan错误率 | NPU与标杆inf/nan计算不一致的元素个数占总元素的个数比例。 | +| 相对误差错误率 | NPU与标杆的正常值计算相对误差,其大于错误阈值的元素个数占正常值元素个数的比例。 | +| 绝对误差错误率 | NPU与标杆的小值计算绝对误差,其大于错误阈值的元素个数占小值元素个数的比例。 | +| ULP误差最大值 | NPU或GPU数据与标杆数据ULP误差的最大值(取绝对值后)。 | +| ULP误差平均值 | NPU或GPU数据与标杆数据ULP误差的平均值(取绝对值后)。 | +| ULP误差大于阈值占比 | NPU或GPU数据与标杆数据的ULP误差(取绝对值后)大于阈值(当NPU或GPU数据类型为float16或bfloat16时,阈值为1;当NPU或GPU数据类型为float32时,阈值为32)的元素个数占总元素的个数比例。 | +| Status | API预检通过状态,pass表示通过测试,error表示未通过,warning表示测试未通过双千或双万精度指标,SKIP表示该API的某个参数的反向不要计算梯度,所以没有任何计算过程,其他信息均为空。 | +| message | 提示信息。 | + +### 小值域阈值 + 
+判定为小值的阈值为: + +- torch.float32:e-6 +- torch.float16:e-3 +- torch.bfloat16:e-3 + +小值域的绝对误差阈值为: + +- torch.float32:e-9 +- torch.float16:e-5 +- torch.bfloat16:e-5 + +### API预检指标 + +API预检指标是通过对`accuracy_checking_details_{timestamp}.csv`中的余弦相似度、最大绝对误差双百、双千、双万精度指标的数值进行判断,得出该API是否符合精度标准的参考指标。 + +API预检通过测试,则在`accuracy_checking_details_{timestamp}.csv`文件中的“Status”列标记“pass”,否则标记“error”或“warning”,详细规则如下: + +1. 余弦相似度 > 0.99:≤ 0.99为不达标,标记“error”,> 0.99达标,进行下一步; +2. 最大绝对误差 < 0.001:< 0.001达标,标记“pass”,≥ 0.001为不达标,进行下一步; +3. 双百、双千、双万精度指标: + - 对于float16和bfloat16数据:双百指标不通过,标记“error”;双百指标通过,双千指标不通过,标记“warning”;双百、双千指标均通过,标记“pass”。 + - 对于float32和float64数据:双千指标不通过,标记“error”;双千指标通过,双万指标不通过,标记“warning”;双千、双万指标均通过,标记“pass”。 + +4. 在`accuracy_checking_result_{timestamp}.csv`中以“Forward Test Success”和“Backward Test Success”字段统计该算子前向反向输出的测试结果,对于标记“pass”的算子,则在`accuracy_checking_result_{timestamp}.csv`中标记“TRUE”表示测试通过,对于标记“error”或“warning”的算子,则在`accuracy_checking_result_{timestamp}.csv`中标记“FALSE”表示测试不通过。由于一个算子可能有多个前向或反向的输入或输出,那么该类算子的输入或输出中必须全为“pass”,才能在`accuracy_checking_result_{timestamp}.csv`中标记“TRUE”,只要有一个输入或输出标记“error”或“warning”,那么在`accuracy_checking_result_{timestamp}.csv`中标记“FALSE”。 + +## 预检结果比对 + +需要同时获取NPU和GPU环境下run_ut操作的预检结果`accuracy_checking_details_{timestamp}.csv`文件。执行如下命令进行NPU和GPU预检结果的比对: + +```bash +atat -f pytorch api_precision_compare -npu /home/xxx/npu/accuracy_checking_details_{timestamp}.csv -gpu /home/xxx/gpu/accuracy_checking_details_{timestamp}.csv -o /home/xxx/ +``` + +| 参数名称 | 说明 | 是否必选 | +| -------------------- | ------------------------------------------------------------ | -------- | +| -npu或--npu_csv_path | NPU预检结果`accuracy_checking_details_{timestamp}.csv`文件路径。默认从当前目录下识别该文件。 | 否 | +| -gpu或--gpu_csv_path | GPU预检结果`accuracy_checking_details_{timestamp}.csv`文件路径。默认从当前目录下识别该文件。 | 否 | +| -o或--out_path | 指定api_precision_compare.py执行结果存盘路径,默认为当前目录。 | 否 | + +执行完成后输出`api_precision_compare_result_{timestamp}.csv`和`api_precision_compare_details_{timestamp}.csv`文件。文件示例如下: + 
+可以通过先查看`api_precision_compare_result_{timestamp}.csv`文件的Forward Test Success和Backward Test Success,判断是否存在未通过测试的API,再查看`api_precision_compare_details_{timestamp}.csv`文件的API详细达标情况。 + +`api_precision_compare_result_{timestamp}.csv` + +![api_precision_compare_result](img/api_precision_compare_result.png) + +| 字段 | 含义 | +| --------------------- | ------------------------------------------------------------ | +| API name | API名称。 | +| Forward Test Success | 前向API是否通过测试,pass为通过,warning为待观察,error为错误,skip表示该API的数据类型不支持使用新精度标准进行比对,如float64。 | +| Backward Test Success | 反向API是否通过测试,pass为通过,warning为待观察,error为错误,如果是空白的话代表该API没有反向输出,skip表示该API的数据类型不支持使用新精度标准进行比对,如float64。 | +| Message | 提示信息。 | + +Forward Test Success和Backward Test Success是否通过测试是由`api_precision_compare_details_{timestamp}.csv`中的各个指标判定结果决定的。需要注意的是`api_precision_compare_details_{timestamp}.csv`中可能存在一个API的前向(反向)有多个输出,那么每个输出记录一行,而在`api_precision_compare_result_{timestamp}.csv`中的结果需要该API的所有结果均为pass才能标记为pass,只要存在一个error则标记error,仅存在warning和pass且不存在error标记warning。 + +`api_precision_compare_details_{timestamp}.csv` + +![api_precision_compare_details](img/api_precision_compare_details.png) + +| 字段 | 含义 | +| ------------------------ | ------------------------------------------------------------ | +| API name | NPU或GPU下的API名称。 | +| 小值域错误比值 | NPU与CPU的小值域的错误比率/GPU与CPU的小值域的错误比率。标杆比对法指标。 | +| 小值域错误判定结果 | 小值域错误比值小于等于1标记为pass,1~2之间标记为waring,大于2标记为error。 | +| 均方根误差比值 | NPU与CPU的均方根误差/GPU与CPU的均方根误差。标杆比对法指标。 | +| 均方根误差判定结果 | 均方根误差比值小于等于1标记为pass,1~2之间标记为waring,大于2标记为error。 | +| 相对误差最大值比值 | NPU与CPU的相对误差最大值/GPU与CPU的相对误差最大值。标杆比对法指标。 | +| 相对误差最大值判定结果 | 相对误差最大值比值小于等于1标记为pass,1~10之间标记为waring,大于10标记为error。 | +| 相对误差平均值比值 | NPU与CPU的相对误差的平均值/GPU与CPU的相对误差的平均值。标杆比对法指标。 | +| 相对误差平均值判定结果 | 相对误差平均值比值小于等于1标记为pass,1~2之间标记为waring,大于2标记为error。 | +| 误差均衡性比值 | NPU与CPU的误差均衡性/GPU与CPU的误差均衡性。标杆比对法指标。 | +| 误差均衡性判定结果 | 误差均衡性比值小于等于1标记为pass,1~2之间标记为waring,大于2标记为error。该字段暂不参与api_precision_compare_result的结果判定。 | +| inf/nan错误率 | 
NPU与标杆inf/nan计算不一致的元素个数占总元素的个数比例。绝对阈值法指标。 | +| inf/nan判定结果 | inf/nan错误率判定结果,等于0标记为pass,其余情况标记为error。 | +| 相对误差错误率 | NPU与标杆的正常值计算相对误差,其大于错误阈值的元素个数占正常值元素个数的比例。绝对阈值法指标。 | +| 相对误差判定结果 | 相对误差错误率判定结果,等于0标记为pass,其余情况标记为error。 | +| 绝对误差错误率 | NPU与标杆的小值计算绝对误差,其大于错误阈值的元素个数占小值元素个数的比例。绝对阈值法指标。 | +| 绝对误差判定结果 | 绝对误差错误率判定结果,等于0标记为pass,其余情况标记为error。 | +| 二进制一致错误率 | NPU或GPU数据中每个Tensor精度不一致的数值的数量与Tensor中数值数量的比值。只有数据是builtin类型(bool、int、float、str)、torch.bool和torch的int类型或者在新精度标准中使用二进制一致算法进行比对的API才会展示。二进制一致法指标。 | +| 二进制一致错误率判定结果 | 二进制一致错误率判定结果,等于0标记为pass,其余情况标记为error。 | +| ULP误差平均值 | NPU数据与标杆数据ULP误差的平均值(取绝对值后)。ULP误差比对法指标。 | +| ULP误差大于阈值占比 | NPU数据与标杆数据的ULP误差(取绝对值后)大于阈值(当NPU数据类型为float16或bfloat16时,阈值为1;当NPU数据类型为float32时,阈值为32)的元素个数占总元素的个数比例。ULP误差比对法指标。 | +| ULP误差大于阈值占比比值 | NPU与CPU的ULP误差大于阈值占比/GPU与CPU的ULP误差大于阈值占比。ULP误差比对法指标。 | +| ULP误差判定结果 | ULP误差判定结果。
当NPU或GPU数据类型是float16或bfloat16时,以下两条标准满足其一标记为pass,否则标记为error:
NPU ULP误差大于阈值占比小于0.001;
NPU ULP误差大于阈值占比小于GPU ULP误差大于阈值占比。
当NPU或GPU数据类型是float32时,以下三条标准满足其一标记为pass,否则标记为error:
NPU ULP误差平均值小于64;
NPU ULP误差大于阈值占比小于0.05;
NPU ULP误差大于阈值占比小于GPU ULP误差大于阈值占比。 | +| 双千指标 | 双千精度指标。是指NPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差小于千分之一的个数占总元素个数的比例。测试通过标准为相对误差大于千分之一的个数占总元素个数的比例小于千分之一。仅conv1d和conv2d使用该指标。双千指标法指标。 | +| 双千指标判定结果 | 双千指标判定结果。双千指标大于0.999标记为pass,否则标记为error。 | +| 比对结果 | 综合所有指标的最终结果。如果比对指标中有error,则标记为error;有warning,则标记为warning;否则标记为pass。 | +| 比对算法 | API使用的比对算法,为标杆比对法、二进制一致法、绝对阈值法和ULP误差比对法中的一种。 | +| Message | 提示信息。当前提示该API比对结果为error或warning时对应不符合标准的指标。 | + +# FAQ + +[FAQ](./FAQ.md) diff --git "a/debug/accuracy_tools/atat/pytorch/doc/atat\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" "b/debug/accuracy_tools/atat/pytorch/doc/atat\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" new file mode 100644 index 0000000000000000000000000000000000000000..ed175ff30172a54d8d4868097599ab8518b45e4f --- /dev/null +++ "b/debug/accuracy_tools/atat/pytorch/doc/atat\347\262\276\345\272\246\345\267\245\345\205\267\346\225\260\346\215\256dump\346\240\207\345\207\206\346\200\247\350\203\275\345\237\272\347\272\277\346\212\245\345\221\212.md" @@ -0,0 +1,182 @@ +# atat精度工具标准性能基线报告 + +## 环境信息 + +NPU:Atlas A2 训练系列产品 + +CPU: + +![输入图片说明](img/cpu_info.png) + +Torch:2.1.0 + +CANN:8.0.T2 + +除上述环境信息影响性能外,API的数量、种类以及Shape都会对性能产生影响,因此本次选取指定网络进行测试,为了避免算子编译耗时的影响,所有模型运行时都开启二进制,模型中添加torch.npu.set_compile_mode(jit_compile=False),所有模型都dump第二个step的数据。 + +## 模型信息和性能基线 + +大模型在使用atat工具dump数据时,建议先简化模型层数,减少dump数据量。 + +以下场景的性能基线测试数据均为多次测试后取平均值,因此实际运行时性能数据可能会根据环境状态稍有浮动。 + +### 工具配置信息 + +dump全部API级别输入输出数据以及相应堆栈信息,配置如下: + +```python +debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump") +debugger.configure_hook(mode="api_stack") +``` + +多卡指定rank0 dump,配置如下: + +```python +debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump",rank=0) 
+debugger.configure_hook(mode="api_stack") +``` + +dump保存API统计信息的pkl文件,配置如下: + +```python +debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump") +debugger.configure_hook(mode="api_stack", summary_only=True) +``` + +### YOLOV5s + +单卡 + +主要数据类型:FLOAT32 + +启动命令参数:python3 train_ptdbg.py --data ./data/coco.yaml --cfg yolov5s.yaml --weights '' --epochs 1 --batch-size 8 --device 1 + +dump保存API统计信息的pkl文件耗时:**7s** + +进行单卡dump全部API级别输入输出数据以及相应堆栈信息耗时:**11s** + +- dump存盘的API numpy文件大小:13G + + ![输入图片说明](img/YOLOV5S_1.png) + +- api numpy文件数量:3009个 + + ![输入图片说明](img/YOLOV5S_2.png) + + + + +### GPT-3 + +#### NUM_LAYER:1 + +8卡 + +主要数据类型:FLOAT16 + +启动命令参数: + +``` +python3 -m torch.distributed.launch $DISTRIBUTED_ARGS ../../pretrain_gpt_ptdbg.py --num-layers 1 --hidden-size 12288 --num-attention-heads 24 --micro-batch-size 2 --global-batch-size 2 --seq-length 1024 --max-position-embeddings 1024 --train-iters 10 --lr-decay-iters 320000 --save $CHECKPOINT_PATH --load $CHECKPOINT_PATH --data-path $DATA_PATH --tensor-model-parallel-size 8 --use-distributed-optimizer --pipeline-model-parallel-size 8 --vocab-file gpt2-vocab.json --merge-file gpt2-merges.txt --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.375e-5 --lr-decay-style cosine --min-lr 0.375e-6 --weight-decay 0.1 --clip-grad 1.0 --lr-warmup-fraction .01 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 +--recompute-granularity full --recompute-method uniform --no-gradient-accumulation-fusion --log-interval 1 --save-interval 10000 --eval-interval 1000 --eval-iters 10 --fp16 +``` + +dump保存API统计信息的pkl文件耗时:**3.3s** + +进行8卡dump全部API级别输入输出数据以及相应堆栈信息耗时:**53s** + +- dump存盘的api numpy文件大小:145G + + ![输入图片说明](img/GPT-3_1.png) + +- API numpy文件数量:5130个 + + ![输入图片说明](img/GPT-3_2.png) + + +**经测试8卡同时写入磁盘已达到磁盘I/O上限,工具的dump速度取决于磁盘性能,本机环境多进程写入磁盘上限为3GB/秒左右,理论上保存145GB的数据需要50秒左右,如果dump的数据中包含许多的小文件,那么耗时将会更久。** + +指定rank0 dump耗时:**9s** + +- dump存盘的api numpy文件大小:19G + + ![输入图片说明](img/GPT-3_3.png) + +- api 
numpy文件数量:643个 + + ![输入图片说明](img/GPT-3_4.png) + + +#### NUM_LAYER:8 + +8卡 + +主要数据类型:FLOAT16 + +启动命令参数: + +``` +python3 -m torch.distributed.launch $DISTRIBUTED_ARGS ../../pretrain_gpt_ptdbg.py --num-layers 8 --hidden-size 12288 --num-attention-heads 24 --micro-batch-size 2 --global-batch-size 2 --seq-length 1024 --max-position-embeddings 1024 --train-iters 10 --lr-decay-iters 320000 --save $CHECKPOINT_PATH --load $CHECKPOINT_PATH --data-path $DATA_PATH --tensor-model-parallel-size 8 --use-distributed-optimizer --pipeline-model-parallel-size 8 --vocab-file gpt2-vocab.json --merge-file gpt2-merges.txt --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.375e-5 --lr-decay-style cosine --min-lr 0.375e-6 --weight-decay 0.1 --clip-grad 1.0 --lr-warmup-fraction .01 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --recompute-granularity full --recompute-method uniform --no-gradient-accumulation-fusion --log-interval 1 --save-interval 10000 --eval-interval 1000 --eval-iters 10 --fp16 +``` + +dump保存API统计信息的pkl文件耗时:**6.7s** + +进行8卡dump全部API级别输入输出数据以及相应堆栈信息耗时:**323s** + +- dump存盘的API numpy文件大小:878G + + ![输入图片说明](img/GPT-3_5.png) + +- API numpy文件数量:24002个 + + ![输入图片说明](img/GPT-3_6.png) + + +指定rank0 dump耗时:**47s** + +- dump存盘的API numpy文件大小:110G + + ![输入图片说明](img/GPT-3_7.png) + +- API numpy文件数量:3002个 + + ![输入图片说明](img/GPT-3_8.png) + + +### BLOOM-7B + +8卡 + +NUM_LAYER:1 + +主要数据类型:BFLOAT16 + +启动命令参数: + +``` +python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_llama.py --DDP-impl local --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --sequence-parallel --num-layers 1 --hidden-size 12288 --position-embedding-type rope --normalization RMSNorm --ffn-hidden-size 11008 --num-attention-heads 24 --attention-dropout 0.0 --hidden-dropout 0.0 --init-method-std 0.01 --micro-batch-size 2 --global-batch-size 2 --seq-length 1024 --max-position-embeddings 1024 --data-path $DATA_PATH --tokenizer-name-or-path $TOKENIZER_PATH 
--tokenizer-not-use-fast --split 100,0,0 --distributed-backend nccl --lr 1.25e-5 --min-lr 1.25e-6 --lr-decay-style cosine --weight-decay 1e-1 --clip-grad 1.0 --initial-loss-scale 65536.0 --adam-beta1 0.9 --adam-beta2 0.95 --log-interval 1 --load ${LOAD_CHECKPOINT_PATH} --save ${SAVE_CHECKPOINT_PATH} --save-interval 10000 --eval-interval 10000 --eval-iters 0 --use-fused-rotary-pos-emb --no-masked-softmax-fusion --no-load-optim --no-load-rng --train-iters 20 --lr-warmup-fraction 0.01 --mlp-layer-fusion --use-flash-attn --use-fused-rmsnorm --bf16 +``` + +dump保存API统计信息的pkl文件耗时:**3s** + +进行8卡dump全部API级别输入输出数据以及相应堆栈信息耗时:**61s** + +- dump存盘的API numpy文件大小:160G + + ![输入图片说明](img/BLOOM-7B_1.png) + +- API numpy文件数量:4924个 + + ![输入图片说明](img/BLOOM-7B_2.png) + + +指定rank0 dump耗时:**17s** + +- dump存盘的API numpy文件大小:20G + + ![输入图片说明](img/BLOOM-7B_3.png) + +- API numpy文件数量:633个 + + ![输入图片说明](img/BLOOM-7B_4.png) + diff --git a/debug/accuracy_tools/atat/pytorch/doc/dump.md b/debug/accuracy_tools/atat/pytorch/doc/dump.md new file mode 100644 index 0000000000000000000000000000000000000000..26cbcc7d73213f68203e62fcb5bfc306ce6c57c3 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/dump.md @@ -0,0 +1,165 @@ +# **精度数据采集** + +atat工具主要通过在训练脚本内添加dump接口并启动训练的方式来采集精度数据。 + +执行dump操作需要安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +## dump接口介绍 + +### PrecisionDebugger + +**功能说明** + +通过加载dump配置文件的方式来确定dump操作的详细配置。 + +可以在from atat.pytorch import PrecisionDebugger和模型初始化之间的任意位置添加该接口。 + +**原型** + +```Python +PrecisionDebugger(config_path=None, task=None, dump_path=None, level=None) +``` + +说明:上述参数除config_path外,其他参数均在[config.json](../../config)文件中可配,此处的参数优先级高于config.json文件中的配置,而config.json文件可以配置更多参数,若需要进行更多场景的精度数据dump,建议配置[config.json](../../config)文件。 + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ----------- | ------------------------------------------------------------ | -------- | +| config_path | 指定dump配置文件路径,String类型。参数示例:"./config.json"。未配置该路径时,默认使用../../config目录下的config.json文件的默认配置。 | 
否 | +| task | dump的任务类型,String类型。可取值"statistics"(仅dump API统计信息)、"tensor"(dump API统计信息和完全复刻整网的API运行情况的真实数据)、"overflow_check"(溢出检测),默认未配置,取"statistics",参数示例:task="tensor"。 | 否 | +| dump_path | 设置dump数据目录路径,String类型。参数示例:dump_path="./dump_path"。 | 是 | +| level | dump级别,根据不同级别dump不同数据,String类型。可取值:
"L0":dump module模块级精度数据,仅PyTorch场景支持”。
"L1":dump API级精度数据,默认值。
"L2":dump kernel级精度数据,仅MindSpore场景支持。
"mix":dump module模块级和API级精度数据。
配置示例:level="L1"。 | 否 | + +### start函数 + +**功能说明** + +启动函数。 + +在模型初始化之后的任意位置添加。 + +**原型** + +```Python +debugger.start(model) +``` + +该函数为类函数,可以使用debugger.start()也可以使用PrecisionDebugger.start()。 + +### stop函数 + +**功能说明** + +停止函数。 + +在**start**函数之后的任意位置添加。 + +**原型** + +```Python +debugger.stop() +``` + +该函数为类函数,可以使用debugger.stop()也可以使用PrecisionDebugger.stop()。 + +### step函数 + +**功能说明** + +结束标识。 + +在最后一个**stop**函数后或一个step结束的位置添加。 + +**原型** + +```Python +debugger.step() +``` + +该函数为类函数,可以使用debugger.step()也可以使用PrecisionDebugger.step()。 + +## 示例代码 + +```Python +from atat.pytorch import PrecisionDebugger +debugger = PrecisionDebugger(config_path="./config.json", dump_path="./dump_path") +# 请勿将以上初始化流程插入到循环代码中 + +# 模型初始化 +# 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() +debugger.start(model) + +# 需要dump的代码片段1 + +debugger.stop() +debugger.start(model) + +# 需要dump的代码片段2 + +debugger.stop() +debugger.step() +``` + +## dump结果文件介绍 + +训练结束后,工具将dump的数据保存在dump_path参数指定的目录下。 + +dump结果目录结构示例如下: + +```Python +├── dump_path +│ ├── step0 +│ | ├── rank0 +│ | │ ├── dump_tensor_data +| | | | ├── Tensor.permute.1.forward.pt +| | | | ├── MyModule.0.forward.input.pt # 开启模块级精度数据dump时存在模块级的dump数据文件 +| | | | ... +| | | | └── Fcuntion.linear.5.backward.output.pt +│ | | ├── dump.json # 保存前反向算子、算子的统计量信息或溢出算子信息。包含dump数据的API名称(命名格式为:`{api_type}_{api_name}_{API调用次数}_{前向反向}_{input/output}.{参数序号}`)、dtype、 shape、各数据的max、min、mean、L2norm统计信息以及当配置summary_mode="md5"时的md5数据。其中,“参数序号”表示该API下的第n个参数,例如1,则为第一个参数,若该参数为list格式,则根据list继续排序,例如1.1,表示该API的第1个参数的第1个子参数;L2norm表示2范数(平方根) +│ | | ├── stack.json # 算子调用栈信息 +│ | | └── construct.json # 分层分级结构 +│ | ├── rank1 +| | | ├── dump_tensor_data +| | | | └── ... +│ | | ├── dump.json +│ | | ├── stack.json +| | | └── construct.json +│ | ├── ... +│ | | +| | └── rank7 +│ ├── step1 +│ | ├── ... 
+│ ├── step2 +``` + +dump过程中,pt文件在对应算子或者模块被执行后就会落盘,而json文件则需要在正常执行PrecisionDebugger.stop()或set_dump_switch("OFF")后才会被落盘保存,异常的程序终止会保存终止前被执行算子的相关pt文件,但是不会生成json文件。 + +其中`dump_{version}`为默认命名,debugger方式dump不支持修改该文件夹名称;rank为设备上各卡的ID,每张卡上dump的数据会生成对应dump目录。 + +pt文件保存的前缀和PyTorch对应关系如下: + +| 前缀 | Torch模块 | +| ----------- | ------------------- | +| Tensor | torch.Tensor | +| Torch | torch | +| Functional | torch.nn.functional | +| NPU | NPU亲和算子 | +| VF | torch._VF | +| Aten | torch.ops.aten | +| Distributed | torch.distributed | + +## 工具支持的API列表 + +atat工具维护固定的API支持列表,若需要删除或增加dump的API,可以在atat/pytorch/hook_module/support_wrap_ops.yaml文件内手动修改,如下示例: + +```Python +functional: # functional为算子类别,找到对应的类别,在该类别下按照下列格式删除或添加API + - conv1d + - conv2d + - conv3d +``` + +# FAQ + +[FAQ](./FAQ.md) diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_1.png b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_1.png new file mode 100644 index 0000000000000000000000000000000000000000..3853626d6fab127915425238c76f16836a4cef0b Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_1.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_2.png b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_2.png new file mode 100644 index 0000000000000000000000000000000000000000..732abb496a4e0b171be8e21e70c09ea492383fe8 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_2.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_3.png b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_3.png new file mode 100644 index 0000000000000000000000000000000000000000..7f06074887e9a9df9b601dba6d26a45d924245d5 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_3.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_4.png b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_4.png new file mode 100644 index 
0000000000000000000000000000000000000000..b0bd9d40a5b9e414f7713c121fb2adb52d908bf6 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/BLOOM-7B_4.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_1.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_1.png new file mode 100644 index 0000000000000000000000000000000000000000..d633249c2ccd9b365c825374fe5472f33dd812f4 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_1.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_2.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_2.png new file mode 100644 index 0000000000000000000000000000000000000000..c4748ca479a85fe1857e2b599569714f4f8dfbd3 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_2.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_3.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_3.png new file mode 100644 index 0000000000000000000000000000000000000000..b20c34d943e9765f640cebb3820370c9e99b5527 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_3.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_4.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_4.png new file mode 100644 index 0000000000000000000000000000000000000000..1dbf6600e742d49e9022fe3696c9ff2adcdbaf39 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_4.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_5.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_5.png new file mode 100644 index 0000000000000000000000000000000000000000..5bf1c26d3d8734b14e003dcf1b26361ed300217d Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_5.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_6.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_6.png new file mode 100644 index 
0000000000000000000000000000000000000000..1c326caa7f4e17c924456461622eedaabcd18362 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_6.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_7.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_7.png new file mode 100644 index 0000000000000000000000000000000000000000..2ee73472850e27e18da472aa3a42f747f7239d7e Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_7.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_8.png b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_8.png new file mode 100644 index 0000000000000000000000000000000000000000..6bedfab396f306478df4f6ce8869b3ab65c92b5e Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/GPT-3_8.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_1.png b/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_1.png new file mode 100644 index 0000000000000000000000000000000000000000..791e49c178ef3d88bdbc7ca4eae2e78ffc35e250 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_1.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_2.png b/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_2.png new file mode 100644 index 0000000000000000000000000000000000000000..61357f3feb95b5f70d6317dbfef7b516c0492449 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/YOLOV5S_2.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_details.png b/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_details.png new file mode 100644 index 0000000000000000000000000000000000000000..6c39840f633743b3b1dd54dfa5307c0e82ba20b7 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_details.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_result.png 
b/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_result.png new file mode 100644 index 0000000000000000000000000000000000000000..7e81606cb3a53364bb069393882ef162c8ce454c Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/accuracy_checking_result.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_details.png b/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_details.png new file mode 100644 index 0000000000000000000000000000000000000000..b31ba52350989488df1245f4840886f2a7db4944 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_details.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_result.png b/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_result.png new file mode 100644 index 0000000000000000000000000000000000000000..8087d83c8035d2939bebc4ecd9f213c0810d88c7 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/api_precision_compare_result.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/auto_analyze_log.png b/debug/accuracy_tools/atat/pytorch/doc/img/auto_analyze_log.png new file mode 100644 index 0000000000000000000000000000000000000000..999b47f97ef5661316c7e61dbdc93c87996259f3 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/auto_analyze_log.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl.png b/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl.png new file mode 100644 index 0000000000000000000000000000000000000000..c64e9380c6d9c01bb2ad18c81e430ead0800bb7d Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl_md5.png.png b/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl_md5.png.png new file mode 100644 index 
0000000000000000000000000000000000000000..81ba1935e69218467b006f05dfffbe54f3f04cb4 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/compare_result_pkl_md5.png.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/cpu_info.png b/debug/accuracy_tools/atat/pytorch/doc/img/cpu_info.png new file mode 100644 index 0000000000000000000000000000000000000000..744d237e975e555160ecdc391810a3681d05252a Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/cpu_info.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/img/module_compare.png b/debug/accuracy_tools/atat/pytorch/doc/img/module_compare.png new file mode 100644 index 0000000000000000000000000000000000000000..2e1ea564eb191807034afd8aceac92b29b62a086 Binary files /dev/null and b/debug/accuracy_tools/atat/pytorch/doc/img/module_compare.png differ diff --git a/debug/accuracy_tools/atat/pytorch/doc/parse_tool.md b/debug/accuracy_tools/atat/pytorch/doc/parse_tool.md new file mode 100644 index 0000000000000000000000000000000000000000..23000912910e8f95b4cb74c7983961918bd9a513 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/parse_tool.md @@ -0,0 +1,286 @@ +# **数据解析工具** + +数据解析工具(parse_tool)提供命令行交互式界面,提供更多的数据解析功能并且展示结果。 + +使用场景:本工具主要用于精度比对前后两次NPU kernel层级dump数据的一致性。 + +## 进入parse交互式界面 + +安装atat工具后(详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节),可以通过使用命令 **atat -f pytorch parse** 进入交互式界面,如下所示: + +```bash +atat -f pytorch parse +Parse >>> +``` + +可在parse的界面中执行Shell命令,以及如下场景的相关解析命令: + +- 支持指定kernel层级算子数据比对。 +- 支持指定kernel层级算子数据转换及展示。 +- 支持交互式指定pkl文件中API对应dump数据查看。 +- 支持API进行可选层级比对和打印(统计级和像素级)。 + +Ctrl+C可以退出parse交互式界面。不退出parse交互式界面若需要执行非该界面下的内置Shell命令,且命令与parse交互式界面命令冲突时,非该界面命令需要使用run命令,在相关命令前加上run前缀,如下示例: + +```bash +atat -f pytorch parse +Parse >>> run vim cli.py +Parse >>> vim cli.py +``` + +以上各场景详细介绍请参见下文章节。 + +## kernel层级算子数据批量转换 + +本功能会将原有待比对dump数据目录下的dump数据按照算子名和时间戳进行梳理并分类,之后再将dump数据转换为npy文件。 + +依赖:CANN包中的msaccucmp工具,需要安装Ascend-CANN-toolkit,详见《[CANN 
软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)》。 + +输入以下比对命令进行数据转换。 + +```bash +cad -m my_dump_path [-out output_path] [-asc msaccucmp_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| -------- | ------------------------------------------------------------ | -------- | +| -m | 待转换kernel dump数据目录。需要指定到kernel dump数据的deviceid级目录。 | 是 | +| -out | 结果输出目录,须指定已存在的目录,默认为./parse_data/acl_batch_convert。未指定时保存在默认路径下,比对结束后会打印log提示输出结果存放路径。 | 否 | +| -asc | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py。 | 否 | + +**示例** + +``` +# 传入待比对数据目录 +Parse >>> cad -m /home/xxx/my_dump_path/20000124003856/0 +# 转换结果打印 +...... +╭──────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +# 转换前的dump文件 +│ SrcFile: /home/xxx/my_dump_path/20000124003856/0/272/TransData.trans_TransData_22.112.21.948645536672764 │ +# 转换后的npy文件 +│ - TransData.trans_TransData_22.112.21.948645536672764.output.0.npy │ +│ - TransData.trans_TransData_22.112.21.948645536672764.input.0.npy │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +...... +[INFO] The comparison result have been written to "./parse_data/acl_batch_convert". +``` + +输出结果: + +原dump数据目录: + +``` +├── /home/xxx/my_dump_path/20000124003856/0/ +│ ├── 272 +│ │ ├── {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp} +│ │ ... +│ ├── 512 +│ ... +``` + +转换后: + +``` +├── ./parse_data/acl_batch_convert/{timestamp} +│ ├── {op_name1} +│ │ ├── {timestamp1} +│ │ | ├── {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input/output}.{参数序号}.npy +│ │ | │ ... +│ │ ├── {timestamp2} +│ │ | ... +│ ├── {op_name2} +│ ├── ... 
+``` + +## kernel层级算子数据比对 + +本功能主要用于比对前后两次NPU kernel层级dump数据的一致性。 + +本功能支持批量比对,若需要进行批量比对,需要先将两份待比对的NPU kernel层级dump数据进行“**kernel层级算子数据批量转换**”,可以使两份数据更好的匹配;若直接进行dump数据的比对,建议只比对单个dump数据文件。 + +输入以下比对命令进行数据比对。 + +```bash +vc -m my_dump_path -g golden_dump_path [-out output_path] [-cmp_path msaccucmp_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| --------- | ------------------------------------------------------------ | -------- | +| -m | 待比对kernel dump数据目录。如果比对单个算子,需要指定到kernel dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | +| -g | 标杆kernel dump数据目录。如果比对单个算子,需要指定到kernel dump数据的model_id级目录;如果批量比对,则指定到cad转换后的timestamp级目录。 | 是 | +| -out | 结果输出目录,须指定已存在的目录,默认为./parse_data/acl_batch_comapre。未指定时保存在默认路径下,比对结束后会打印log提示输出结果存放路径。 | 否 | +| -cmp_path | 指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py | 否 | + +输出结果:batch_compare_{timestamp}.csv文件。 + +**示例** + +```bash +# 传入待比对数据目录以及标杆数据目录 +Parse >>> vc -m ./my_dump_path -g ./golden_data_path +[INFO]Compare result is saved in : parse_data/acl_batch_comapre/batch_compare_1707271118.csv +``` + +## kernel算子数据的npy转换 + +依赖:CANN包中的msaccucmp工具,需要安装Ascend-CANN-toolkit,详见《[CANN 软件安装指南](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F700%2Fenvdeployment%2Finstg%2Finstg_0001.html)》。 + +输入以下转换命令进行数据转换, 将kernel级别dump数据转为npy文件。 + +```bash +dc -n file_name/file_path [-f format] [-out output_path] +``` + +| 参数名称 | 说明 | 是否必选 | +| --------- | ------------------------------------------------------------ | -------- | +| -n | 需转换的dump数据文件或dump数据文件目录。 | 是 | +| -f | 开启format转换,指定该参数时需要配置format格式。当前内置的Format转换支持如下类型: FRACTAL_NZ转换NCHW FRACTAL_NZ转换成NHWC FRACTAL_NZ转换ND HWCN转换FRACTAL_Z HWCN转换成NCHW HWCN转换成NHWC NC1HWC0转换成HWCN NC1HWC0转换成NCHW NC1HWC0转换成NHWC NCHW转换成FRACTAL_Z NCHW转换成NHWC NHWC转换成FRACTAL_Z NHWC转换成HWCN NHWC转换成NCHW NDC1HWC0转换成NCDHW | 否 | +| -out | 结果输出目录。 | 否 | +| -cmp_path | 
指定msaccucmp路径,默认路径为:/usr/local/Ascend/ascend-toolkit/latest/tools/operator_cmp/compare/msaccucmp.py | 否 | + +- 输出结果:npy文件。 + +- 若指定-out参数需要用户传入输出路径,并且路径需要已存在。 + +- 若未指定输出目录, 则比对结束后将结果保存在默认目录 “./parse_data/convert_result”中,比对结束后会打印log提示输出结果存放路径及转换结果。 + +- 输入以下命令,展示npy数据统计信息。 + + ```bash + pt -n file_path + ``` + + | 参数名称 | 说明 | 是否必选 | + | -------- | ------------- | -------- | + | -n | npy文件路径。 | 是 | + + 打印统计信息:shape, dtype, max, min和mean。默认在npy文件路径下将该数据保存为txt文件。 + +**示例1** + +```bash +# 传入需转换的dump文件目录 +Parse >>> dc -n ./dump_data/ +...... +# 转换结果 +╭──────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ SrcFile: ./dump_data/ +│ - Add.fp32_vars_add_2fp32_vars_Relu_9.31.5.1636595794731103.input.0.npy │ +│ - Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.output.0.npy │ +│ - Add.fp32_vars_add_2fp32_vars_Relu_9.31.5.1636595794731103.input.1.npy │ +│ - Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.1.npy │ +│ - Add.fp32_vars_add_3fp32_vars_Relu_12.40.5.1636595794846124.input.1.npy │ +│ - Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.0.npy │ +│ - Add.fp32_vars_add_3fp32_vars_Relu_12.40.5.1636595794846124.input.0.npy │ +│ - Add.fp32_vars_add_2fp32_vars_Relu_9.31.5.1636595794731103.output.0.npy │ +│ - Add.fp32_vars_add_3fp32_vars_Relu_12.40.5.1636595794846124.output.0.npy │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + +**示例2** + +```bash +# 查看某个dump数据块的数据信息 +# 默认会将数据中的tensor保存成 txt +Parse >>> pt -n ./parse_data/dump_convert/Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.output.0.npy +...... 
+# 打印统计信息 +[Shape: (1, 16, 56, 56, 16)] [Dtype: float16] [Max: 452.0] [Min: -408.5] [Mean: -3.809] +Path: ./parse_data/dump_convert/Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.0.npy +TextFile:./parse_data/dump_convert/Add.fp32_vars_add_1fp32_vars_Relu_6.24.5.1636595794631347.input.0.npy.txt +``` + +## dump.json文件中指定API的dump数据信息查看(暂不支持) + +输入以下命令,解析并输出dump.json文件中指定API的统计信息。 + +```bash +pk -f pkl_path -n api_name +``` + +| 参数名称 | 说明 | 是否必选 | +| -------- | ----------------------- | -------- | +| -f | 指定dump.json文件路径。 | 是 | +| -n | 指定API名称。 | 是 | + +- 输出结果:打印统计信息(shape, dtype, max, min和mean)。 +- 若pkl文件中存在相应的堆栈信息,则会打印堆栈信息。 + +**示例** + +```bash +# 传入pkl文件及api名称 +Parse >>> pk -f ./torch_dump/xxx/rank0/dump.json -n Functional_conv2d_0_forward +...... +# 打印统计信息及堆栈(pkl文件不包含堆栈则不会打印堆栈) + +Statistic Info: + [Functional_conv2d_0_forward_input.0][dtype: torch.float32][shape: [2, 1, 2, 2]][max: 1.576936960220337][min: -0.9757485389709473][mean: 0.4961632490158081] + [Functional_conv2d_0_forward_input.1][dtype: torch.float32][shape: [2, 1, 2, 2]][max: 0.20064473152160645][min: -0.47102075815200806][mean: -0.20796933770179749] + [Functional_conv2d_0_forward_input.2][dtype: torch.float32][shape: [2]][max: 0.17380613088607788][min: -0.16853803396224976][mean: 0.0026340484619140625] + [Functional_conv2d_0_forward_output][dtype: torch.float32][shape: [2, 2, 1, 1]][max: 0.02364911139011383][min: -1.762906551361084][mean: -0.6710853576660156] +``` + +## API可选层级比对 + +输入以下命令, 进行统计级和像素级比对。 + +```bash +cn -m my_data*.npy -g golden*.npy [-p num] [-al atol] [-rl rtol] +``` + +- 统计级比对:对tensor整体进行余弦值及相对误差的计算。 +- 像素级比对:对输入的两个npy文件进行逐元素比对。若两个tensor对应元素的相对误差或绝对误差大于**误差阈值**(-al和-rl配置)则被标记为错误数据。 + +| 参数名称 | 说明 | 是否必选 |
| -------- | ----------------------------------------------- | -------- | +| -m | 待比对数据。 | 是 | +| -g | 标杆数据。 | 是 | +| -p | 设置比对结束后打印错误元素的个数,默认值20。 | 否 | +| -al | 判定数据存在精度问题的绝对误差阈值,默认0.001。 | 否 | +| -rl | 判定数据存在精度问题的相对误差阈值,默认0.001。 | 否 | +| -s | 
将npy文件保存成txt文件,用于查看,默认开启。 | 否 | + +输出结果: + +- 统计级比对结果。 +- 两个文件的统计信息(shape, dtype, max, min和mean)。 +- 错误数据打印表格。 + +**示例** + +```bash +# 对比两个tensor的数据 +Parse >>> cn -m Add.InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.323.1619494134703053.output.0.npy -g InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.0.1619492699305998.npy -p 10 -s -al 0.002 -rl 0.005 + Error Item Table Top Item Table +┏━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┏━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ +┃ Index ┃ Left ┃ Right ┃ Diff ┃ ┃ Index ┃ Left ┃ Right ┃ Diff ┃ +┡━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ ┡━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ +│ 155 │ 0.024600908 │ 0.022271132 │ 0.002329776 │ │ 0 │ -0.9206961 │ -0.9222216 │ 0.0015255213 │ +│ 247 │ 0.015752593 │ 0.017937578 │ 0.0021849852 │ │ 1 │ -0.6416973 │ -0.64051837 │ 0.0011789203 │ +│ 282 │ -0.0101207765 │ -0.007852031 │ 0.0022687456 │ │ 2 │ -0.35383835 │ -0.35433492 │ 0.0004965663 │ +│ 292 │ 0.019581757 │ 0.02240482 │ 0.0028230622 │ │ 3 │ -0.18851271 │ -0.18883198 │ 0.00031927228 │ +│ 640 │ -0.06593232 │ -0.06874806 │ 0.0028157383 │ │ 4 │ -0.43508735 │ -0.43534422 │ 0.00025686622 │ +│ 1420 │ 0.09293677 │ 0.09586689 │ 0.0029301196 │ │ 5 │ 1.4447614 │ 1.4466647 │ 0.0019032955 │ +│ 1462 │ -0.085207745 │ -0.088047795 │ 0.0028400496 │ │ 6 │ -0.3455438 │ -0.3444429 │ 0.0011008978 │ +│ 1891 │ -0.03433288 │ -0.036525503 │ 0.002192624 │ │ 7 │ -0.6560242 │ -0.6564579 │ 0.0004336834 │ +│ 2033 │ 0.06828873 │ 0.07139922 │ 0.0031104907 │ │ 8 │ -2.6964858 │ -2.6975214 │ 0.0010356903 │ +│ 2246 │ -0.06376442 │ -0.06121233 │ 0.002552092 │ │ 9 │ -0.73746175 │ -0.73650354 │ 0.00095820427 │ +└───────┴───────────────┴──────────────┴──────────────┘ └───────┴─────────────┴─────────────┴───────────────┘ +╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ Left: | +│ |- NpyFile: 
./dump/temp/decode/Add.InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.323.1619494134703053.output.0.npy | +│ |- TxtFile: ./dump/temp/decode/Add.InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.323.1619494134703053.output.0.npy.txt | +│ |- NpySpec: [Shape: (32, 8, 8, 320)] [Dtype: float32] [Max: 5.846897] [Min: -8.368301] [Mean: -0.72565556] | +│ DstFile: │ +│ |- NpyFile: ./dump/cpu/InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.0.1619492699305998.npy | +│ |- TxtFile: ./dump/cpu/InceptionV3_InceptionV3_Mixed_7a_Branch_0_add_3.0.1619492699305998.npy.txt | +│ |- NpySpec: [Shape: (32, 8, 8, 320)] [Dtype: float32] [Max: 5.8425903] [Min: -8.374472] [Mean: -0.7256237] │ +│ NumCnt: 655360 │ +│ AllClose: False │ +│ CosSim: 0.99999493 │ +│ ErrorPer: 0.023504638671875 (rl= 0.005, al= 0.002) │ +╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + diff --git a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_compare.md b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_compare.md new file mode 100644 index 0000000000000000000000000000000000000000..9beda3b02f2d72383a2bcaa4c20bcd9c5b8ba971 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_compare.md @@ -0,0 +1,153 @@ +# **精度比对工具** + +## CPU或GPU与NPU精度数据比对 + +### 总体说明 + +- 本节主要介绍CPU或GPU与NPU精度数据比对的函数以及示例,执行精度比对操作前需要先完成CPU或GPU与NPU的精度数据dump,详见《[精度数据采集](./dump.md)》。 + +- 比对函数均通过单独创建精度比对脚本执行,可支持单卡和多卡场景的精度数据比对。 + +- 工具性能:比对数据量较小时(参考值单份文件小于10GB),参考比对速度0.1GB/s;比对数据量较大时,参考比对速度0.3GB/s。 推荐环境配置:独占环境,CPU核心数192,固态硬盘(IO速度参考:固态硬盘 > 500MB/s,机械硬盘60 ~ 170MB/s)。 + + 用户环境性能弱于标准约束或非独占使用的比对速度酌情向下浮动。比对速度的计算方式:两份比对文件大小/比对耗时。 + +### 约束 + +- NPU自研API,在CPU或GPU若没有对应的API,该API的dump数据不比对。 +- NPU与CPU或GPU的计算结果误差可能会随着模型的执行不断累积,最终会出现同一个API因为输入的数据差异较大而无法比对的情况。 +- CPU或GPU与NPU中两个相同的API会因为调用次数不同导致无法比对或比对到错误的API,不影响整体运行,该API忽略。 + +### compare_distributed + +**功能说明** + 
+将CPU或GPU与NPU的dump文件进行比对,支持单卡和多卡,可同时比对多卡的dump数据。多机场景需要每个设备单独执行比对操作。可自动检索和匹配对应卡和进程所dump的数据文件,再调用compare进行比对。单机单卡时与compare函数二选一。 + +**函数原型** + +```Python +compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| -------------- | ------------------------------------------------------------ | -------- | +| npu_dump_dir | 配置NPU环境下的dump目录。dump数据目录须指定到step级。参数示例:'./npu_dump/step0'。数据类型:str。 | 是 | +| bench_dump_dir | 配置CPU、GPU或NPU环境下的dump目录。参数示例:'./gpu_dump/step0'。数据类型:str。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。需要预先创建output_path目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_rank{npu_ID}-rank{cpu/gpu/npu_ID}_{timestamp}.csv`。数据类型:str。 | 是 | +| **kwargs | 支持compare的所有可选参数。 | 否 | + +**函数示例** + +创建比对脚本,例如compare_distributed.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```Python +from atat.pytorch import * +compare_distributed('./npu_dump/step0', './gpu_dump/step0', './output') +``` + +dump数据目录须指定到step级。 + +### compare + +**功能说明** + +将CPU或GPU与NPU的dump文件进行比对,仅支持单机单卡。 + +**函数原型** + +```Python +compare(input_param, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False) +``` + +**参数说明** + +| 参数名 | 说明 | 是否必选 | +| ------------ | ------------------------------------------------------------ | -------- | +| input_param | 配置dump数据文件及目录。数据类型:dict。配置参数包括:
"npu_json_path":指定NPU dump目录下的dump.json文件。参数示例:"npu_json_path": "./npu_dump/dump.json"。必选。
"bench_json_path":指定CPU、GPU或NPU dump目录下的dump.json文件。参数示例:"bench_json_path": "./gpu_dump/dump.json"。必选。
"stack_json_path":指定NPU dump目录下的stack.json文件。参数示例:"stack_json_path": "./npu_dump/stack.json"。可选。
"is_print_compare_log":配置是否开启日志打屏。可取值True或False。可选。 | 是 | +| output_path | 配置比对结果csv文件存盘目录。参数示例:'./output'。文件名称基于时间戳自动生成,格式为:`compare_result_{timestamp}.csv`。数据类型:str。 | 是 | +| stack_mode | 配置stack_mode的开关。仅当配置"stack_json_path"需要开启。可取值True或False,参数示例:stack_mode=True,默认为False。数据类型:bool。 | 否 | +| auto_analyze | 自动精度分析,开启后工具自动针对比对结果进行分析,识别到第一个精度不达标节点(在比对结果文件中的“Accuracy Reached or Not”列显示为No),并给出问题可能产生的原因(打屏展示并生成advisor_{timestamp}.txt文件)。可取值True或False,参数示例:auto_analyze=False,默认为True。数据类型:bool。 | 否 | +| fuzzy_match | 模糊匹配。开启后,对于网络中同一层级且命名仅调用次数不同的API,可匹配并进行比对。可取值True或False,参数示例:fuzzy_match=True,默认为False。数据类型:bool。 | 否 | + +**函数示例** + +单机单卡场景下创建比对脚本,例如compare.py,拷贝如下代码,具体参数请根据实际环境修改。 + +```Python +from atat.pytorch import compare +dump_result_param={ +"npu_json_path": "./npu_dump/dump.json", +"bench_json_path": "./gpu_dump/dump.json", +"stack_json_path": "./npu_dump/stack.json", +"is_print_compare_log": True +} +compare(dump_result_param, output_path="./output", stack_mode=True) +``` + +### 统计量比对 + +若使用**compare**或**compare_distributed**函数创建的比对脚本中,在[config.json](../../config/config.json)文件中配置"task": "statistics"方式dump时,可以进行统计量比对,此时比对dump.json文件中的统计信息,开启后的比对结果文件生成Max diff、Min diff、Mean diff和L2norm diff,表示NPU dump数据中API的输入或输出与标杆数据输入或输出的最大值、最小值、平均值以及L2范数的差。可以通过该值判断API是否存在精度问题:当某个API的输入和输出的Max diff、Min diff、Mean diff和L2norm diff均为0或无限趋于0,那么可以判断该API无精度问题,反之则可能存在精度问题。 + +**比对脚本示例** + +以compare.py为例。 + +```Python +from atat.pytorch import compare +dump_result_param={ +"npu_json_path": "./npu_dump/dump.json", +"bench_json_path": "./gpu_dump/dump.json", +"stack_json_path": "./npu_dump/stack.json", +"is_print_compare_log": True +} +compare(dump_result_param, output_path="./output", stack_mode=True) +``` + +**比对结果** + +数据量比对同样生成`compare_result_{timestamp}.csv`和`advisor_{timestamp}.txt`文件。其中`advisor_{timestamp}.txt`主要对`compare_result_{timestamp}.csv`中可能存在精度问题(Result为Waring)的API提出定位建议;`compare_result_{timestamp}.csv`主要有如下两种情况: + +- "summary_mode": "statistics"时比对dump.json文件: + + 
![compare_result_pkl](img/compare_result_pkl.png) + + 上图是对dump.json文件中NPU及标杆API的统计信息进行比对,判断可能存在精度问题的API,文件中记录NPU及标杆API的基本信息和统计信息,其中需要关注Result列,包含结果:Waring(NPU与标杆统计信息的比对中存在相对误差大于0.5,则需要重点检查该API);为空(相对误差小于等于0.5,可以不需要重点关注,但不代表不存在精度问题);Nan(表示统计信息数据没有匹配上)。 + +- "summary_mode": "md5"时比对dump.json文件: + + ![compare_result_pkl_md5.png](img/compare_result_pkl_md5.png.png) + + 上图是对dump.json文件中NPU及标杆API的MD5信息进行比对,判断API数据的完整性,文件中记录NPU及标杆API的基本信息和MD5信息,其中需要关注Result列,包含结果:Pass(表示NPU与标杆的MD5值一致,即API数据完整);Different(表示NPU与标杆的MD5值不一致,即API数据不完全一致,可以通过NPU_Stack_Info列API调用栈查询该API的详细信息);Nan(表示MD5信息数据没有匹配上)。 + +## 计算精度评价指标 + +通过计算精度评价指标可以直接从精度比对结果文件中找出不符合精度标准的算子。 + +PyTorch精度比对是以CPU或GPU的计算结果为标杆,计算Cosine(余弦相似度)、MaxAbsErr(最大绝对误差)和MaxRelativeErr(最大相对误差),根据这两个结果判断API在运行时是否存在精度问题。 + +计算精度评价指标: + +1. Cosine:通过计算两个向量的余弦值来判断其相似度,数值越接近于1说明计算出的两个张量越相似,实际可接受阈值为大于0.99。在计算中可能会存在nan,主要由于可能会出现其中一个向量为0。 + +2. MaxAbsErr:当最大绝对误差越接近0表示其计算的误差越小,实际可接受阈值为小于0.001。 + +3. MaxRelativeErr:当最大相对误差越接近0表示其计算的误差越小。 + + 当dump数据中存在0或Nan时,比对结果中最大相对误差则出现inf或Nan的情况,属于正常现象。 + +4. One Thousandth Err Ratio(双千分之一)、Five Thousandths Err Ratio(双千分之五)精度指标:是指NPU的Tensor中的元素逐个与对应的标杆数据对比,相对误差大于千分之一、千分之五的比例占总元素个数的比例小于千分之一、千分之五。该数据仅作为精度下降趋势的参考,并不参与计算精度是否通过的判定。 + +精度比对结果csv文件中只需要通过Accuracy Reached or Not来判断计算精度是否达标,判断标准如下: + +1. Cosine < 0.99 且 MaxAbsError > 0.001时,精度不达标,标记为“No”。 +2. Cosine < 0.9,精度不达标,标记为“No”。 +3. MaxAbsError > 1,精度不达标,标记为“No”。 +4. 
其余情况下记为精度达标,标记为“Yes”。 + +# FAQ + +[FAQ](./FAQ.md) + diff --git a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_overview.md b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_overview.md new file mode 100644 index 0000000000000000000000000000000000000000..708d90b3487c47249c5f6a8b0f37671e8918e7e2 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_overview.md @@ -0,0 +1,68 @@ +# **精度比对工具** + +## 简介 + +在PyTorch训练网络,对同一模型或API调试过程中,遇到API相关的计算精度问题,定位时费时费力。 + +atat的精度比对工具,用来进行PyTorch整网API粒度的数据dump、精度比对和溢出检测,从而定位PyTorch训练场景下的精度问题。 + +**使用场景** + +主要的使用场景包括: + +- 同一模型,从CPU或GPU移植到NPU中存在精度下降问题,对比NPU芯片中的API计算数值与CPU或GPU芯片中的API计算数值,进行问题定位。 +- 同一模型,进行迭代(模型、框架版本升级或设备硬件升级)时存在的精度下降问题,对比相同模型在迭代前后版本的API计算数值,进行问题定位。 + +## 原理介绍 + +精度对比工具,通过在PyTorch模型中注册hook,跟踪计算图中API的前向传播与反向传播时的输入与输出,排查存在计算精度误差,进行问题的精准定位。 + +**精度比对流程** + +1. 当模型在CPU或GPU上进行正向和反向传播时,分别dump每一层的数值输入与输出。 + +2. 当模型在NPU中进行计算时,采用相同的方式dump下相应的数据。 + +3. 通过对比dump出的数值,计算余弦相似度和最大绝对误差的方式,定位和排查NPU API存在的计算精度问题。如下图所示。 + + 精度比对逻辑图 + + ![module_compare](img/module_compare.png) + +**API匹配条件** + +进行精度比对时,需要判断CPU或GPU的API与NPU的API是否相同可比对,须满足以下匹配条件: + +- 两个API的名称相同,API命名规则:`{api_type}.{api_name}.{api调用次数}.{正反向}.{输入输出}.index`,如:Functional.conv2d.1.backward.input.0。 +- 两个API的输入输出Tensor数量和各个Tensor的Shape相同。 + +通常满足以上两个条件,工具就认为是同一个API,成功进行API的匹配,后续进行相应的计算精度比对。 + +## 精度比对总体流程 + +1. 准备CPU或GPU训练工程。 + +2. 在环境下安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +3. 在训练脚本内添加atat工具dump接口PrecisionDebugger采集标杆数据。详见《[精度数据采集](./dump.md)》。 + +4. 执行训练dump数据。 + +5. 将CPU或GPU训练工程迁移为NPU训练工程。详见《[PyTorch模型迁移调优指南](https://www.hiascend.com/document/detail/zh/Pytorch/60RC1/ptmoddevg/trainingmigrguide/PT_LMTMOG_0003.html)》。 + +6. 在NPU环境下安装atat工具。详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +7. 在NPU训练脚本内添加atat工具dump接口PrecisionDebugger采集标杆数据。详见《[精度数据采集](./dump.md)》。 + +8. NPU环境下执行训练dump数据。 + +9. 执行精度比对。 + + 1. 创建并配置精度比对脚本,例如compare.py。 + + 2. 执行CPU或GPU dump与NPU dump数据的精度比对。 + + 3. 
比对结果分析。 + + 详见《[CPU或GPU与NPU精度数据比对](./ptdbg_ascend_compare.md)》。 + diff --git a/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_quickstart.md b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_quickstart.md new file mode 100644 index 0000000000000000000000000000000000000000..ae6e3b0b4bbad4796b0332ee8a41b3ae14e5f94e --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/ptdbg_ascend_quickstart.md @@ -0,0 +1,381 @@ +# **精度比对工具** + +本文主要介绍atat的精度比对工具的快速入门和场景化示例。 + +本文介绍的操作需要安装atat工具,详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +本文介绍的操作主要是精度数据dump和精度比对,详细操作指导可参考《[精度数据采集](./dump.md)》和《[CPU或GPU与NPU精度数据比对](./ptdbg_ascend.md)》。 + +## 快速入门 + +### 单卡场景精度比对 + +**精度分析建议** + +PyTorch训练场景的精度问题分析建议参考以下思路进行精度比对和比对结果分析: + +1. 整网比对:dump整网数据并进行精度比对,初步定位异常范围。 + + 对于模型数据庞大(比如达到T级别)的场景,不推荐直接dump整网比对,整网dump可能导致磁盘不足,需要预留足够的存储空间或者分多次dump。 + +2. 缩小范围:根据Accuracy Reached or Not找出不符合精度标准的API。 + +3. 范围比对:对不符合精度标准的API重新dump详细信息。 + +4. 分析原因并优化:分析API精度不符合标准的原因并进行优化调整。 + +5. 整网比对:重新进行整网比对,判断优化后的API是否已符合精度标准以及是否出现新的精度问题。 + +6. 重复1~5步,直到不存在精度问题为止。 + +**精度分析示例** + +1. 修改dump配置文件config.json。 + + ```json + { + "task": "tensor", + "dump_path": "./npu_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "tensor": { + "scope": [], + "list": [], + "data_mode": ["all"], + "summary_mode": "statistics" + } + } + ``` + +2. 在训练脚本内添加atat工具,dump整网数据。 + + 分别dump CPU或GPU以及NPU数据,在PyTorch训练脚本插入dump接口,示例代码如下(下面以NPU为例,CPU或GPU dump基本相同): + + ```python + from atat.pytorch import PrecisionDebugger + debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + +3. 
比对整网数据。 + + 第1步中的NPU dump数据目录为npu_dump,假设GPU dump数据目录为gpu_dump;dump将生成dump.json、stack.json、construct.json文件以及dump数据目录。 + + 创建并配置精度比对脚本,以创建compare.py为例,示例代码如下: + + ```python + from atat.pytorch import compare + dump_result_param={ + "npu_json_path": "./npu_dump/dump.json", + "bench_json_path": "./gpu_dump/dump.json", + "stack_json_path": "./npu_dump/stack.json", + "is_print_compare_log": True + } + compare(dump_result_param, output_path="./output", stack_mode=True) + ``` + + 执行比对: + + ```bash +python3 compare.py + ``` + + 在output目录下生成结果文件,包括:`compare_result_{timestamp}.csv`和`advisor_{timestamp}.txt` + +4. 找出存在问题的API。 + + 1. 根据`advisor_{timestamp}.txt`或打屏信息的提示,可找到存在精度问题的算子(Suspect Nodes)和专家建议(Expert Advice)。 + + ![auto_analyze_log](img/auto_analyze_log.png) + + 2. 根据第2步结果文件`compare_result_{timestamp}.csv`中的Accuracy Reached or No字段显示为NO的API,针对该API执行后续比对操作,分析该API存在的精度问题。 + +5. (可选)重新比对。 + + 根据第3步的dump数据重新配置compare.py并执行比对,可以对单API模型进行问题复现。 + +**注意**:部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响kernel init初始化多次,导致功能异常。 + +### 溢出检测场景 + +溢出检测是针对NPU的PyTorch API,检测是否存在溢出的情况。当前仅支持识别aicore浮点溢出。 + +溢出检测原理:针对溢出阶段,开启acl dump模式,重新对溢出阶段执行,落盘数据。 + +建议按照如下步骤操作: + +1. 修改dump配置文件config.json。 + + ```json + { + "task": "overflow_check", + "dump_path": "./npu_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "overflow_check": { + "overflow_nums": 3 + } + } + ``` + +2. 在NPU训练脚本内添加atat工具,执行溢出检测dump。 + + ```python + from atat.pytorch import PrecisionDebugger + debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + + 多卡使用时各卡单独计算溢出次数。 + +3. 
NPU环境下执行训练dump溢出数据。 + +   针对输入正常但输出存在溢出的API,会在训练执行目录下将溢出的API信息dump并保存为`dump.json`,通过《[溢出解析工具](./run_overflow_check.md)》对json文件进行解析,输出溢出API为正常溢出还是非正常溢出,从而帮助用户快速判断。 + +   溢出解析工具执行命令如下: + +   ```bash + atat -f pytorch run_overflow_check -api_info ./dump.json + ``` + + 反向过程溢出的API暂不支持精度预检功能。 + + +当重复执行溢出检测dump操作时,需要删除上一次dump目录下的溢出检测dump数据,否则将因重名而报错。 + +**注意事项** + +* (暂不支持)level为L2场景下,会增加npu的内存消耗,请谨慎开启。 +* (暂不支持)部分API存在调用嵌套关系,比如functional.batch_norm实际调用torch.batch_norm,该场景会影响acl init初始化多次,导致level为L2功能异常。 +* 混合精度动态loss scale场景下,正常训练会有"Gradient overflow. SKipping step"日志,添加溢出检测后日志消失,可以通过设置环境变量export OVERFLOW_DEBUG_MODE_ENABLE=1,并将register_hook位置调整到amp.initialize之前解决。此功能需要cann包配套支持,不支持版本执行报错EZ3003。 + +## 场景化示例 + +### 多卡场景精度比对 + +精度工具支持多卡场景的精度比对,多卡场景的dump步骤与单卡场景完全一致,请参见“**单卡场景精度比对**”章节,不同的是多卡数据精度比对时需要使用“compare_distributed”函数进行比对。 + +如下示例: + +说明:多机多卡场景需要每个节点单独执行比对操作。 + +假设NPU dump 数据目录为npu_dump,GPU dump数据目录为gpu_dump。 + +1. 创建比对脚本,例如compare_distributed.py,拷贝如下代码。 + + ```python + from atat.pytorch import * + compare_distributed('./npu_dump/step0', './gpu_dump/step0', './output') + ``` + + dump数据目录须指定到step级。 + +2. 
执行比对: + + ```bash + python3 compare_distributed.py + ``` + +两次运行须用相同数量的卡,传入`compare_distributed`的两个文件夹下须有相同个数的rank文件夹,且不包含其他无关文件,否则将无法比对。 + +**多卡set_dump_path注意事项** + +多卡一般为多进程,须保证每个进程都正确调用PrecisionDebugger,或把PrecisionDebugger插入到import语句后,如: + +```python +from atat.pytorch import PrecisionDebugger +debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") +``` + +如此可保证set_dump_path在每个进程都被调用。 + +### NPU vs NPU精度比对 + +对于NPU vs NPU场景,是针对同一模型,进行迭代(模型、API版本升级或设备硬件升级)时存在的精度下降问题,对比相同模型在迭代前后版本的API计算数值,进行问题定位。 + +一般情况下迭代涉及NPU自定义算子,因此,可以仅dump NPU自定义算子进行比对。比对精度问题分析请参见“**单卡场景精度比对**”章节。 + +工具当前支持dump NPU自定义算子如下: + +| 序号 | NPU自定义算子 | +| :--- | ----------------------------------------------- | +| 1 | torch_npu.one_ | +| 2 | torch_npu.npu_sort_v2 | +| 3 | torch_npu.npu_transpose | +| 4 | torch_npu.npu_broadcast | +| 5 | torch_npu.npu_dtype_cast | +| 6 | torch_npu.empty_with_format | +| 7 | torch_npu.npu_one_hot | +| 8 | torch_npu.npu_stride_add | +| 9 | torch_npu.npu_ps_roi_pooling | +| 10 | torch_npu.npu_roi_align | +| 11 | torch_npu.npu_nms_v4 | +| 12 | torch_npu.npu_iou | +| 13 | torch_npu.npu_nms_with_mask | +| 14 | torch_npu.npu_pad | +| 15 | torch_npu.npu_bounding_box_encode | +| 16 | torch_npu.npu_bounding_box_decode | +| 17 | torch_npu.npu_batch_nms | +| 18 | torch_npu.npu_slice | +| 19 | torch_npu._npu_dropout | +| 20 | torch_npu.npu_indexing | +| 21 | torch_npu.npu_ifmr | +| 22 | torch_npu.npu_max | +| 23 | torch_npu.npu_scatter | +| 24 | torch_npu.npu_layer_norm_eval | +| 25 | torch_npu.npu_alloc_float_status | +| 26 | torch_npu.npu_confusion_transpose | +| 27 | torch_npu.npu_bmmV2 | +| 28 | torch_npu.fast_gelu | +| 29 | torch_npu.npu_sub_sample | +| 30 | torch_npu.npu_deformable_conv2d | +| 31 | torch_npu.npu_mish | +| 32 | torch_npu.npu_anchor_response_flags | +| 33 | torch_npu.npu_yolo_boxes_encode | +| 34 | torch_npu.npu_grid_assign_positive | +| 35 | torch_npu.npu_normalize_batch | +| 36 | torch_npu.npu_masked_fill_range | +| 37 | 
torch_npu.npu_linear | +| 38 | torch_npu.npu_bert_apply_adam | +| 39 | torch_npu.npu_giou | +| 40 | torch_npu.npu_ciou | +| 41 | torch_npu.npu_diou | +| 42 | torch_npu.npu_sign_bits_pack | +| 43 | torch_npu.npu_sign_bits_unpack | +| 44 | torch_npu.npu_flash_attention | +| 45 | torch_npu.npu_scaled_masked_softmax | +| 46 | torch_npu.npu_rotary_mul | +| 47 | torch_npu.npu_roi_align | +| 48 | torch_npu.npu_roi_alignbk | +| 49 | torch_npu.npu_ptiou | +| 50 | torch_npu.npu_fusion_attention | +| 51 | torch_npu.npu_dropout_with_add_softmax | +| 52 | torch_npu.npu_random_choice_with_mask | +| 53 | torch_npu.npu_rotated_iou | +| 54 | torch_npu.npu_conv2d | +| 55 | torch_npu.npu_conv3d | +| 56 | torch_npu.npu_softmax_cross_entropy_with_logits | +| 57 | torch_npu.npu_all_gather_base_mm | +| 58 | torch_npu.npu_swiglu | +| 59 | torch_npu.npu_rms_norm | +| 60 | torch_npu.npu_mm_reduce_scatter_base | +| 61 | torch_npu.npu_mm_all_reduce_base | +| 62 | torch_npu.npu_conv_transpose2d | +| 63 | torch_npu.npu_convolution | +| 64 | torch_npu.npu_convolution_transpose | +| 65 | torch_npu.npu_min | +| 66 | torch_npu.npu_nms_rotated | +| 67 | torch_npu.npu_reshape | +| 68 | torch_npu.npu_rotated_box_decode | +| 69 | torch_npu.npu_rotated_box_encode | +| 70 | torch_npu.npu_rotated_overlaps | +| 71 | torch_npu.npu_silu | +| 72 | torch_npu.npu_fused_attention_score | +| 73 | torch_npu.npu_multi_head_attention | +| 74 | torch_npu.npu_gru | +| 75 | torch_npu.npu_incre_flash_attention | +| 76 | torch_npu.npu_prompt_flash_attention | +| 77 | torch_npu.npu_lstm | +| 78 | torch_npu.npu_apply_adam | + +### 通信API的数据dump + +通信类API数据可以使用全量dump方式获取,若只dump通信类API数据,可以使用如下示例: + +1. 修改dump配置文件config.json。 + + ```json + { + "task": "tensor", + "dump_path": "./npu_dump", + "rank": [], + "step": [], + "level": "L1", + "seed": 1234, + "is_deterministic": false, + + "tensor": { + "scope": [], + "list": ["distributed"], + "data_mode": ["all"], + "summary_mode": "statistics" + } + } + ``` + +2. 
在训练脚本内添加atat工具,dump整网数据。 + + ```python + from atat.pytorch import PrecisionDebugger + debugger = PrecisionDebugger(config_path="./config.json", dump_path="./npu_dump") + # 请勿将以上初始化流程插入到循环代码中 + + # 模型初始化 + # 下面代码也可以用PrecisionDebugger.start()和PrecisionDebugger.stop() + debugger.start() + + # 需要dump的代码片段1 + + debugger.stop() + debugger.start() + + # 需要dump的代码片段2 + + debugger.stop() + debugger.step() + ``` + +通信类API支持列表: + +| 序号 | Distributed | +| :--- | -------------------- | +| 1 | send | +| 2 | recv | +| 3 | broadcast | +| 4 | all_reduce | +| 5 | reduce | +| 6 | all_gather | +| 7 | gather | +| 8 | isend | +| 9 | irecv | +| 10 | scatter | +| 11 | reduce_scatter | +| 12 | _reduce_scatter_base | +| 13 | _all_gather_base | + diff --git a/debug/accuracy_tools/atat/pytorch/doc/run_overflow_check.md b/debug/accuracy_tools/atat/pytorch/doc/run_overflow_check.md new file mode 100644 index 0000000000000000000000000000000000000000..1bdc4f354cfaf0bfbdf701baa7dfb05f3771e30b --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/doc/run_overflow_check.md @@ -0,0 +1,25 @@ +# **溢出解析工具** + +针对训练过程中的溢出检测场景(当《[精度数据采集](./dump.md)》开启溢出检测dump时),对于输入正常但输出存在溢出的API,会在训练执行目录下将溢出的API信息按照前向和反向分类,dump并保存为`dump.json`,前向过程溢出的API可通过该工具对`dump.json`进行解析,输出溢出API为正常溢出还是非正常溢出,从而帮助用户快速判断。 + +工具支持PyTorch版本:1.11.0/2.0/2.1/2.2。 + +操作步骤如下: + +1. 安装工具。 + + 详见《[MindStudio精度调试工具](../../README.md)》的“工具安装”章节。 + +2. 
执行溢出API解析操作。 + + ```bash + atat -f pytorch run_overflow_check -api_info ./dump.json + ``` + +| 参数名称 | 说明 | 是否必选 | +| -------------------------- | -------------------------------------------------- | -------- | +| -api_info或--api_info_file | 指定API信息文件dump.json。 | 是 | +| -j或--jit_compile | 开启jit编译。 | 否 | +| -d或--device | 指定Device ID,选择UT代码运行所在的卡,默认值为0。 | 否 | + +反向过程溢出的API暂不支持该功能。 diff --git a/debug/accuracy_tools/atat/pytorch/dump/dump.py b/debug/accuracy_tools/atat/pytorch/dump/dump.py new file mode 100644 index 0000000000000000000000000000000000000000..f890360000b7f6d6d7fb7804d779b4a24199a356 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/dump/dump.py @@ -0,0 +1,455 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import inspect +import json +import os +import threading +from pathlib import Path + +import numpy as np +import torch + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +from atat.core.utils import (print_warn_log, Const, print_info_log, modify_dump_path, check_inplace_op, CompareConst, + print_error_log) +from atat.core.file_check_util import FileOpen, change_mode, FileCheckConst +from atat.pytorch.common.utils import get_md5_for_tensor +from ..dump.utils import check_writable +from .utils import (DumpUtil, check_if_in_api_list, make_dump_data_dir, get_tensor_rank, create_dirs_if_not_exist, + CompareException, check_single_rank_folder) + + +forward_init_status = False +backward_init_status = False + +thread_lock = threading.Lock() +pkl_name = "" +rank = os.getpid() + 100000 +multi_output_apis = ["_sort_", "npu_flash_attention"] +module_count = {} + + +class APIList(list): + threshold = 1000 + + def __init__(self, *args): + self.dump_count = 0 + self.pkl_mode_changed = False + super().__init__(*args) + + def flush(self): + pkl_path = get_pkl_file_path() + if len(self) == 0 or pkl_path == "": + return + with FileOpen(pkl_path, 'a') as f: + try: + f.write('\n'.join(json.dumps(item) for item in self)) + f.write('\n') + except IOError as ex: + raise Exception("write to disk failed") from ex + self.dump_count += 1 + print_info_log(f"write {len(self)} items to {pkl_path} the {self.dump_count} time") + if not self.pkl_mode_changed: + change_mode(pkl_path, FileCheckConst.DATA_FILE_AUTHORITY) + self.pkl_mode_changed = True + self.clear() + + def append(self, data): + list.append(self, data) + if len(self) >= APIList.threshold: + self.flush() + + +api_list = APIList() + + +class DataInfo(object): + def __init__(self, save_data, summary_data, dtype, shape, md5=None): + if md5 is None: + md5 = [] + self.save_data = save_data + self.summary_data = summary_data + self.dtype = dtype + self.shape = shape + self.md5 = md5 + + +def 
def get_not_float_tensor_info(data):
    """Build a DataInfo for a non-floating-point tensor.

    In "md5" summary mode only dtype/shape/md5 are recorded; otherwise
    max/min/mean statistics are collected and the norm slot is reported
    as NAN, since a norm is meaningless for non-float data.
    """
    if DumpUtil.summary_mode == "md5":
        return DataInfo([], [], str(data.dtype), tuple(data.shape), get_md5_for_tensor(data))
    if data.numel() == 0 or data.dtype == torch.bool:
        # Empty or boolean tensors carry no meaningful statistics.
        stat_max, stat_min, stat_mean = [], [], []
    elif len(data.shape) == 0:
        # 0-dim tensor: the single scalar serves as max, min and mean alike.
        scalar = data.float().item()
        stat_max = stat_min = stat_mean = scalar
    else:
        stat_max = torch._C._VariableFunctionsClass.max(data).float().item()
        stat_min = torch._C._VariableFunctionsClass.min(data).float().item()
        stat_mean = torch._C._VariableFunctionsClass.mean(data.float()).float().item()
    return get_tensor_data_info(data, stat_max, stat_min, stat_mean, CompareConst.NAN)


def get_scalar_data_info(data):
    """Build a DataInfo for a python scalar (bool/int/float); the value
    itself fills all four summary slots."""
    stats = [data] * 4
    return DataInfo(data, stats, str(type(data)), str([]))


def get_float_tensor_info(data):
    """Build a DataInfo for a floating-point tensor (max/min/mean/norm),
    or only dtype/shape/md5 in "md5" summary mode."""
    if DumpUtil.summary_mode == "md5":
        return DataInfo([], [], str(data.dtype), tuple(data.shape), get_md5_for_tensor(data))
    variable_funcs = torch._C._VariableFunctionsClass
    return get_tensor_data_info(
        data,
        variable_funcs.max(data).float().item(),
        variable_funcs.min(data).float().item(),
        variable_funcs.mean(data).float().item(),
        variable_funcs.norm(data).float().item(),
    )


def get_tensor_data_info(data, *tensor_args):
    """Assemble a DataInfo from a tensor and its pre-computed statistics.

    The raw tensor values are kept only in "all" summary mode; bfloat16 is
    converted to float32 first because numpy has no bfloat16 dtype.
    """
    summary_data = list(tensor_args)
    if DumpUtil.summary_mode != "all":
        return DataInfo([], summary_data, str(data.dtype), tuple(data.shape))
    detached = data.contiguous().cpu().detach()
    if data.dtype == torch.bfloat16:
        detached = detached.to(torch.float32)
    return DataInfo(detached.numpy(), summary_data, str(data.dtype), tuple(data.shape))
+ dump_tensor(item, "{}.{}".format(prefix, i), dump_step) + return + elif isinstance(x, torch.Tensor): + if x.is_meta: + print_info_log(f"Meta tensor {prefix} is skipped.") + return + x_clone = x.clone().detach() + if x_clone.numel() == 0 or len(x_clone.shape) == 0 or not x_clone.is_floating_point(): + if DumpUtil.dump_filter_switch == Const.OFF: + data_info = get_not_float_tensor_info(x_clone) + dump_data_by_rank_count(dump_step, prefix, data_info) + else: + return + else: + data_info = get_float_tensor_info(x_clone) + dump_data_by_rank_count(dump_step, prefix, data_info) + + elif DumpUtil.dump_filter_switch == Const.OFF: + if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): + data_info = get_scalar_data_info(x) + dump_data_by_rank_count(dump_step, prefix, data_info) + + +def append_pkl_data(dump_step, prefix, data_info): + global api_list + thread_lock.acquire() + api_list.append([prefix, dump_step, data_info.md5, data_info.dtype, data_info.shape, data_info.summary_data]) + thread_lock.release() + + +def dump_data(prefix, data_info): + if DumpUtil.summary_mode != "all": + return + output_path = os.path.join(DumpUtil.dump_data_dir, f'{prefix}.npy') + try: + np.save(output_path, data_info.save_data) + change_mode(output_path, FileCheckConst.DATA_FILE_AUTHORITY) + except Exception as e: + print_warn_log("Dump data failed, error: {}".format(e)) + + +def thread_dump_data(prefix, data_info): + DumpUtil.dump_thread_pool.submit(dump_data, prefix, data_info) + + +def dump_data_by_rank_count(dump_step, prefix, data_info): + print_info_log(f"ptdbg is analyzing rank{rank} api: {prefix}" + " " * 10, end='\r') + if DumpUtil.is_single_rank and DumpUtil.dump_thread_pool: + thread_dump_data(prefix, data_info) + else: + dump_data(prefix, data_info) + append_pkl_data(dump_step, prefix, data_info) + + +def dump_stack_info(name_template): + if check_inplace_op(name_template) and Const.PRE_FORWARD in name_template: + return + + stack_str = [] + try: + for (_, path, 
line, func, code, _) in inspect.stack()[4:]: + if code: + stack_line = [path, str(line), func, code[0].strip() if code else code] + else: + stack_line = [path, str(line), func, code] + stack_str.append(stack_line) + except Exception as e: + print_warn_log("Dump stack info failed, error: {}".format(e)) + stack_str.append('') + + prefix = name_template.format("stack_info") + if DumpUtil.dump_switch_mode in Const.DUMP_MODE: + complement_set = set(['forward', 'backward', 'input', 'output']) - set(DumpUtil.dump_mode) + if not any(mode in prefix for mode in complement_set): + api_list.append([prefix, stack_str]) + else: + api_list.append([prefix, stack_str]) + + +def dump_api_tensor(dump_step, in_feat, name_template, out_feat): + if check_inplace_op(name_template): + if Const.PRE_FORWARD in name_template: + name_template = name_template.replace(Const.PRE_FORWARD, Const.FORWARD) + else: + if Const.BACKWARD in name_template and Const.BACKWARD in DumpUtil.dump_mode: + return + elif Const.BACKWARD not in name_template and Const.FORWARD in DumpUtil.dump_mode: + if "output" in DumpUtil.dump_mode: + dump_tensor(in_feat, name_template.format("output"), dump_step) + if "input" in DumpUtil.dump_mode: + return + + if Const.BACKWARD in name_template and Const.BACKWARD in DumpUtil.dump_mode: + if 'input' in DumpUtil.dump_mode: + dump_tensor(out_feat, name_template.format("input"), dump_step) + if 'output' in DumpUtil.dump_mode: + dump_tensor(in_feat, name_template.format("output"), dump_step) + elif Const.BACKWARD not in name_template and Const.FORWARD in DumpUtil.dump_mode: + if 'input' in DumpUtil.dump_mode: + dump_tensor(in_feat, name_template.format("input"), dump_step) + if 'output' in DumpUtil.dump_mode: + dump_tensor(out_feat, name_template.format("output"), dump_step) + + +def rename_(): + global rank + global pkl_name + if rank is not None and pkl_name is not None: + dir_name = os.path.join(DumpUtil.dump_root, "step{}".format(DumpUtil.iter_num), "rank{}".format(os.getpid() + 
100000)) + new_name = os.path.join(DumpUtil.dump_root, "step{}".format(DumpUtil.iter_num), "rank{}".format(rank)) + if not os.path.exists(new_name) and os.path.exists(dir_name): + _, file_name = os.path.split(pkl_name) + os.rename(dir_name, new_name) + pkl_name = os.path.join(new_name, file_name) + + +def dump_acc_cmp(name, in_feat, out_feat, dump_step, module): + if not DumpUtil.get_dump_switch(): + return + if DumpUtil.dump_switch_mode == Const.API_LIST and not check_if_in_api_list(name): + return + if DumpUtil.dump_switch_mode in [Const.LIST, Const.ACL, Const.RANGE, Const.STACK] and not DumpUtil.check_switch_scope(name): + return + dump_file = DumpUtil.get_dump_path() + dump_file = modify_dump_path(dump_file, DumpUtil.dump_switch_mode) + global rank + dump_dir, dump_filename = os.path.split(dump_file) + dump_dir = os.path.join(dump_dir, "step{}".format(DumpUtil.iter_num)) + if not os.path.exists(dump_dir): + Path(dump_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + dump_file = os.path.join(dump_dir, dump_filename) + rank_this = get_tensor_rank(in_feat, out_feat) + DumpUtil.dump_root = os.path.dirname(DumpUtil.dump_path) + if rank_this is not None and rank != rank_this: + rank = rank_this + rename_() + if not DumpUtil.dump_init_enable: + if '.pkl' in dump_filename: + npy_dir = dump_filename[:-4] + else: + npy_dir = dump_filename + DumpUtil.dump_data_dir = os.path.join(DumpUtil.dump_root, "step{}".format(DumpUtil.iter_num), "rank{}".format(rank), npy_dir) + if DumpUtil.target_rank is not None: + if rank != DumpUtil.target_rank: + return + dump_file = create_dirs_if_not_exist(rank, dump_file) + global pkl_name + pkl_name = dump_file + if DumpUtil.dump_init_enable: + DumpUtil.dump_init_enable = False + DumpUtil.dump_data_dir = make_dump_data_dir(dump_file) \ + if DumpUtil.dump_switch_mode not in [Const.STACK, Const.ACL] and DumpUtil.summary_mode == "all" else "" + if os.path.exists(dump_file) and not os.path.isdir(dump_file): + 
check_writable(dump_file) + try: + os.remove(dump_file) + except FileNotFoundError as e: + print_warn_log("The file does not exist, error: {}".format(e)) + + name_prefix = name + name_template = f"{name_prefix}" + "_{}" + if DumpUtil.is_single_rank is None: + DumpUtil.is_single_rank = check_single_rank_folder(dump_dir) + if DumpUtil.dump_switch_mode in [Const.ALL, Const.API_LIST]: + dump_api_tensor(dump_step, in_feat, name_template, out_feat) + elif DumpUtil.dump_switch_mode == Const.API_STACK: + dump_api_tensor(dump_step, in_feat, name_template, out_feat) + dump_stack_info(name_template) + else: + if DumpUtil.dump_switch_mode == Const.ACL: + acl_dump(module, name, name_prefix) + elif DumpUtil.dump_switch_mode != Const.STACK: + dump_api_tensor(dump_step, in_feat, name_template, out_feat) + dump_stack_info(name_template) + + +def acl_dump(module, module_name, name_prefix): + if name_prefix in DumpUtil.backward_input: + dump_mode_backward_acl_dump(module, module_name, DumpUtil.backward_input.get(name_prefix)) + else: + forward_acl_dump(module, module_name) + + +def Op_Need_Trigger(module_name): + if 'Tensor___getitem___' in module_name: + return True + return False + + +def forward_acl_dump(module, module_name): + global forward_init_status + global backward_init_status + if not forward_init_status and not backward_init_status: + forward_init_status = True + torch_npu.npu.synchronize() + torch_npu.npu.init_dump() + torch_npu.npu.set_dump(DumpUtil.dump_config) + torch_npu.npu.synchronize() + if Op_Need_Trigger(module_name): + module.forward(*module.input_args, **module.input_kwargs).cpu() + else: + module.forward(*module.input_args, **module.input_kwargs) + torch_npu.npu.synchronize() + torch_npu.npu.finalize_dump() + torch_npu.npu.synchronize() + del module.input_args + del module.input_kwargs + forward_init_status = False + print_info_log("Dump %s op file." 
% module_name) + + +def acl_backward_dump_status(output, grad, module_name): + if isinstance(output, torch.Tensor): + output.backward(grad, retain_graph=True) + return True + + for api_name in multi_output_apis: + if api_name in module_name: + output[0].backward(grad, retain_graph=True) + return True + return False + + +def dump_mode_backward_acl_dump(module, module_name, grad_path): + global forward_init_status + global backward_init_status + module_name = module_name.replace(Const.FORWARD, Const.BACKWARD) + if not forward_init_status and not backward_init_status: + forward_init_status = True + module.input_args = list(module.input_args) + for i, data in enumerate(module.input_args): + if isinstance(data, torch.Tensor) and data.grad_fn: + module.input_args[i] = data.detach().requires_grad_() + output = module.forward(*module.input_args, **module.input_kwargs) + grad = torch.tensor(np.load(grad_path)).to("npu").requires_grad_() + torch_npu.npu.init_dump() + torch_npu.npu.set_dump(DumpUtil.dump_config) + torch_npu.npu.synchronize() + if not acl_backward_dump_status(output, grad, module_name): + print_warn_log("The output of {} is not of tensor type and cannot be automatically derived. " + "you can manually construct a single API backward case for ACL dump.".format(module_name)) + torch_npu.npu.synchronize() + torch_npu.npu.finalize_dump() + del module.input_args + del module.input_kwargs + forward_init_status = False + print_info_log("Dump %s op file." 
def module_count_func(name, name_template):
    """Return the call index for a module hook name.

    Forward calls increment a per-module counter and push the index on a
    stack; backward calls pop the matching forward index so that forward
    and backward dumps pair up. Returns the string "abnormal" when a
    backward call has no matching forward record.
    """
    module_name = name.split("_")[-3]
    if Const.FORWARD in name_template:
        if module_name not in module_count:
            module_count[module_name] = [0, [0]]
        else:
            # Drop a stale stack top left behind by a forward call whose
            # backward never ran.
            if module_count[module_name][-1] and \
                    module_count[module_name][0] != module_count[module_name][-1][-1]:
                module_count[module_name][-1].pop()
            module_count[module_name][0] += 1
            module_count[module_name][-1].append(module_count[module_name][0])
        index = module_count[module_name][0]
    else:
        backward_stack = module_count[module_name][-1] if module_name in module_count else []
        if not backward_stack:
            print_warn_log("The backward stack of {} is empty.".format(module_name))
            index = "abnormal"
        else:
            index = backward_stack.pop()
    return index


def acc_cmp_dump(name, **kwargs):
    """Create a forward/backward hook that dumps accuracy-compare data.

    Keyword args:
        dump_step: dump step number recorded with every item (default 1).
        pid: pid of the process that is allowed to dump; required.

    Returns:
        The hook callable to register on the module.

    Raises:
        RuntimeError: if ``pid`` is missing or falsy.
    """
    dump_step = kwargs.get('dump_step', 1)
    pid = kwargs.get('pid')
    name_template = name
    if not pid:
        # Bug fix: the original code *returned* the RuntimeError instance
        # instead of raising it, so a missing pid silently handed callers
        # an exception object where a hook was expected.
        raise RuntimeError("Not get the specified process pid.")

    def acc_cmp_hook(module, in_feat, out_feat=None):
        nonlocal name, name_template
        if "_{}_" in name_template:
            try:
                index = module_count_func(name, name_template)
            except IndexError as e:
                print_error_log(f"Get module {name_template} index failed.")
                raise CompareException(CompareException.INDEX_OUT_OF_BOUNDS_ERROR) from e
            name = name_template.format(index)
        # Only the designated process performs the dump.
        if pid == os.getpid():
            dump_acc_cmp(name, in_feat, out_feat, dump_step, module)
        if hasattr(module, "input_args"):
            del module.input_args
        if hasattr(module, "input_kwargs"):
            del module.input_kwargs

    return acc_cmp_hook


def write_to_disk():
    """Flush any buffered pkl items to disk."""
    api_list.flush()


def get_pkl_file_path():
    """Return the current pkl dump file path ("" before the first dump)."""
    return pkl_name


def reset_module_count():
    """Reset the per-module forward/backward call counters."""
    global module_count
    module_count = {}
b/debug/accuracy_tools/atat/pytorch/dump/utils.py @@ -0,0 +1,357 @@ +import os +import re +import shutil +from pathlib import Path +import torch +import torch.distributed as dist + +from atat.core.utils import print_error_log, CompareException, DumpException, Const, get_time, print_info_log, \ + check_mode_valid, check_switch_valid, check_dump_mode_valid, check_summary_only_valid, generate_compare_script, \ + check_file_valid, make_dump_path_if_not_exists, check_path_before_create, check_summary_mode_valid +from atat.core.file_check_util import FileChecker, FileCheckConst, check_path_length, check_path_pattern_vaild +from atat.pytorch.common.utils import check_is_npu + +from ..dump import dump + +dump_count = 0 +range_begin_flag, range_end_flag = False, False + + +def check_list_or_acl_mode(name_prefix): + global dump_count + for item in DumpUtil.dump_switch_scope: + if name_prefix.startswith(item): + dump_count = dump_count + 1 + return True + return False + + +def check_range_mode(name_prefix): + global range_begin_flag + global range_end_flag + if name_prefix.startswith(DumpUtil.dump_switch_scope[0]): + range_begin_flag = True + return True + if name_prefix.startswith(DumpUtil.dump_switch_scope[1]): + range_end_flag = True + return True + if range_begin_flag and not range_end_flag: + return True + return False + + +def check_stack_mode(name_prefix): + if len(DumpUtil.dump_switch_scope) == 0: + return True + elif len(DumpUtil.dump_switch_scope) == 1: + return name_prefix.startswith(DumpUtil.dump_switch_scope[0]) + elif len(DumpUtil.dump_switch_scope) == 2: + return check_range_mode(name_prefix) + else: + print_error_log("dump scope is invalid, Please set the scope mode in" + " set_dump_switch with 'all', 'list', 'range', 'stack', 'acl', 'api_list'!") + return False + + +class DumpConfig: + def __init__(self, mode=None, scope=None, api_list=None, filter_switch=None, dump_mode=None, summary_only=False, summary_mode="all"): + self.mode = mode + self.scope = scope + 
self.api_list = api_list + self.filter_switch = filter_switch + self.dump_mode = dump_mode + self.summary_only = summary_only + self.summary_mode = summary_mode + + +class DumpUtil(object): + dump_root = None + dump_data_dir = None + dump_path = None + dump_switch = None + dump_switch_mode = Const.ALL # all, api_stack, list, stack... + dump_switch_scope = [] + dump_init_enable = False + dump_api_list = [] + dump_filter_switch = None + dump_mode = ['forward', 'backward', 'input', 'output'] + backward_input = {} + dump_dir_tag = 'ptdbg_dump' + dump_config = None + dataloader_iter = 0 + target_iter = None + iter_num = 0 + target_rank = None + summary_only = False + need_replicate = False + summary_mode = "all" + is_single_rank = None + dump_thread_pool = None + + + @staticmethod + def set_dump_path(save_path): + DumpUtil.dump_path = save_path + DumpUtil.dump_init_enable = True + + @staticmethod + def set_acl_config(acl_config): + if not acl_config: + raise ValueError("acl_config must be configured when mode is 'acl'") + acl_config_checker = FileChecker(acl_config, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.JSON_SUFFIX) + acl_config = acl_config_checker.common_check() + DumpUtil.dump_config = acl_config + + @staticmethod + def set_dump_switch(switch, dump_config): + DumpUtil.dump_switch = switch + if dump_config.mode is not None: + DumpUtil.dump_switch_mode = dump_config.mode + DumpUtil.dump_init_enable = True + if dump_config.scope is not None: + DumpUtil.dump_switch_scope = dump_config.scope + if dump_config.api_list is not None: + DumpUtil.dump_api_list = [api.lower() for api in dump_config.api_list] + if dump_config.filter_switch is not None: + DumpUtil.dump_filter_switch = dump_config.filter_switch + if dump_config.dump_mode is not None: + DumpUtil.dump_mode = dump_config.dump_mode if isinstance(dump_config.dump_mode, list) else [dump_config.dump_mode] + + if dump_config.mode == Const.ACL: + DumpUtil.dump_switch_scope = 
[api_name.replace("backward", "forward") for api_name in dump_config.scope] + + DumpUtil.summary_only = dump_config.summary_only + DumpUtil.summary_mode = dump_config.summary_mode + + check_mapper = { + Const.LIST: check_list_or_acl_mode, + Const.ACL: check_list_or_acl_mode, + Const.RANGE: check_range_mode, + Const.STACK: check_stack_mode + } + + @staticmethod + def check_switch_scope(name_prefix): + if DumpUtil.dump_switch_mode in DumpUtil.check_mapper: + check_func = DumpUtil.check_mapper[DumpUtil.dump_switch_mode] + return check_func(name_prefix) + return False + + @staticmethod + def get_dump_path(): + if DumpUtil.dump_path: + return DumpUtil.dump_path + + if DumpUtil.dump_switch_mode == Const.ALL: + raise RuntimeError("get_dump_path: the file path is empty," + " you must use set_dump_path to set a valid dump path!!!") + else: + dir_path = os.path.realpath("./") + dump_file_name = "scope_dump_{}_{}_{}.pkl".format( + DumpUtil.dump_switch_mode, DumpUtil.dump_switch_scope[0], get_time()) + DumpUtil.dump_path = os.path.join(dir_path, dump_file_name) + return DumpUtil.dump_path + + @staticmethod + def get_dump_switch(): + return DumpUtil.dump_switch == "ON" + + +def set_dump_path(fpath=None, dump_tag='ptdbg_dump'): + fpath = load_env_dump_path(fpath) + check_file_valid(fpath) + if not re.match(Const.FILE_PATTERN, dump_tag): + print_error_log('The file path {} contains special characters.'.format(dump_tag)) + raise CompareException(CompareException.INVALID_PATH_ERROR) + real_path = os.path.realpath(fpath) + make_dump_path_if_not_exists(real_path) + fpath_checker = FileChecker(real_path, FileCheckConst.DIR, FileCheckConst.WRITE_ABLE) + fpath_checker.common_check() + DumpUtil.set_dump_path(real_path) + DumpUtil.dump_dir_tag = dump_tag + + +def get_tensor_rank(in_feat, out_feat): + if dist.is_initialized(): + return dist.get_rank() + + def get_tensor_rank_single(x): + if isinstance(x, (list, tuple)): + if len(x) > 0: + return get_tensor_rank_single(x[0]) + return None + 
elif isinstance(x, torch.Tensor): + device = x.device + if device.type == 'cpu': + return None + else: + return device.index + return None + in_rank = get_tensor_rank_single(in_feat) + if in_rank is None: + out_rank = get_tensor_rank_single(out_feat) + if out_rank is None: + return None + return out_rank + return in_rank + + +def create_dirs_if_not_exist(rank, dump_file): + dump_path, file_name = os.path.split(dump_file) + rank_dir = os.path.join(dump_path, f"rank{rank}") + dump_file = os.path.join(rank_dir, file_name) + if not os.path.isdir(rank_dir): + check_path_pattern_vaild(dump_file) + check_path_length(dump_file, name_length=200) + Path(rank_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + return dump_file + + +def generate_dump_path_str(): + if DumpUtil.dump_switch_mode == 'acl': + if DumpUtil.dump_config == '': + print_error_log("Please provide dump config for register hook before turning on dump switch!") + raise DumpException(DumpException.NONE_ERROR) + dump_path = f"according to dump config {DumpUtil.dump_config}" + else: + dump_dir, dump_file = os.path.split(DumpUtil.dump_path) + if not dump_file.endswith(".pkl"): + dump_dir = DumpUtil.dump_path + dump_path = f"to {dump_dir}" + return dump_path + + +def set_dump_switch(switch, mode=Const.ALL, scope=None, api_list=None, filter_switch=Const.OFF, dump_mode=None, + summary_only=False): + if scope is None: + scope = [] + if api_list is None: + api_list = [] + if dump_mode is None: + dump_mode = [Const.ALL] + check_switch_valid(switch) + if not DumpUtil.dump_path: + set_dump_path() + dump_config = DumpConfig(summary_only=summary_only) + DumpUtil.set_dump_switch(switch, dump_config) + dump_path_str = generate_dump_path_str() + if switch == "OFF": + dump.write_to_disk() + if check_is_npu() and DumpUtil.dump_switch_mode in [Const.ALL, Const.API_STACK, Const.LIST, Const.RANGE, Const.API_LIST]: + generate_compare_script(DumpUtil.dump_data_dir, dump.get_pkl_file_path(), 
DumpUtil.dump_switch_mode) + set_dump_switch_print_info(switch, mode, dump_path_str) + set_dump_switch_config(mode=mode, scope=scope, api_list=api_list, filter_switch=filter_switch, dump_mode=dump_mode, + summary_only=summary_only) + + +def set_dump_switch_config(mode=Const.ALL, scope=None, api_list=None, filter_switch=Const.OFF, dump_mode=None, + summary_only=False, summary_mode="all"): + if scope is None: + scope = [] + if api_list is None: + api_list = [] + if dump_mode is None: + dump_mode = [Const.ALL] + try: + check_summary_mode_valid(summary_mode) + check_mode_valid(mode, scope, api_list) + check_switch_valid(filter_switch) + dump_mode = check_dump_mode_valid(dump_mode) + summary_only = check_summary_only_valid(summary_only) + except (CompareException, AssertionError) as err: + print_error_log(str(err)) + raise CompareException(CompareException.INVALID_PARAM_ERROR) from err + switch = DumpUtil.dump_switch + dump_config = DumpConfig(mode, scope, api_list, filter_switch, dump_mode, summary_only, summary_mode) + DumpUtil.set_dump_switch("OFF", dump_config) + DumpUtil.dump_switch = switch + + +def set_dump_switch_print_info(switch, mode, dump_path_str): + global dump_count + if switch == "ON": + print_info_log(f"Dump switch is turned on. Dump data will be saved {dump_path_str}. ") + if mode == Const.LIST: + dump_count = 0 + else: + print_info_log(f"Dump switch is turned off. 
") + if mode == Const.LIST: + print_info_log("The number of matched dump is {}".format(dump_count)) + + +def check_if_in_api_list(name): + if not DumpUtil.dump_api_list: + return False + for api in DumpUtil.dump_api_list: + if api.lower() in name.lower(): + return True + return False + + +def set_backward_input(backward_input): + for index, api_name in enumerate(DumpUtil.dump_switch_scope): + DumpUtil.backward_input[api_name] = backward_input[index] + + +def make_dump_data_dir(dump_file_name): + dump_path, file_name = os.path.split(os.path.realpath(dump_file_name)) + name_body, name_extension = os.path.splitext(file_name) + output_dir = os.path.join(dump_path, f"{name_body}") + check_path_before_create(output_dir) + if not os.path.exists(output_dir): + Path(output_dir).mkdir(mode=0o750, exist_ok=True) + else: + shutil.rmtree(output_dir, ignore_errors=True) + Path(output_dir).mkdir(mode=0o750, exist_ok=True) + return output_dir + + +def make_dump_dirs(): + dump_file_name, dump_file_name_body = "dump.pkl", "dump" + dump_root_dir = load_env_dump_path(DumpUtil.dump_path) + tag_dir = os.path.join(dump_root_dir, DumpUtil.dump_dir_tag) + check_path_length(tag_dir) + check_path_pattern_vaild(tag_dir) + Path(tag_dir).mkdir(mode=0o750, parents=True, exist_ok=True) + DumpUtil.dump_dir = tag_dir + dump_file_path = os.path.join(tag_dir, dump_file_name) + DumpUtil.set_dump_path(dump_file_path) + + +def check_writable(dump_file): + if not os.access(dump_file, os.W_OK): + print_error_log( + 'The path {} does not have permission to write. 
def load_env_dump_path(dump_path):
    """Resolve the dump path, falling back to the ASCEND_WORK_PATH
    environment variable when `dump_path` is empty.

    Raises:
        DumpException: when no path can be determined or the environment
            value cannot be joined into a path.
    """
    if dump_path:
        return dump_path
    env_path = os.getenv(Const.ASCEND_WORK_PATH)
    if not env_path:
        print_error_log("Dump path is None, you can configure it in the following ways:\n"
                        "1. Configure set_dump_path function.\n"
                        "2. Configure the dump_path parameter of PrecisionDebugger.\n"
                        "3. Set environment variables ASCEND_WORK_PATH.")
        raise DumpException(DumpException.INVALID_PATH_ERROR)
    try:
        return os.path.join(str(env_path), Const.DUMP_DIR)
    except TypeError as err:
        print_error_log("Generating dump path from environment variables ASCEND_WORK_PATH failed.")
        raise DumpException(DumpException.INVALID_PATH_ERROR) from err


def check_single_rank_folder(dump_path):
    """Return True when `dump_path` contains exactly one `rank<N>`
    sub-directory (files matching the pattern are ignored)."""
    rank_dir_re = re.compile(r'^rank\d+$')
    found = 0
    for entry in os.listdir(dump_path):
        if not rank_dir_re.match(entry):
            continue
        if os.path.isdir(os.path.join(dump_path, entry)):
            found += 1
            if found > 1:
                # More than one rank directory: not a single-rank dump.
                return False
    return found == 1
file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/common/constant.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..9b72437f2280ca44a20fc5e370f1cfd9b9ea3ac4 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/common/constant.py @@ -0,0 +1,66 @@ +from typing import Dict + +import numpy as np +import torch +from atat.pytorch.free_benchmark.common.enums import FuzzThreshold +from atat.pytorch.free_benchmark.common.params import BenchmarkThd + + +class CommonField: + DEVICE = "device" + META = "meta" + FUZZ_TENSOR = "fuzz_tensor" + REQUIRES_GRAD = "requires_grad" + HOLD_PLACE = "hold_place" + DISTRIBUTED_OP = "torch.distributed" + + +class ThresholdConfig: + PERTURBATION_VALUE_DICT: Dict = { + torch.bfloat16: FuzzThreshold.BF16_THD, + torch.float16: FuzzThreshold.F16_THD, + torch.float32: FuzzThreshold.F32_THD, + torch.float64: FuzzThreshold.F64_THD, + } + + ABS_TOL_VALUE_DICT: Dict = { + torch.bfloat16: FuzzThreshold.BF16_THD, + torch.float16: FuzzThreshold.F16_THD, + torch.float32: FuzzThreshold.F32_THD, + torch.float64: FuzzThreshold.F64_THD, + } + + # bit翻转需要匹配到等长或更长的整型 + PERTURBATION_BIT_DICT = { + torch.bfloat16: torch.int16, + torch.float16: torch.int16, + torch.float32: torch.int32, + torch.float64: torch.int64, + } + + # 输入噪声下界 + NOISE_INPUT_LOWER_BOUND = 1e-8 + COMP_CONSISTENT = 1.0 + COMP_NAN = np.nan + SYMBOL_FLIPPING = "symbol_flipping" + BACKWARD_OUTPUT_LOWER_BOUND = 1e-3 + SMALL_VALUE = 1.0 + # 预热初始阈值 + PREHEAT_INITIAL_THD = 2.05 + API_THD_STEP = 2.0 + + DTYPE_PER_THD = { + torch.float16: 1.002, + torch.float32: 1.0002, + } + BENCHMARK_THD_DICT = { + torch.float32: BenchmarkThd(2**-14, 1.0, 2**-14, 1e-4), + torch.float16: BenchmarkThd(2**-11, 1.0, 2**-11, 1e-4), + torch.bfloat16: BenchmarkThd(2**-8, 1.0, 2**-8, 1e-4), + } + + 
class PreheatConfig:
    """Keys of the preheat-related fields inside the user configuration."""

    IF_PREHEAT = "if_preheat"
    PREHEAT_STEP = "preheat_step"
    MAX_SAMPLE = "max_sample"


class PreheatCounter:
    """Book-keeping for the preheat stage.

    Tracks per-step call/sample counts for each API and stores per-dtype
    comparison thresholds plus "keep preheating?" flags that survive across
    training steps.
    """

    def __init__(self) -> None:
        self.api_called_time: dict = defaultdict(int)
        self.api_sample_time: dict = defaultdict(int)
        self.one_step_used_api: dict = defaultdict(int)
        self.api_thd: dict = defaultdict(dict)
        self.preheat_record: dict = defaultdict(dict)
        self.dtype_map: dict = {}
        self.if_preheat: dict = defaultdict(dict)
        self.step = 0

    def clear_step(self):
        """Drop per-step state; thresholds and preheat flags are kept."""
        for per_step_state in (
            self.preheat_record,
            self.api_called_time,
            self.api_sample_time,
        ):
            per_step_state.clear()

    def check_step(self, current_step):
        """Reset per-step counters when a new training step begins."""
        if current_step == self.step:
            return
        self.clear_step()
        self.step = current_step

    def add_api_called_time(self, api_name: str):
        self.api_called_time[api_name] += 1

    def get_api_called_time(self, api_name: str) -> int:
        return self.api_called_time[api_name]

    def add_api_sample_time(self, api_name: str):
        self.api_sample_time[api_name] += 1

    def get_api_sample_time(self, api_name: str) -> int:
        return self.api_sample_time[api_name]

    def add_one_step_used_api(self, api_name: str):
        self.one_step_used_api[api_name] += 1

    def get_one_step_used_api(self, api_name: str):
        return self.one_step_used_api[api_name]

    def update_preheat_record(self, api_name, dtype, cmp_result):
        """Append one CPU-benchmark comparison result for (api, dtype)."""
        self.preheat_record[api_name].setdefault(str(dtype), []).append(cmp_result)
        self.dtype_map[str(dtype)] = dtype

    def update_api_thd(self, api_name, dtype, threshold, dthreshold):
        # Never let the stored threshold drop below the dtype default.
        self.api_thd[api_name][str(dtype)] = max(threshold, dthreshold)

    def get_api_thd(self, api_name, dtype):
        """Return the stored threshold, seeding it lazily with the initial value."""
        dtype_key = str(dtype)
        if dtype_key not in self.api_thd[api_name]:
            self.api_thd[api_name][dtype_key] = ThresholdConfig.PREHEAT_INITIAL_THD
            self.dtype_map[dtype_key] = dtype
        return self.api_thd[api_name][dtype_key]

    def set_api_preheat(self, api_name, dtype_str, is_preheat=True):
        """Mark whether (api, dtype) should keep being preheated."""
        self.if_preheat[api_name][dtype_str] = is_preheat

    def get_api_preheat(self, api_name, dtype):
        # Unknown dtypes default to "still preheating".
        return self.if_preheat[api_name].get(str(dtype), True)


preheat_counter = PreheatCounter()


class PerturbationMode:
    """Names of the supported perturbation strategies."""

    ADD_NOISE = "add_noise"
    CHANGE_VALUE = "change_value"
    IMPROVE_PRECISION = "improve_precision"
    NO_CHANGE = "no_change"
    BIT_NOISE = "bit_noise"
    TO_CPU = "to_cpu"


class DeviceType:
    """Execution device labels."""

    NPU = "npu"
    CPU = "cpu"


class FuzzThreshold:
    """Default noise magnitude per floating dtype."""

    BF16_THD = 1e-4
    F16_THD = 1e-6
    F32_THD = 1e-8
    F64_THD = 1e-16


class NormType:
    """(order, label) pairs for the supported norms."""

    ONE_NORM = (1, "one_norm")
    TWO_NORM = (2, "two_norm")
    ENDLESS_NORM = (3, "endless_norm")


class HandlerType:
    """Result-handler flavors."""

    CHECK = "check"
    PREHEAT = "preheat"
    FIX = "fix"


class FuzzLevel:
    """Fuzzing depth levels (L2/L3 are placeholders for future use)."""

    BASE_LEVEL = "L1"
    ADV_LEVEL = "L2"
    REAL_LEVEL = "L3"
@dataclass
class DataParams:
    """Mutable state for one perturbed execution of an API."""

    args: Optional[Tuple] = None
    kwargs: Optional[Dict] = None
    valid_input_index: Optional[int] = None
    original_result: Optional[Any] = None
    perturbed_result: Optional[Any] = None
    is_consistent: Optional[bool] = True
    perturbed_value: Optional[Any] = None
    origin_func: Optional[Callable] = None
    api_type: Optional[str] = None
    fuzz_stage: Optional[str] = None
    grad_unequal_flag: Optional[bool] = True


@dataclass
class HandlerParams:
    """Configuration snapshot passed to a result handler."""

    handler_type: Optional[str] = None
    api_name: Optional[str] = None
    pert_mode: Optional[PerturbationMode] = None
    step: Optional[int] = None
    fuzz_stage: Optional[str] = None
    fuzz_device: Optional[DeviceType] = None
    preheat_config: Optional[Dict] = None
    fuzz_level: Optional[str] = None


@dataclass
class UnequalRow:
    """One row of the "results differ" report."""

    rank: Optional[int] = None
    pert_mode: Optional[PerturbationMode] = None
    stage: Optional[str] = None
    step: Optional[int] = None
    api_name: Optional[str] = None
    max_rel: Optional[float] = None
    dtype: Optional[str] = None
    shape: Optional[str] = None
    output_index: Optional[int] = None


@dataclass
class BenchmarkThd:
    """Per-dtype benchmark comparison thresholds."""

    rtol: Optional[float] = None  # relative error threshold
    small_value: Optional[float] = None  # bound of the small-value domain
    small_value_atol: Optional[float] = None  # absolute threshold inside the small-value domain
    err_balance: Optional[float] = None  # error-balance threshold
+def check_args_type(args: Tuple) -> int: + for i, arg in enumerate(args): + if torch.is_tensor(arg): + if arg.is_meta: + continue + if not torch.is_floating_point(arg): + continue + return i + if isinstance(arg, (List, Tuple, Dict)): + return i + return -1 + + +def data_pre_deal(name, func, args, kwargs): + data_params = DataParams(args=args, kwargs=kwargs, origin_func=func) + index = check_args_type(args) + data_params.valid_input_index = index + if index == -1: + print_warn_log_rank_0( + f"[atat] Free benchmark: 无标杆工具不支持当前算子的输入类型 {name}." + ) + return data_params + + +def make_handler_params(name, config, step): + handler_params = HandlerParams() + handler_params.api_name = name + handler_params.step = step + handler_params.handler_type = config.handler_type + handler_params.fuzz_stage = config.fuzz_stage + handler_params.fuzz_device = config.fuzz_device + handler_params.preheat_config = config.preheat_config + handler_params.fuzz_level = config.fuzz_level + handler_params.pert_mode = config.pert_mode + return handler_params + + +def make_unequal_row( + data_params: DataParams, + handle_params: HandlerParams, + ratio: float = None, + index: int = None, +): + row = UnequalRow( + api_name=handle_params.api_name, + pert_mode=handle_params.pert_mode, + output_index=index, + stage=handle_params.fuzz_stage, + step=handle_params.step, + ) + if isinstance(ratio, float): + row.max_rel = ratio - 1 + origin_tensor = data_params.original_result + perturbed_tensor = data_params.perturbed_result + if index: + origin_tensor = origin_tensor[index] + perturbed_tensor = perturbed_tensor[index] + row.output_index = index + if isinstance(origin_tensor, torch.Tensor): + row.dtype = origin_tensor.dtype + row.shape = origin_tensor.shape + row.rank = Tools.get_dist_rank() + # 以下暂不支持 + if handle_params.fuzz_level == FuzzLevel.ADV_LEVEL: + pass + if handle_params.fuzz_level == FuzzLevel.REAL_LEVEL: + pass + return row diff --git 
class Tools:
    """Stateless helpers shared by the free-benchmark components."""

    @staticmethod
    def is_float_tensor(tensor) -> bool:
        """True if `tensor` (or any element of a list/tuple) is a floating tensor."""

        def _is_float(obj):
            return isinstance(obj, torch.Tensor) and torch.is_floating_point(obj)

        if _is_float(tensor):
            return True
        if isinstance(tensor, (list, tuple)):
            return any(_is_float(item) for item in tensor)
        return False

    @staticmethod
    def get_dist_rank():
        """Current distributed rank; 0 when the process group is not initialized."""
        try:
            return torch.distributed.get_rank()
        except RuntimeError:
            return 0

    @staticmethod
    def get_first_tensor_dtype(tensor_seq):
        """Dtype of `tensor_seq` itself or of its first tensor element."""
        if isinstance(tensor_seq, torch.Tensor):
            return tensor_seq.dtype
        if isinstance(tensor_seq, (list, tuple)):
            for item in tensor_seq:
                if isinstance(item, torch.Tensor):
                    return item.dtype
        raise RuntimeError("The sequence does not contain tensors.")

    @staticmethod
    def get_pure_api_name(api_name: str):
        # Strip the last two dot-separated components,
        # e.g. "Torch.add.0.forward" -> "Torch.add".
        return api_name.rsplit(".", 2)[0]

    @staticmethod
    def convert_device_and_dtype(
        tensor_seq, device: str = DeviceType.CPU, change_dtype: bool = False
    ):
        """Recursively detach tensors to `device`; optionally widen half types to fp32."""
        if isinstance(tensor_seq, torch.Tensor):
            if change_dtype and tensor_seq.dtype in [torch.float16, torch.bfloat16]:
                return tensor_seq.detach().to(device).to(torch.float32)
            return tensor_seq.detach().to(device)
        if isinstance(tensor_seq, dict):
            return {
                key: Tools.convert_device_and_dtype(value, device, change_dtype)
                for key, value in tensor_seq.items()
            }
        if isinstance(tensor_seq, (tuple, list)):
            converted = [
                Tools.convert_device_and_dtype(value, device, change_dtype)
                for value in tensor_seq
            ]
            return type(tensor_seq)(converted)
        return tensor_seq

    @staticmethod
    def convert_fuzz_output_to_origin(origin, perturbed):
        """Copy `perturbed` values back into `origin`'s tensors (dtype/device preserved)."""
        if isinstance(origin, torch.Tensor):
            origin.data = perturbed.to(origin.dtype).to(origin.device)
            return origin
        if isinstance(origin, dict):
            return {
                key: Tools.convert_fuzz_output_to_origin(value, perturbed[key])
                for key, value in origin.items()
            }
        if isinstance(origin, (tuple, list)):
            restored = [
                Tools.convert_fuzz_output_to_origin(value, perturbed[pos])
                for pos, value in enumerate(origin)
            ]
            return type(origin)(restored)
        return origin


class TorchC:
    """Direct bindings to torch._C variable functions.

    NOTE(review): presumably bound to the C-level functions so that calling
    them does not re-enter the tool's own torch-API hooks — confirm.
    """

    sum = torch._C._VariableFunctionsClass.sum
    isinf = torch._C._VariableFunctionsClass.isinf
    isfinite = torch._C._VariableFunctionsClass.isfinite
    isnan = torch._C._VariableFunctionsClass.isnan
    logical_not = torch._C._VariableFunctionsClass.logical_not
    subtract = torch._C._VariableFunctionsClass.subtract
    abs = torch._C._VariableFunctionsClass.abs
    where = torch._C._VariableFunctionsClass.where
    div = torch._C._VariableFunctionsClass.div
    max = torch._C._VariableFunctionsClass.max
    min = torch._C._VariableFunctionsClass.min
    gt = torch._C._VariableFunctionsClass.gt
    ge = torch._C._VariableFunctionsClass.ge
    lt = torch._C._VariableFunctionsClass.lt
    mean = torch._C._VariableFunctionsClass.mean
    full = torch._C._VariableFunctionsClass.full
    add = torch._C._VariableFunctionsClass.add
    bitwise_xor = torch._C._VariableFunctionsClass.bitwise_xor
    clone = torch._C._VariableFunctionsClass.clone
class GradSaver:
    """Captures backward inputs of one API call and compares the original
    vector-Jacobian product against a perturbed re-computation."""

    def __init__(self, origin_func, handler_params: "HandlerParams"):
        self.handler_params = handler_params
        self.api_name = handler_params.api_name
        self.origin_func = origin_func
        self.data_params = DataParams()
        self.is_compare = True
        self.kwargs = dict()
        self.perturbed_grad_input = tuple()
        self.origin_grad_input = tuple()
        self.need_grad_flag = list()
        self.backward_input = tuple()

    def register_compare_func_for_inputs(self, inputs, data_processor):
        """Attach a gradient hook to every requires-grad tensor input.

        `new_grad_index` counts only the grad-bearing tensors (it indexes
        `perturbed_grad_input`), while `input_index` is the position in the
        full argument list; both are bound as defaults to avoid the
        late-binding closure pitfall.
        """
        _index = 0
        for j, obj in enumerate(inputs):
            if torch.is_tensor(obj) and obj.requires_grad:

                def compare_func(grad, new_grad_index=_index, input_index=j):
                    if not self.is_compare:
                        return grad
                    try:
                        perturbed_grad = self.check_grad_input(grad, new_grad_index)
                        handler = FuzzHandlerFactory.create(self.handler_params)
                        self.compare_grad_results(
                            handler, grad, perturbed_grad, index=input_index
                        )
                        data_processor.update_unequal_rows(handler.get_unequal_rows())
                    except Exception as e:
                        # Best effort: a comparison failure must never break training.
                        print_warn_log_rank_0(
                            f"[atat] Free benchmark: grad compare error: {e}"
                        )
                    return grad

                obj.register_hook(compare_func)
                _index += 1

    def compare_grad_results(self, handler, origin_grad, perturbed_grad, index):
        """Compare one input gradient; on mismatch, re-handle the full grad tuple."""
        self.data_params.original_result = origin_grad
        self.data_params.perturbed_result = perturbed_grad
        self.data_params.grad_unequal_flag = False
        self.data_params.valid_input_index = index
        try:
            handler.handle(self.data_params)
            if not self.data_params.is_consistent:
                # Stop further per-input comparison and report the whole tuple once.
                self.is_compare = False
                self.data_params.grad_unequal_flag = True
                self.data_params.is_consistent = True
                self.data_params.perturbed_result = self.perturbed_grad_input
                self.data_params.original_result = self.origin_grad_input
                handler.handle(self.data_params)
        except Exception as e:
            print_warn_log_rank_0(
                f"[atat] Free benchmark: compare two vjp failed: api:{self.handler_params.api_name}."
                f"{e}"
            )

    def check_grad_input(self, origin_grad, new_grad_index):
        """Fetch the perturbed gradient matching `origin_grad`, or None if unusable."""
        # Bug fix: the field is initialized to an empty tuple, never None, so the
        # old `is None` check could not fire; test emptiness instead.
        if not self.perturbed_grad_input:
            print_info_log_rank_0(
                f"[atat] Free benchmark: grad not exists : {self.api_name}."
            )
            return None
        try:
            with torch.no_grad():
                perturbed_grad = self.perturbed_grad_input[new_grad_index].to(
                    origin_grad.device
                )
        except IndexError:
            print_warn_log_rank_0(
                f"[atat] Free benchmark: grad index out of range. api:{self.handler_params.api_name}."
                f"index:{new_grad_index}, perturbation grad len {len(self.perturbed_grad_input)}"
            )
            return None
        if origin_grad.shape != perturbed_grad.shape:
            print_warn_log_rank_0(
                f"[atat] Free benchmark: grad shapes are inconsistent. api:{self.handler_params.api_name}."
                f"origin:{origin_grad.shape}, perturbation: {perturbed_grad.shape}"
            )
            return None
        return perturbed_grad

    def cache_backward_input(self, backward_input_list):
        """Snapshot tensor inputs to CPU (with device/requires_grad metadata)."""
        _inputs = []
        with torch.no_grad():
            for backward_input in backward_input_list:
                if torch.is_tensor(backward_input):
                    _inputs.append(
                        {
                            CommonField.DEVICE: backward_input.device,
                            CommonField.FUZZ_TENSOR: backward_input.cpu(),
                            CommonField.REQUIRES_GRAD: backward_input.requires_grad,
                        }
                    )
                else:
                    _inputs.append(backward_input)
        self.backward_input = _inputs

    def get_vjp_input(self):
        """Rebuild tensors from the cache and split grad-bearing ones out.

        Returns (need_grad_tensors, inner_args) where inner_args marks each
        grad-bearing slot with HOLD_PLACE for later re-insertion.
        """
        inner_args_tmp = []
        need_grad_tensors = []
        for object_ in self.backward_input:
            if isinstance(object_, dict) and CommonField.FUZZ_TENSOR in object_.keys():
                tensor_ = torch.tensor(
                    object_.get(CommonField.FUZZ_TENSOR).data,
                    dtype=object_.get(CommonField.FUZZ_TENSOR).dtype,
                    device=object_.get(CommonField.DEVICE),
                    requires_grad=object_.get(CommonField.REQUIRES_GRAD),
                )
                if tensor_.requires_grad:
                    inner_args_tmp.append(CommonField.HOLD_PLACE)
                    need_grad_tensors.append(tensor_)
                    self.need_grad_flag.append(True)
                else:
                    self.need_grad_flag.append(False)
                    inner_args_tmp.append(tensor_)
            else:
                self.need_grad_flag.append(False)
                inner_args_tmp.append(object_)
        return need_grad_tensors, tuple(inner_args_tmp)

    def get_grad_input_from_vjp(self, need_grad_tensors, grad_output, inner_args):
        """Compute input gradients via torch.autograd.functional.vjp."""

        def vjp_func(*inputs):
            # Re-insert the differentiable tensors at their HOLD_PLACE slots.
            _real_input = []
            index_ = 0
            for object_ in inner_args:
                if object_ is CommonField.HOLD_PLACE:
                    _real_input.append(inputs[index_])
                    index_ += 1
                else:
                    _real_input.append(object_)
            kwargs = self.kwargs.copy()
            # vjp re-executes the op; an in-place run would corrupt the inputs.
            if 'inplace' in kwargs:
                kwargs['inplace'] = False
            return self.origin_func(*_real_input, **kwargs)

        _, grad_input = torch.autograd.functional.vjp(
            vjp_func, tuple(need_grad_tensors), grad_output
        )
        return grad_input

    def calculate_perturbed_grad_input(self, grad_output, need_grad_tensors, inner_args):
        """Run the vjp through a perturbation layer and cache the result on CPU."""
        self.data_params.args = [need_grad_tensors, grad_output, inner_args]
        self.data_params.kwargs = {}
        self.data_params.valid_input_index = 0
        self.data_params.origin_func = self.get_grad_input_from_vjp
        layer = LayerFactory.create(
            self.handler_params.api_name,
            self.handler_params.fuzz_device,
            self.handler_params.pert_mode,
        )
        layer.handle(self.data_params)
        self.perturbed_grad_input = tuple(
            [x.cpu() for x in self.data_params.perturbed_result]
        )
class SingleCompare:
    """Element-wise benchmark comparison between an actual result and a
    golden (reference) result, using per-dtype thresholds."""

    def __init__(self) -> None:
        self.relative_err = None
        self.absolute_err = None
        self.eb = None
        self.threshold = None

    def compare_seq(self, actual, golden):
        """Dispatch on the golden value's type. Convention: first argument
        is the actual value, second is the golden reference."""
        if isinstance(golden, torch.Tensor):
            return self.compare_tensor_seq(actual, golden)
        elif isinstance(golden, dict):
            return self.compare_dict_seq(actual, golden)
        elif isinstance(golden, (tuple, list)):
            return self.compare_list_seq(actual, golden)
        elif isinstance(golden, float):
            return self.compare_float_seq(actual, golden)
        else:
            return self.compare_other_seq(actual, golden)

    def compare_tensor_seq(self, actual, golden):
        """Compare tensors against dtype thresholds; True means consistent."""
        self.threshold = ThresholdConfig.BENCHMARK_THD_DICT.get(
            actual.dtype, ThresholdConfig.BENCHMARK_THD_DICT.get(torch.float32)
        )
        # A golden with inf/nan cannot serve as a reference; report consistent.
        if self.filter_overflow(golden) > 0:
            print_warn_log_rank_0("[atat] Free Benchmark: inf and nan"
                                  "in golden tensor is not supported.")
            return True
        actual = self.replace_inf_or_nan(actual)
        actual = actual.to(torch.float64)
        golden = golden.to(torch.float64).to(actual.device)
        self._cal_compare_metrics(actual, golden)
        if self.absolute_err > self.threshold.small_value_atol:
            return False
        if self.relative_err > self.threshold.rtol:
            return False
        if self.eb > self.threshold.err_balance:
            return False
        return True

    def _cal_compare_metrics(self, actual, golden):
        """Populate absolute error, relative error and error balance."""
        diff_value = TorchC.subtract(actual, golden)
        diff_abs = TorchC.abs(diff_value)
        golden_abs = TorchC.abs(golden)
        # Absolute error applies where |actual| is inside the small-value domain.
        self.absolute_err = TorchC.max(TorchC.where(
            TorchC.lt(TorchC.abs(actual), self.threshold.small_value), diff_abs, 0
        ))
        diff_rel = TorchC.div(diff_abs, golden_abs)
        # Relative error applies where |actual| is outside the small-value domain.
        self.relative_err = TorchC.max(TorchC.where(
            TorchC.ge(TorchC.abs(actual), self.threshold.small_value), diff_rel, 0
        ))
        # Error balance: mean signed error, normalized by |golden| where it is large.
        divided = TorchC.where(
            TorchC.ge(TorchC.abs(golden), self.threshold.small_value), golden_abs, 1
        )
        self.eb = TorchC.mean(TorchC.div(diff_value, divided))

    def compare_dict_seq(self, actual, golden):
        if len(actual) != len(golden):
            return False
        for key, golden_value in golden.items():
            # Bug fix: arguments were passed as (golden, actual), inverting the
            # roles in the tensor comparison below; keep (actual, golden) order.
            if not self.compare_seq(actual.get(key), golden_value):
                return False
        return True

    def compare_list_seq(self, actual, golden):
        if len(actual) != len(golden):
            return False
        for index_, golden_value in enumerate(golden):
            # Bug fix: same argument-order inversion as compare_dict_seq.
            if not self.compare_seq(actual[index_], golden_value):
                return False
        return True

    def compare_float_seq(self, actual, golden):
        return math.isclose(actual, golden)

    def compare_other_seq(self, actual, golden):
        return actual == golden

    @staticmethod
    def filter_overflow(tensor) -> int:
        """Count inf/nan elements in `tensor`."""
        inf_num = TorchC.sum(TorchC.isinf(tensor))
        nan_num = TorchC.sum(TorchC.isnan(tensor))
        return inf_num + nan_num

    @staticmethod
    def replace_inf_or_nan(tensor):
        """Replace non-finite elements with 1 (in place) so metrics stay finite."""
        finite_mask = TorchC.isfinite(tensor)
        inf_or_nan_mask = TorchC.logical_not(finite_mask)
        inf_or_nan_num = TorchC.sum(inf_or_nan_mask).item()
        if inf_or_nan_num > 0:
            tensor[inf_or_nan_mask] = 1
        return tensor
class FreeBenchmarkCheck(ABC):
    """Front-end of the free-benchmark tool: hooks module forward/backward,
    perturbs inputs via a layer and routes results to a handler."""

    def __init__(self, config) -> None:
        super().__init__()
        self.config = config
        # Fill unset config fields with their defaults.
        if self.config.pert_mode is None:
            self.config.pert_mode = PerturbationMode.IMPROVE_PRECISION
        if self.config.fuzz_level is None:
            self.config.fuzz_level = FuzzLevel.BASE_LEVEL
        if self.config.fuzz_device is None:
            self.config.fuzz_device = DeviceType.NPU
        self.current_iter = 0

    def update_iter(self, update_iter):
        self.current_iter = update_iter

    def if_fix(self):
        """True when the configured handler rewrites outputs in place."""
        return self.config.handler_type == HandlerType.FIX

    def pre_forward(self, name, module, data_processor, args, kwargs):
        """Backward mode only: cache inputs and hook their gradients."""
        if self.config.fuzz_stage != Const.BACKWARD:
            return
        origin_func = (
            module._slow_forward if torch._C._get_tracing_state() else module.forward
        )
        handler_params = make_handler_params(name, self.config, self.current_iter)
        grad_saver = GradSaver(origin_func, handler_params)
        grad_saver.kwargs = kwargs
        grad_saver.register_compare_func_for_inputs(args, data_processor)
        grad_saver.cache_backward_input(args)
        setattr(module, "grad_saver", grad_saver)

    def forward(self, name, module, args, kwargs, output):
        """Forward mode: perturb the input, rerun, and compare outputs."""
        if self.config.fuzz_stage != Const.FORWARD:
            return output, []
        origin_func = (
            module._slow_forward if torch._C._get_tracing_state() else module.forward
        )
        data_params = data_pre_deal(name, origin_func, args, kwargs)
        if data_params.valid_input_index == -1:
            # No perturbable input found; nothing to compare.
            return output, []
        data_params.original_result = output
        data_params.fuzz_stage = self.config.fuzz_stage

        perturbation = LayerFactory.create(
            name, self.config.fuzz_device, self.config.pert_mode
        )
        perturbation.handle(data_params)
        handler_params = make_handler_params(name, self.config, self.current_iter)
        result_handler = FuzzHandlerFactory.create(handler_params)
        result_handler.handle(data_params)
        return output, result_handler.get_unequal_rows()

    def backward(self, name, module, grad_output):
        """Backward mode: recompute input grads via vjp, plain and perturbed."""
        if self.config.fuzz_stage != Const.BACKWARD:
            return
        try:
            grad_saver = getattr(module, "grad_saver")
        except AttributeError:
            print_warn_log_rank_0(
                f"[atat] Free benchmark: get grad saver failed. api_name:{name}"
            )
            return

        _new_grad_output = grad_output
        try:
            need_grad_tensors, _inner_args = grad_saver.get_vjp_input()
            origin_grad_input = grad_saver.get_grad_input_from_vjp(
                tuple(need_grad_tensors), _new_grad_output, _inner_args
            )
            grad_saver.origin_grad_input = tuple([x.cpu() for x in origin_grad_input])
            grad_saver.calculate_perturbed_grad_input(
                _new_grad_output, need_grad_tensors, _inner_args
            )
        except Exception as e:
            # Best effort: never let the comparison break the backward pass.
            print_warn_log_rank_0(
                f"[atat] Free benchmark: grad vjp calculate failed. api_name:{name} error: {e}"
            )
            return


class BaseLayer(ABC):
    """Abstract interface every perturbation layer implements."""

    def __init__(self, api_name: str) -> None:
        self.api_name = api_name

    @abstractmethod
    def handle(self, params: DataParams) -> Any:
        pass
class LayerFactory:
    """Maps (device type, perturbation mode) to a perturbation-layer class."""

    layers = {
        DeviceType.NPU: {
            PerturbationMode.ADD_NOISE: AddNoiseLayer,
            PerturbationMode.CHANGE_VALUE: ChangeValueLayer,
            PerturbationMode.NO_CHANGE: NoChangeLayer,
            PerturbationMode.BIT_NOISE: BitNoiseLayer,
            PerturbationMode.IMPROVE_PRECISION: ImprovePrecisionLayer,
        },
        DeviceType.CPU: {PerturbationMode.TO_CPU: CpuLayer},
    }

    @staticmethod
    def create(api_name: str, device_type: str, mode: str):
        """Instantiate the layer for `mode` on `device_type`.

        Raises FreeBenchmarkException for an unknown device or mode.
        """
        device_layers = LayerFactory.layers.get(device_type)
        if not device_layers:
            raise FreeBenchmarkException(
                FreeBenchmarkException.UnsupportedType,
                f"无标杆工具不支持当前设备 {device_type}",
            )
        layer_cls = device_layers.get(mode)
        if not layer_cls:
            raise FreeBenchmarkException(
                FreeBenchmarkException.UnsupportedType,
                f"无标杆工具无法识别该扰动因子 {mode} on {device_type}",
            )
        return layer_cls(api_name)
class AddNoiseLayer(NpuBaseLayer):
    """Perturbs inputs by adding a tiny, dtype-dependent constant noise."""

    def _get_noise(self, tensor_obj):
        """Build a constant tensor filled with the dtype's perturbation value."""
        noise = TorchC.full(
            tensor_obj.shape,
            self.perturbed_value,
            device=str(tensor_obj.device),
            dtype=tensor_obj.dtype,
        )
        return noise

    def _check_details(self, tensor_obj):
        """Decide whether noise can safely be added to this tensor."""
        if not self.perturbed_value:
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.api_name}, "
                f"dtype unsupported. Cancel perturbation."
            )
            return False
        if tensor_obj.numel() == 0:
            print_warn_log_rank_0(
                f"[atat] Free benchmark: For {self.api_name}, tensor shape must > 0."
                f" Cancel adding noise."
            )
            return False
        abs_tol = ThresholdConfig.ABS_TOL_VALUE_DICT.get(
            tensor_obj.dtype, ThresholdConfig.NOISE_INPUT_LOWER_BOUND
        )
        try:
            max_val = TorchC.max(TorchC.abs(tensor_obj)).item()
        except Exception:
            # Some dtypes cannot be reduced directly; retry in float32.
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.api_name}, "
                f"when calculate maximum value, tensor is changed to float32."
            )
            max_val = TorchC.max(TorchC.abs(tensor_obj.to(torch.float32))).item()
        if max_val < abs_tol:
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.api_name}, "
                f"Maximum value is less than the minimum threshold. Cancel add noise."
            )
            return False
        return True

    def add_noise(self, tensor_obj):
        """Recursively add noise to tensors (containers handled element-wise).

        Noise is only added where |x| is large enough that the perturbation
        is representable (|x| > sqrt(perturbed_value)).
        """
        if isinstance(tensor_obj, torch.Tensor):
            self.perturbed_value = ThresholdConfig.PERTURBATION_VALUE_DICT.get(
                tensor_obj.dtype
            )
            if not self.pre_check(tensor_obj):
                return tensor_obj
            noise = self._get_noise(tensor_obj)
            result = TorchC.where(
                TorchC.gt(TorchC.abs(tensor_obj), self.perturbed_value**0.5),
                TorchC.add(noise, tensor_obj),
                tensor_obj,
            ).to(tensor_obj.dtype)
            self.is_added = True
            return result
        if isinstance(tensor_obj, dict):
            return {key: self.add_noise(value) for key, value in tensor_obj.items()}
        if isinstance(tensor_obj, (tuple, list)):
            return type(tensor_obj)([self.add_noise(value) for value in tensor_obj])
        return tensor_obj

    # NOTE: the previous `-> torch.Any` return annotation was removed — `Any`
    # is not part of torch's public API and the annotation is evaluated at
    # definition time.
    def handle(self, params: DataParams):
        """Perturb the selected input and rerun the op with it."""
        print_info_log_rank_0(
            f"[atat] Free benchmark: Perturbation is "
            f"{PerturbationMode.ADD_NOISE} of {self.api_name}."
        )
        params.perturbed_value = self.add_noise(params.args[params.valid_input_index])
        return self.perturbed_result(params)
class BitNoiseLayer(NpuBaseLayer):
    """Perturbs inputs by flipping the lowest mantissa bit of each element."""

    def __init__(self, api_name):
        super().__init__(api_name)
        self.bit_mode = TorchC.bitwise_xor
        self.bit_tail: int = 1  # mask with only the lowest bit set
        self.bit_type = None  # integer dtype matching the float width

    def _check_details(self, tensor_obj):
        """Decide whether a bit flip can safely be applied to this tensor."""
        if not self.bit_type:
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.api_name}, "
                f"dtype unsupported. Cancel perturbation."
            )
            return False
        if tensor_obj.numel() == 0:
            print_warn_log_rank_0(
                f"[atat] Free benchmark: For {self.api_name}, tensor shape must > 0"
                f" Cancel adding noise."
            )
            return False
        abs_tol = ThresholdConfig.ABS_TOL_VALUE_DICT.get(
            tensor_obj.dtype, ThresholdConfig.NOISE_INPUT_LOWER_BOUND
        )
        try:
            max_val = TorchC.max(TorchC.abs(tensor_obj)).item()
        except Exception:
            # Some dtypes cannot be reduced directly; retry in float32.
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.api_name}, "
                f"when calculate maximum value, tensor is changed to float32."
            )
            max_val = TorchC.max(TorchC.abs(tensor_obj.to(torch.float32))).item()
        if max_val < abs_tol:
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.api_name}, "
                f"Maximum value is less than the minimum threshold. Cancel add noise."
            )
            return False
        return True

    def _set_perturbation_bit(self, tensor_obj):
        """Pick the integer dtype whose width matches the float dtype."""
        bit_len_type = ThresholdConfig.PERTURBATION_BIT_DICT.get(tensor_obj.dtype)
        if bit_len_type:
            self.bit_tail = 1
            self.bit_type = bit_len_type

    def add_bit_noise(self, tensor_obj):
        """Recursively flip the lowest bit of normal-range tensor elements.

        Subnormal values are left untouched (flipping their low bit would be
        a relatively huge perturbation).
        """
        if isinstance(tensor_obj, torch.Tensor):
            self._set_perturbation_bit(tensor_obj)
            if not self.pre_check(tensor_obj):
                return tensor_obj
            sub_normal = torch.finfo(tensor_obj.dtype).smallest_normal
            noise = TorchC.full(
                tensor_obj.shape,
                self.bit_tail,
                device=tensor_obj.device,
                dtype=self.bit_type,
            )
            # Reinterpret the float bits as integers, xor, reinterpret back.
            result = tensor_obj.view(self.bit_type)
            result = TorchC.where(
                TorchC.gt(TorchC.abs(tensor_obj), sub_normal),
                self.bit_mode(result, noise),
                result,
            ).view(tensor_obj.dtype)
            self.is_added = True
            return result
        if isinstance(tensor_obj, dict):
            return {key: self.add_bit_noise(value) for key, value in tensor_obj.items()}
        if isinstance(tensor_obj, (tuple, list)):
            return type(tensor_obj)([self.add_bit_noise(value) for value in tensor_obj])
        return tensor_obj

    # NOTE: the previous `-> torch.Any` return annotation was removed — `Any`
    # is not part of torch's public API and the annotation is evaluated at
    # definition time.
    def handle(self, params: DataParams):
        """Perturb the selected input and rerun the op with it."""
        print_info_log_rank_0(
            f"[atat] Free benchmark: Perturbation is "
            f"{PerturbationMode.BIT_NOISE} of {self.api_name}."
        )
        params.perturbed_value = self.add_bit_noise(params.args[params.valid_input_index])
        return self.perturbed_result(params)
class ChangeValueLayer(NpuBaseLayer):
    """Perturbation layer that exchanges two elements of the input tensor."""

    def __init__(self, api_name):
        super().__init__(api_name)
        self.head: int = 0  # first position to swap
        self.tail: int = 1  # second position to swap

    def _check_details(self, tensor_obj):
        """Require at least two elements along dim 0 so a swap is possible."""
        if tensor_obj.size(0) < 2:
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.api_name}, "
                f"size 0 must greater than 1. Cancel change value."
            )
            return False
        return True

    def change_value(self, tensor_obj):
        """
        Recursively swap elements of tensors in ``tensor_obj``: for 1-D
        tensors positions [head] and [tail], otherwise [head][head] and
        [tail][tail]. Containers are processed element-wise.
        """
        if isinstance(tensor_obj, torch.Tensor) and self.pre_check(tensor_obj):
            new_tensor = TorchC.clone(tensor_obj)
            if new_tensor.ndim == 1:
                temp_first = TorchC.clone(new_tensor[self.head])
                temp_last = TorchC.clone(new_tensor[self.tail])
                # Bug fix: the original wrote each saved value back to its own
                # slot (head<-first, tail<-last), which was a no-op. Cross the
                # assignments so the two elements are actually exchanged.
                new_tensor[self.head] = temp_last
                new_tensor[self.tail] = temp_first
            else:
                temp_first = TorchC.clone(new_tensor[self.head][self.head])
                temp_last = TorchC.clone(new_tensor[self.tail][self.tail])
                new_tensor[self.head][self.head] = temp_last
                new_tensor[self.tail][self.tail] = temp_first

            self.is_added = True
            return new_tensor
        if isinstance(tensor_obj, dict):
            return {key: self.change_value(value) for key, value in tensor_obj.items()}
        if isinstance(tensor_obj, (tuple, list)):
            return type(tensor_obj)([self.change_value(value) for value in tensor_obj])
        return tensor_obj

    def handle(self, params: DataParams) -> torch.Any:
        """Perturb the selected input by value exchange and re-run the original op."""
        print_info_log_rank_0(
            f"[atat] Free benchmark: Perturbation is "
            f"{PerturbationMode.CHANGE_VALUE} of {self.api_name}."
        )
        params.perturbed_value = self.change_value(params.args[params.valid_input_index])
        return self.perturbed_result(params)
+ ) + params.perturbed_value = self.change_value(params.args[params.valid_input_index]) + return self.perturbed_result(params) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py new file mode 100644 index 0000000000000000000000000000000000000000..fb126972c6853b81d24db8138880601f9a3af21a --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/improve_precision.py @@ -0,0 +1,64 @@ +import torch +from atat.pytorch.free_benchmark import Const, print_info_log_rank_0 +from atat.pytorch.free_benchmark.common.constant import CommonField +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.common.enums import PerturbationMode +from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( + NpuBaseLayer, +) + + +class ImprovePrecisionLayer(NpuBaseLayer): + + def _set_improve_valus(self, inputs): + # TODO why + if inputs.dtype in [torch.float16, torch.bfloat16]: + self.perturbed_value = torch.float32 + + def _change_dtype(self, inputs): + if hasattr(inputs, CommonField.DEVICE): + device = inputs.device + if device is CommonField.META: + new_inputs = inputs.to( + device=CommonField.META, dtype=self.perturbed_value + ) + else: + new_inputs = inputs.to(dtype=self.perturbed_value).to(device) + else: + new_inputs = inputs.to(dtype=self.perturbed_value) + return new_inputs + + def improve_tensor_precision(self, tensor_obj): + if ( + isinstance(tensor_obj, torch.Tensor) + and torch.is_floating_point(tensor_obj) + and tensor_obj.dtype not in [torch.float32, torch.float64] + ): + self._set_improve_valus(tensor_obj) + tensor_obj = self._change_dtype(tensor_obj) + return tensor_obj + if isinstance(tensor_obj, dict): + return { + key: self.improve_tensor_precision(value) + for key, value in tensor_obj.items() + } + if isinstance(tensor_obj, (tuple, 
list)): + return type(tensor_obj)( + [self.improve_tensor_precision(value) for value in tensor_obj] + ) + return tensor_obj + + def handle(self, params: DataParams) -> torch.Any: + print_info_log_rank_0( + f"[atat] Free benchmark: Perturbation is " + f"{PerturbationMode.IMPROVE_PRECISION} of {self.api_name}." + ) + new_args = self.improve_tensor_precision(params.args) + if params.fuzz_stage == Const.BACKWARD: + new_kwargs = {} + else: + new_kwargs = self.improve_tensor_precision(params.kwargs) + if "inplace" in new_kwargs: + new_kwargs["inplace"] = False + params.perturbed_result = params.origin_func(*new_args, **new_kwargs) + return params.perturbed_result diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/no_change.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/no_change.py new file mode 100644 index 0000000000000000000000000000000000000000..3ec0f4445def3d80eb51d1f9eb875c60cde8f8e6 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/no_change.py @@ -0,0 +1,29 @@ +import torch +from atat.pytorch.free_benchmark import print_info_log_rank_0 +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.common.enums import PerturbationMode +from atat.pytorch.free_benchmark.perturbed_layers.npu.npu_base_layser import ( + NpuBaseLayer, +) + + +class NoChangeLayer(NpuBaseLayer): + + def no_change(self, tensor_obj): + """ + 交换张量首尾 + """ + self.is_added = True + return tensor_obj + + + def handle(self, params: DataParams) -> torch.Any: + """ + 对输入添加扰动并返回 + """ + print_info_log_rank_0( + f"[atat] Free benchmark: Perturbation is " + f"{PerturbationMode.NO_CHANGE} of {self.api_name}." 
+ ) + params.perturbed_value = self.no_change(params.args[params.valid_input_index]) + return self.perturbed_result(params) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py new file mode 100644 index 0000000000000000000000000000000000000000..ca502365e1b1b4ae0b37e2ecc48bff3b203f765c --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/npu/npu_base_layser.py @@ -0,0 +1,46 @@ +from abc import abstractmethod +from typing import Any +import torch +from atat.pytorch.free_benchmark.common.constant import CommonField, ThresholdConfig +from atat.pytorch.free_benchmark.common.utils import TorchC +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.perturbed_layers.base_layer import BaseLayer + + +class NpuBaseLayer(BaseLayer): + def __init__(self, api_name: str) -> None: + super().__init__(api_name) + self.perturbed_value = None # 扰动的元素 + self.is_added = False # 标记当前算子输入是否调整 + + @abstractmethod + def handle(self, params: DataParams) -> Any: + pass + + def _check_details(self, tensor_obj): + return True + + def pre_check(self, tensor_obj): + """ + 检查张量是否符合标准(float类型且最大值大于对应精度最小值) + """ + # 只针对第一个满足要求的添加扰动 + if self.is_added: + return False + if not torch.is_floating_point(tensor_obj): + return False + if not self._check_details(tensor_obj): + return False + return True + + @staticmethod + def perturbed_result(params: DataParams) -> Any: + args_front = params.args[: params.valid_input_index] + args_rear = params.args[params.valid_input_index + 1 :] + # 此处会将有inplace属性的算子换为非inplace + if "inplace" in params.kwargs: + params.kwargs["inplace"] = False + params.perturbed_result = params.origin_func( + *args_front, params.perturbed_value, *args_rear, **params.kwargs + ) + return params.perturbed_result diff --git 
a/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/run_cpu.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/run_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..387f9447fd29276e3c43bcdabf0e8a3a05b8ecec --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/perturbed_layers/run_cpu.py @@ -0,0 +1,19 @@ +import torch +from atat.pytorch.free_benchmark import print_info_log_rank_0 +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.common.utils import Tools +from atat.pytorch.free_benchmark.common.enums import DeviceType +from atat.pytorch.free_benchmark.perturbed_layers.base_layer import BaseLayer + + +class CpuLayer(BaseLayer): + + def handle(self, params: DataParams) -> torch.Any: + + print_info_log_rank_0( + f"[atat] Free benchmark: Perturbation is to_cpu of {self.api_name}." + ) + new_args = Tools.convert_device_and_dtype(params.args, DeviceType.CPU, change_dtype=True) + new_kwargs = Tools.convert_device_and_dtype(params.kwargs, DeviceType.CPU, change_dtype=True) + params.perturbed_result = params.origin_func(*new_args, **new_kwargs) + return params.perturbed_result diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/__init__.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/base_handler.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/base_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..1d59ef9fc3adc2f90a7145d825ce597e209758e4 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/base_handler.py @@ -0,0 +1,213 @@ +import math +from abc import ABC, abstractmethod +from typing import Any, Optional, Tuple + +import 
class FuzzHandler(ABC):
    """Base handler comparing an op's original output with its perturbed output."""

    def __init__(self, params: HandlerParams) -> None:
        self.params = params
        self.unequal_rows = []

    @staticmethod
    def pre_process(origin_ouput, perturbed_output):
        """
        Align outputs before comparison: unwrap named-tuple results (e.g.
        ``torch.max``'s values/indices), cast the original output to the
        perturbed output's dtype/device, and pick the absolute tolerance.
        """
        if (
            isinstance(origin_ouput, tuple)
            and hasattr(origin_ouput, "values")
            and hasattr(origin_ouput, "indices")
        ):
            origin_ouput = origin_ouput.values
            perturbed_output = perturbed_output.values
        if hasattr(perturbed_output, "dtype"):
            abs_tol = ThresholdConfig.ABS_TOL_VALUE_DICT.get(perturbed_output.dtype)
        else:
            abs_tol = FuzzThreshold.F32_THD.value
        return (
            origin_ouput.to(perturbed_output.dtype).to(perturbed_output.device),
            perturbed_output,
            abs_tol,
        )

    def get_ratio_from_specific_norm(
        self, origin_output, perturbed_output, norm_type, abs_tol
    ):
        """Dispatch ratio computation by norm type (only L-inf is implemented)."""
        if norm_type == NormType.ENDLESS_NORM:
            return self.get_endless_norm(origin_output, perturbed_output, abs_tol)
        return ThresholdConfig.COMP_CONSISTENT

    @staticmethod
    def convert_overflow_ratio_to_consistent(ratio):
        """Treat NaN/Inf ratios (overflow) as fully consistent."""
        if math.isnan(ratio) or math.isinf(ratio):
            return ThresholdConfig.COMP_CONSISTENT
        return ratio

    def get_endless_norm(self, origin_output, perturbed_output, abs_tol):
        """
        Max elementwise ratio between the two outputs in both directions;
        returns SYMBOL_FLIPPING when some element changed sign.
        """
        try:
            ratio_tensor1 = TorchC.where(
                TorchC.gt(TorchC.abs(perturbed_output), abs_tol),
                TorchC.div(
                    TorchC.abs(origin_output),
                    TorchC.add(TorchC.abs(perturbed_output), abs_tol),
                ),
                1,
            )
            ratio_tensor2 = TorchC.where(
                TorchC.gt(TorchC.abs(origin_output), abs_tol),
                TorchC.div(
                    TorchC.abs(perturbed_output),
                    TorchC.add(TorchC.abs(origin_output), abs_tol),
                ),
                1,
            )
        except Exception:
            # Bug fix: this was a bare ``except:`` which also swallows
            # KeyboardInterrupt/SystemExit; narrowed to Exception.
            # Fallback: retry in float32 for dtypes the ops reject.
            ratio_tensor1 = TorchC.where(
                TorchC.gt(TorchC.abs(perturbed_output.to(torch.float32)), abs_tol),
                TorchC.div(
                    origin_output.to(torch.float32), perturbed_output.to(torch.float32)
                ),
                1,
            )
            ratio_tensor2 = TorchC.where(
                TorchC.gt(TorchC.abs(origin_output.to(torch.float32)), abs_tol),
                TorchC.div(
                    perturbed_output.to(torch.float32), origin_output.to(torch.float32)
                ),
                1,
            )
        norm1 = self.convert_overflow_ratio_to_consistent(
            TorchC.max(ratio_tensor1).item()
        )
        norm2 = self.convert_overflow_ratio_to_consistent(
            TorchC.max(ratio_tensor2).item()
        )
        norm3 = self.convert_overflow_ratio_to_consistent(
            TorchC.min(ratio_tensor1).item()
        )
        # A negative minimum ratio means the sign flipped somewhere.
        if norm3 < 0:
            ratio = ThresholdConfig.SYMBOL_FLIPPING
        else:
            ratio = max(norm1, norm2)
        return ratio

    def ratio_calculate(self, origin_output, perturbed_output, norm_type) -> float:
        """Pre-process both outputs and compute their ratio; COMP_NAN on failure."""
        try:
            origin_output, perturbed_output, abs_tol = self.pre_process(
                origin_output, perturbed_output
            )
        except Exception as e:
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.params.api_name}, "
                f"when computing ratio,"
                f" y1 or y2 dtype is not supported {e}"
            )
            return ThresholdConfig.COMP_NAN
        if self.params.fuzz_stage == Const.BACKWARD:
            abs_tol = ThresholdConfig.BACKWARD_OUTPUT_LOWER_BOUND
        else:
            abs_tol = abs_tol**0.5
        return self.get_ratio_from_specific_norm(
            origin_output, perturbed_output, norm_type, abs_tol
        )

    @abstractmethod
    def get_threshold(self, dtype):
        """Return the consistency threshold for ``dtype``."""
        pass

    def _get_default_threshold(self, dtype):
        # NO_CHANGE must reproduce exactly, so the threshold is "consistent".
        if self.params.pert_mode == PerturbationMode.NO_CHANGE:
            threshold = ThresholdConfig.COMP_CONSISTENT
        else:
            threshold = ThresholdConfig.DTYPE_PER_THD.get(
                dtype, ThresholdConfig.DTYPE_PER_THD.get(torch.float32)
            )
        return threshold

    def npu_compare(
        self, origin_output, perturbed_output
    ) -> Tuple[bool, Optional[float]]:
        """Compare a single output pair; returns (is_consistent, ratio)."""
        if isinstance(perturbed_output, int):
            return origin_output == perturbed_output, None
        elif isinstance(perturbed_output, float):
            return (
                math.isclose(origin_output, perturbed_output),
                origin_output / perturbed_output,
            )
        elif not isinstance(perturbed_output, torch.Tensor):
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.params.api_name} "
                f"The compare for output type {type(perturbed_output)} is not supported"
            )
            # Bug fix: the original fell through after this warning and then
            # dereferenced ``origin_output.dtype`` on an unsupported type;
            # report the pair as inconsistent instead of crashing.
            return False, None

        threshold = self.get_threshold(origin_output.dtype)
        ratio = self.ratio_calculate(
            origin_output, perturbed_output, norm_type=NormType.ENDLESS_NORM
        )
        if ratio == ThresholdConfig.SYMBOL_FLIPPING:
            is_consistent = False
        else:
            is_consistent = threshold >= ratio >= 1 / threshold
        return is_consistent, ratio

    def cmp_output_npu(self, data_params: DataParams):
        """Compare the original vs perturbed result(s) and record unequal rows."""
        npu_consistent = True
        max_fuzz_ratio = 0
        try:
            if isinstance(data_params.original_result, torch.Tensor):
                is_consistent, ratio = self.npu_compare(
                    data_params.original_result, data_params.perturbed_result
                )
                npu_consistent = is_consistent
                max_fuzz_ratio = (
                    max_fuzz_ratio if ratio is None else max(max_fuzz_ratio, ratio)
                )
                data_params.is_consistent = is_consistent and data_params.is_consistent
                if not is_consistent and data_params.grad_unequal_flag:
                    self.unequal_rows.append(
                        make_unequal_row(data_params, self.params, ratio=ratio)
                    )
            elif isinstance(data_params.original_result, (list, tuple)):
                for index_, origin_item in enumerate(data_params.original_result):
                    is_consistent, ratio = self.npu_compare(
                        origin_item, data_params.perturbed_result[index_]
                    )
                    npu_consistent = npu_consistent and is_consistent
                    max_fuzz_ratio = (
                        max_fuzz_ratio if ratio is None else max(max_fuzz_ratio, ratio)
                    )
                    data_params.is_consistent = (
                        is_consistent and data_params.is_consistent
                    )
                    if not is_consistent and data_params.grad_unequal_flag:
                        self.unequal_rows.append(
                            make_unequal_row(
                                data_params, self.params, ratio=ratio, index=index_
                            )
                        )
        except Exception as e:
            # Fixed typo in log message ("campare").
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.params.api_name}, "
                f"when comparing the result, exception raised: {e}"
            )
        return npu_consistent, max_fuzz_ratio

    @abstractmethod
    def handle(self, data_params: DataParams) -> Any:
        pass

    def get_unequal_rows(self):
        return self.unequal_rows
For {self.params.api_name}, " + f"when campare the result exception raise {e}" + ) + return npu_consistent, max_fuzz_ratio + + @abstractmethod + def handle(self, data_params: DataParams) -> Any: + pass + + def get_unequal_rows(self): + return self.unequal_rows diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/check_handler.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/check_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..2f590855f1b96e0a6475c87c9b3dfdafd0288332 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/check_handler.py @@ -0,0 +1,41 @@ +from typing import Any + +import torch +from atat.pytorch.free_benchmark import print_warn_log_rank_0 +from atat.pytorch.free_benchmark.common.enums import DeviceType +from atat.pytorch.free_benchmark.compare.single_benchmark import SingleCompare +from atat.pytorch.free_benchmark.common.params import DataParams, make_unequal_row +from atat.pytorch.free_benchmark.common.utils import Tools +from atat.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler + + +class CheckerHandler(FuzzHandler): + @staticmethod + def other_compare(self, data_params: DataParams) -> bool: + is_consistent = SingleCompare().compare_seq( + data_params.original_result, data_params.perturbed_result + ) + if not is_consistent: + self.unequal_rows.append( + make_unequal_row(data_params, self.params) + ) + + def get_threshold(self, dtype): + return self._get_default_threshold(dtype) + + def handle(self, data_params: DataParams) -> Any: + if isinstance(data_params.perturbed_result, bool) or not Tools.is_float_tensor( + data_params.perturbed_result + ): + return data_params.original_result + try: + if self.params.fuzz_device == DeviceType.NPU: + self.cmp_output_npu(data_params) + else: + self.other_compare(data_params) + except Exception as e: + print_warn_log_rank_0( + f"[atat] Free Benchmark: For {self.params.api_name}, " 
+ f"when campare the result exception raise {e}" + ) + return data_params.original_result diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/fix_handler.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/fix_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..789e2653aa0eafc3619fbe3bd192b49dee643a1d --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/fix_handler.py @@ -0,0 +1,24 @@ +from typing import Any + +from atat.pytorch.free_benchmark.common.params import DataParams +from atat.pytorch.free_benchmark.common.utils import Tools +from atat.pytorch.free_benchmark.result_handlers.base_handler import FuzzHandler +from atat.pytorch.free_benchmark import print_warn_log_rank_0 + + +class FixHandler(FuzzHandler): + + def get_threshold(self, dtype): + return self._get_default_threshold(dtype) + + def handle(self, data_params: DataParams) -> Any: + try: + return Tools.convert_fuzz_output_to_origin( + data_params.original_result, data_params.perturbed_result + ) + except Exception as e: + print_warn_log_rank_0( + f"[atat] Free Benchmark: For {self.params.api_name} " + f"Fix output failed. 
" + ) + return data_params.original_result \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/handler_factory.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/handler_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..50f791d81eeb25f8a50a6b4044dbc8e6e09e6a1e --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/handler_factory.py @@ -0,0 +1,32 @@ +from atat.pytorch.free_benchmark import FreeBenchmarkException +from atat.pytorch.free_benchmark.common.constant import PreheatConfig +from atat.pytorch.free_benchmark.common.utils import Tools +from atat.pytorch.free_benchmark.common.enums import HandlerType +from atat.pytorch.free_benchmark.common.params import HandlerParams +from atat.pytorch.free_benchmark.result_handlers.check_handler import CheckerHandler +from atat.pytorch.free_benchmark.result_handlers.preheat_handler import PreheatHandler +from atat.pytorch.free_benchmark.result_handlers.fix_handler import FixHandler + + +class FuzzHandlerFactory: + + result_handlers = { + HandlerType.CHECK: CheckerHandler, + HandlerType.FIX: FixHandler, + HandlerType.PREHEAT: PreheatHandler, + } + + @staticmethod + def create(params: HandlerParams): + if_preheat = params.preheat_config.get(PreheatConfig.IF_PREHEAT) + if not if_preheat: + handler = FuzzHandlerFactory.result_handlers.get(params.handler_type) + else: + handler = FuzzHandlerFactory.result_handlers.get(HandlerType.PREHEAT) + # TODO + if not handler: + raise FreeBenchmarkException( + FreeBenchmarkException.UnsupportedType, + f"无标杆工具支持 [ {HandlerType.CHECK}、{HandlerType.FIX}] 形式", + ) + return handler(params) diff --git a/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/preheat_handler.py b/debug/accuracy_tools/atat/pytorch/free_benchmark/result_handlers/preheat_handler.py new file mode 100644 index 
class PreheatHandler(FuzzHandler):
    """Handler that calibrates per-API thresholds by sampling CPU comparisons."""

    def __init__(self, params: HandlerParams) -> None:
        super().__init__(params)
        self.pure_name = Tools.get_pure_api_name(self.params.api_name)

    def get_threshold(self, dtype):
        return preheat_counter.get_api_thd(self.pure_name, dtype)

    def _is_take_a_sample(self) -> bool:
        """True when the current call of this API should be sampled."""
        need_sample_set = self._get_need_sample_set()
        curr_called_seq = preheat_counter.get_api_called_time(self.pure_name)
        res = curr_called_seq in need_sample_set
        if res:
            total_count = preheat_counter.get_one_step_used_api(self.pure_name)
            print_info_log_rank_0(
                f"[atat] Free benchmark: preheat sample in step{self.params.step}"
                f"api_name {self.params.api_name}, "
                f"curr_called_seq: {curr_called_seq}/{total_count}"
            )
            preheat_counter.add_api_sample_time(self.pure_name)
        return res

    def _get_sample_count_per_step(self) -> int:
        """
        Number of samples to take in each step.

        Bug fix: the annotation said ``-> set`` although an int is returned.
        """
        total_count = preheat_counter.get_one_step_used_api(self.pure_name)
        preheat_step = self.params.preheat_config.get("preheat_step")
        max_sample = self.params.preheat_config.get("max_sample")
        return min(math.ceil(total_count / preheat_step), max_sample)

    def _get_need_sample_set(self):
        """Call indices (1-based) of this API to sample in the current step."""
        total_count = preheat_counter.get_one_step_used_api(self.pure_name)
        sample_count_per_step = self._get_sample_count_per_step()
        need_sample_set = set()
        # Renamed misspelled local ``prehead_step``.
        preheat_step = self.params.preheat_config.get("preheat_step")
        for i in range(1, sample_count_per_step + 1):
            count = (preheat_step * (i - 1) + self.params.step) % total_count
            if count == 0:
                count = total_count
            need_sample_set.add(count)
        return need_sample_set

    def compare_npu_and_cpu(self, data_params: DataParams):
        """Replay the op on CPU and compare against the original result."""
        args = Tools.convert_device_and_dtype(
            data_params.args, DeviceType.CPU, change_dtype=True
        )
        kwargs = Tools.convert_device_and_dtype(
            data_params.kwargs, DeviceType.CPU, change_dtype=True
        )
        cpu_result = data_params.origin_func(*args, **kwargs)
        return SingleCompare().compare_seq(data_params.original_result, cpu_result)

    def _need_adjust_threshold(self) -> bool:
        sample_count_per_step = self._get_sample_count_per_step()
        sampled_time = preheat_counter.get_api_sample_time(self.pure_name)
        return sampled_time >= sample_count_per_step

    def _adjust_threshold_for_dtype(self, dtype_str, compare_result):
        """Derive a new threshold from (ratio, cpu_consistent) sample pairs."""
        con_ratio = [ratio for ratio, is_consistent in compare_result if is_consistent]
        incon_ratio = [
            ratio for ratio, is_consistent in compare_result if not is_consistent
        ]
        old_thd = preheat_counter.get_api_thd(self.pure_name, dtype_str)
        new_thd = old_thd
        # Both positive and negative samples exist.
        if con_ratio and incon_ratio:
            if min(incon_ratio) > max(con_ratio):
                new_thd = min(min(incon_ratio), old_thd)
                preheat_counter.set_api_preheat(self.pure_name, dtype_str, is_preheat=False)
        elif con_ratio:
            # Only consistent samples: tighten or loosen around the old threshold.
            if max(con_ratio) > old_thd:
                new_thd = 1 + ((old_thd - 1) * ThresholdConfig.API_THD_STEP)
            else:
                new_thd = 1 + ((old_thd - 1) / ThresholdConfig.API_THD_STEP)
        else:
            new_thd = min(min(incon_ratio), old_thd)
            preheat_counter.set_api_preheat(self.pure_name, dtype_str, is_preheat=False)
        return new_thd

    def _adjust_threshold(self):
        for dtype_str, compare_result in preheat_counter.preheat_record[
            self.pure_name
        ].items():
            new_thd = self._adjust_threshold_for_dtype(dtype_str, compare_result)
            threshold = self._get_default_threshold(
                preheat_counter.dtype_map.get(dtype_str)
            )
            preheat_counter.update_api_thd(
                self.pure_name, dtype_str, new_thd, threshold
            )

    def preheat(self, max_fuzz_ratio, cpu_consistent, first_dtype):
        """Record this sample's (ratio, cpu_consistent) and adjust thresholds."""
        preheat_counter.update_preheat_record(
            self.pure_name,
            first_dtype,
            (max_fuzz_ratio, cpu_consistent),
        )
        if self._need_adjust_threshold():
            self._adjust_threshold()

    def handle(self, data_params: DataParams) -> Any:
        """Compare outputs and, during preheat steps, sample CPU comparisons."""
        if isinstance(data_params.perturbed_result, bool) or not Tools.is_float_tensor(
            data_params.perturbed_result
        ):
            return data_params.original_result

        # Step 0 only counts how often each API is used.
        if self.params.step == 0:
            preheat_counter.add_one_step_used_api(self.pure_name)
            return data_params.original_result

        npu_consistent, max_fuzz_ratio = self.cmp_output_npu(data_params)
        data_params.is_consistent = npu_consistent

        preheat_counter.check_step(self.params.step)

        if self.params.preheat_config.get("preheat_step") <= self.params.step:
            return data_params.original_result

        if not data_params.grad_unequal_flag:
            data_params.grad_unequal_flag = True
            data_params.is_consistent = False
            return data_params.original_result
        preheat_counter.add_api_called_time(self.pure_name)

        if not self._is_take_a_sample():
            return data_params.original_result

        cpu_consistent = True
        try:
            cpu_consistent = self.compare_npu_and_cpu(data_params)
        except Exception as e:
            # Fixed typo in log message ("campare").
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.params.api_name}, "
                f"when comparing to cpu, exception raised: {e}"
            )
        try:
            first_dtype = Tools.get_first_tensor_dtype(data_params.perturbed_result)
        except RuntimeError:
            print_warn_log_rank_0(
                f"[atat] Free Benchmark: For {self.params.api_name}, "
                f"the output sequence does not contain tensors."
            )
            # Bug fix: ``first_dtype`` was unbound after this warning and the
            # original fell through to use it (NameError); bail out instead.
            return data_params.original_result
        if preheat_counter.get_api_preheat(self.pure_name, str(first_dtype)):
            self.preheat(max_fuzz_ratio, cpu_consistent, first_dtype)

        return data_params.original_result
def build_collect_data(config):
    """Factory: build a DataCollector for the given debugger config."""
    return DataCollector(config)


class DataCollector:
    """Routes data produced by the data processor into the JSON writer,
    filtered by scope and process id."""

    # Known task names.
    overflow_task = "overflow_check"
    tensor_task = "tensor"
    freebenchmark_task = "free_benchmark"
    tasks_need_tensor_data = [overflow_task, tensor_task, freebenchmark_task]
    # At this level no module construct hierarchy is recorded.
    level_without_construct = "L1"

    def __init__(self, config):
        self.config = config
        self.data_writer = DataWriter()
        self.data_processor = build_data_processor(config, self.data_writer)
        self.module_count = {}
        # Free-benchmark restricts collection to the APIs named in config.list.
        if config.task == DataCollector.freebenchmark_task:
            self.scope = build_scope(ListScope, self.config.scope, self.config.list)
        else:
            self.scope = build_scope(None, self.config.scope, self.config.list)

    def if_return_forward_new_output(self):
        """Delegate: whether the processor produced a replacement forward output."""
        return self.data_processor.if_return_forward_new_output()

    def get_forward_new_output(self):
        """Delegate: fetch the replacement forward output from the processor."""
        return self.data_processor.get_forward_new_output()

    @property
    def dump_data_dir(self):
        return self.data_writer.dump_tensor_data_dir

    @property
    def dump_file_path(self):
        return self.data_writer.dump_file_path

    def visit_and_clear_overflow_status(self, api_or_module_name):
        self.data_processor.visit_and_clear_overflow_status(api_or_module_name)

    def write_json(self):
        self.data_writer.write_json()

    def update_data(self, data_info, msg=''):
        """Forward data_info to the writer; for the overflow task only when an
        overflow was actually detected. Returns the (possibly extended) msg."""
        if self.config.task == DataProcessor.overflow:
            if self.data_processor.has_overflow:
                self.data_writer.update_data(data_info)
                msg += "Overflow detected."
            else:
                msg += "No Overflow, OK."
        else:
            self.data_writer.update_data(data_info)
        return msg

    @staticmethod
    def check_scope_and_pid(scope, name, pid):
        # Collect only when the name is in scope and we are in the original process
        # (guards against forked dataloader workers).
        return (not scope or scope.check(name)) and pid == os.getpid()

    @staticmethod
    def is_inplace(module):
        return getattr(module, "op_is_inplace", False)

    def pre_forward(self, name, module_type, module, pid, module_input_output):
        """Pre-forward hook: register the backward counterpart and, for inplace
        ops, dump the inputs before they are overwritten."""
        backward_name = name.replace("forward", "backward")
        if self.check_scope_and_pid(self.scope, backward_name, pid):
            self.data_processor.analyze_pre_forward(backward_name, module, module_input_output)
        if not self.is_inplace(module):
            return
        print_info_log(f"API {name} is inplace.")
        if self.check_scope_and_pid(self.scope, name, pid):
            data_info = self.data_processor.analyze_pre_forward_inplace(name, module_input_output)
            self.update_data(data_info)

    def __call__(self, name_template, module_type, module, pid, module_input_output):
        """Main hook entry: analyze one forward/backward event and record it."""
        if module_type == BaseScope.Module_Type_Module:
            name = module.mindstudio_reserved_name
        else:
            name = name_template

        # Record the module hierarchy unless the level skips construct info.
        if self.config.level != DataCollector.level_without_construct:
            self.data_writer.update_construct({name: ModuleProcesser.api_parent_node})
            self.data_writer.update_construct(ModuleProcesser.module_node)
        if not self.check_scope_and_pid(self.scope, name, pid):
            return
        msg = f"Calibrator is collecting data on {name}. "
        if "forward" in name:
            if not self.is_inplace(module):
                data_info = self.data_processor.analyze_forward(name, module, module_input_output)
            else:
                data_info = self.data_processor.analyze_forward_inplace(name, module_input_output)
            self.data_writer.update_stack(self.data_processor.analyze_api_call_stack(name))
        else:
            data_info = self.data_processor.analyze_backward(name, module, module_input_output)
        if data_info:
            msg = self.update_data(data_info, msg)
            print_info_log(msg)
        self.data_writer.flush_data_when_buffer_is_full()

    def module_count_func(self, name, name_template):
        """Return the invocation index for a module: forward calls push indices
        onto a per-module stack, backward calls pop them (so each backward is
        matched to its forward); 'abnormal' when there is nothing to pop."""
        module_name = name.split(Const.SEP)[-3]
        if "forward" in name_template:
            if module_name not in self.module_count:
                self.module_count[module_name] = [0, [0]]
            else:
                # Drop a stale top-of-stack entry before pushing the new index.
                if self.module_count[module_name][-1] and \
                        self.module_count[module_name][0] != self.module_count[module_name][-1][-1]:
                    self.module_count[module_name][-1].pop()
                self.module_count[module_name][0] += 1
                self.module_count[module_name][-1].append(self.module_count[module_name][0])
            index = self.module_count[module_name][0]
        else:
            backward_stack = self.module_count[module_name][-1] if module_name in self.module_count else []
            if not backward_stack:
                index = "abnormal"
            else:
                index = backward_stack.pop()
        return index

    def update_dump_paths(self, *args):
        self.data_writer.update_dump_paths(*args)
        self.data_writer.initialize_json_file(task=self.config.task, level=self.config.level)

    def update_iter(self, current_iter):
        self.data_processor.update_iter(current_iter)
def build_data_processor(config, data_writer):
    """Factory: map ``config.task`` to the matching DataProcessor implementation.

    Raises:
        MsaccException: when the task name is not one of the supported four.
    """
    task_to_processor = {
        DataProcessor.full: FullTensorDataProcessor,
        DataProcessor.summary: DataProcessor,
        DataProcessor.overflow: OverflowTensorDataProcessor,
        DataProcessor.free_benchmark: FreeBenchmarkDataProcessor,
    }
    processor_cls = task_to_processor.get(config.task)
    if processor_cls is None:
        raise MsaccException(MsaccException.INVALID_PARAM_ERROR,
                             "task should be in [{}, {}, {}, {}]".format(
                                 DataProcessor.full,
                                 DataProcessor.summary,
                                 DataProcessor.overflow,
                                 DataProcessor.free_benchmark
                             ))
    return processor_cls(config, data_writer)


@dataclass
class ModuleForwardInputsOutputs:
    """Bundle of one forward call's positional args, keyword args and output."""
    args: Optional[Tuple]
    kwargs: Optional[Dict]
    output: Union[Tuple, torch.Tensor]

    @property
    def args_tuple(self):
        # Normalize a single positional argument to a 1-tuple.
        return self.args if isinstance(self.args, tuple) else (self.args,)

    @property
    def output_tuple(self):
        # Normalize a single output value to a 1-tuple.
        return self.output if isinstance(self.output, tuple) else (self.output,)

    def concat_args_and_kwargs(self):
        """Return positional args followed by kwarg values as one tuple."""
        return self.args + tuple(self.kwargs.values())


@dataclass
class ModuleBackwardInputsOutputs:
    """Bundle of one backward call's incoming and outgoing gradients."""
    grad_output: Optional[Tuple]
    grad_input: Optional[Tuple]

    @property
    def grad_input_tuple(self):
        return self.grad_input if isinstance(self.grad_input, tuple) else (self.grad_input,)

    @property
    def grad_output_tuple(self):
        return self.grad_output if isinstance(self.grad_output, tuple) else (self.grad_output,)
# NOTE(review): static serialization helpers of DataProcessor (class header is in
# the previous hunk). __init__ and the forward-output override flags are unchanged.

@staticmethod
def get_md5_for_tensor(x):
    """Return an 8-hex-char CRC32 digest of the tensor's raw bytes.

    bfloat16 is upcast to float32 first because numpy cannot represent it.
    """
    if x.dtype == torch.bfloat16:
        x = x.float()
    tensor_bytes = x.cpu().detach().numpy().tobytes()
    crc32_hash = zlib.crc32(tensor_bytes)
    return f"{crc32_hash:08x}"

@staticmethod
def analyze_device_in_kwargs(element):
    """Serialize a device kwarg (str or torch.device) to {'type', 'value'}.

    Fixed: torch.device always exposes an ``index`` attribute, but it is None
    for a bare device such as torch.device("cpu"); the old hasattr() check
    produced the bogus value "cpu:None" in that case.
    """
    single_arg = {}
    single_arg.update({'type': "torch.device"})
    if not isinstance(element, str):
        if getattr(element, "index", None) is not None:
            device_value = element.type + ":" + str(element.index)
        else:
            device_value = element.type
        single_arg.update({"value": device_value})
    else:
        single_arg.update({"value": element})
    return single_arg

@staticmethod
def analyze_dtype_in_kwargs(element):
    """Serialize a dtype kwarg to {'type': 'torch.dtype', 'value': str(dtype)}."""
    single_arg = {}
    single_arg.update({"type": "torch.dtype"})
    single_arg.update({"value": str(element)})
    return single_arg

@staticmethod
def _convert_numpy_to_builtin(arg):
    """If ``arg`` is a numpy scalar, return (builtin value, numpy type name);
    otherwise return (arg, '') unchanged.

    Fixed for NumPy 2.0 compatibility: ``np.unicode_`` was removed (it was an
    alias of ``np.str_``, which is already listed), and ``np.byte`` is an
    alias of ``np.int8`` — the correct byte-string scalar type is
    ``np.bytes_``.
    """
    type_mapping = {
        np.integer: int,
        np.floating: float,
        np.bool_: bool,
        np.complexfloating: complex,
        np.str_: str,
        np.bytes_: bytes,
    }
    for numpy_type, builtin_type in type_mapping.items():
        if isinstance(arg, numpy_type):
            return builtin_type(arg), type(arg).__name__
    return arg, ''
# NOTE(review): methods of DataProcessor (class header is in the previous hunk).

def update_iter(self, current_iter):
    """Record the current training iteration."""
    self.current_iter = current_iter

def visit_and_clear_overflow_status(self, api_or_module_name):
    """Reset the overflow flag when a new API/module starts being processed."""
    if self.current_api_or_module_name != api_or_module_name:
        self.current_api_or_module_name = api_or_module_name
        self.has_overflow = False

def _analyze_numpy(self, value, numpy_type):
    """Serialize an already-converted numpy scalar to {'type', 'value'}."""
    single_arg = {}
    single_arg.update({"type": numpy_type})
    single_arg.update({"value": value})
    return single_arg

def get_stat_info(self, data):
    """Return (max, min, mean, norm) statistics for a tensor.

    Empty and boolean tensors only yield the statistics that make sense for
    them (the rest are None); 0-dim tensors use their scalar value for all
    four. The torch._C._VariableFunctionsClass calls deliberately bypass the
    hooked public torch API.

    Fixed: meta tensors carry no data and previously returned a bare None,
    which made callers that unpack four values raise TypeError; they now get
    (None, None, None, None).
    """
    if data.is_meta:
        return None, None, None, None
    data_clone = data.detach()
    if data_clone.numel() == 0:
        tensor_max = None
        tensor_min = None
        tensor_mean = None
        tensor_norm = None
    elif data_clone.dtype == torch.bool:
        # For bool tensors: "max" = any True present, "min" = all True.
        tensor_max = True in data_clone
        tensor_min = False not in data_clone
        tensor_mean = None
        tensor_norm = None
    elif not len(data_clone.shape):
        # 0-dim scalar tensor: all statistics equal the value itself.
        tensor_max = data_clone.item()
        tensor_min = tensor_max
        tensor_mean = tensor_max
        tensor_norm = tensor_max
    else:
        if not data_clone.is_floating_point():
            data_clone = data_clone.float()
        tensor_max = torch._C._VariableFunctionsClass.max(data_clone).item()
        tensor_min = torch._C._VariableFunctionsClass.min(data_clone).item()
        tensor_mean = torch._C._VariableFunctionsClass.mean(data_clone).item()
        tensor_norm = torch._C._VariableFunctionsClass.norm(data_clone).item()

    return tensor_max, tensor_min, tensor_mean, tensor_norm

def _analyze_builtin(self, arg):
    """Serialize a builtin scalar or slice to {'type', 'value'}."""
    single_arg = {}
    if isinstance(arg, slice):
        single_arg.update({"type": "slice"})
        single_arg.update({"value": [arg.start, arg.stop, arg.step]})
    else:
        single_arg.update({"type": type(arg).__name__})
        single_arg.update({"value": arg})
    return single_arg
@staticmethod
def handle_tensor_extremum_nan_inf(data_clone, operator):
    """Extremum ('max'/'min') of a tensor containing NaN/Inf, ignoring bad values.

    Preference order: finite values if any exist, otherwise the non-NaN
    (i.e. +/-Inf) values; an all-NaN tensor yields NaN. The
    torch._C._VariableFunctionsClass calls deliberately bypass the hooked
    public torch API.
    """
    variable_funcs = torch._C._VariableFunctionsClass
    nan_mask = variable_funcs.isnan(data_clone)
    if int(variable_funcs.sum(nan_mask)) == data_clone.numel():
        return float('nan')
    finite_mask = variable_funcs.isfinite(data_clone)
    if int(variable_funcs.sum(finite_mask)) > 0:
        candidates = data_clone[finite_mask]
    else:
        candidates = data_clone[~nan_mask]
    if operator == 'max':
        return variable_funcs.max(candidates).item()
    return variable_funcs.min(candidates).item()
# NOTE(review): continuation of DataProcessor, then FullTensorDataProcessor and
# the head of OverflowTensorDataProcessor.

def _analyze_tensor(self, tensor, suffix):
    """Build the JSON summary (type/dtype/shape/statistics) for one tensor.

    Feeds the overflow detector and, when summary_mode == "md5", appends a
    CRC32 digest of the raw bytes.
    """
    tensor_max, tensor_min, tensor_mean, tensor_norm = self.get_stat_info(tensor)

    tensor_json = {}
    tensor_json.update({'type': 'torch.Tensor'})
    tensor_json.update({'dtype': str(tensor.dtype)})
    tensor_json.update({"shape": tensor.shape})
    tensor_json.update({"Max": tensor_max})
    tensor_json.update({"Min": tensor_min})
    # Must run after Max/Min are recorded: the overflow check reads them back.
    self._analyze_maybe_overflow_tensor(tensor_json, tensor)
    tensor_json.update({"Mean": tensor_mean})
    tensor_json.update({"Norm": tensor_norm})
    tensor_json.update({"requires_grad": tensor.requires_grad})
    if self.config.summary_mode == "md5":
        tensor_md5 = self.get_md5_for_tensor(tensor)
        tensor_json.update({"md5": tensor_md5})

    return tensor_json

def analyze_single_element(self, element, suffix_stack):
    """Serialize one leaf element of args/kwargs/output.

    Dispatch order matters: torch device/dtype kwargs (keyed by the innermost
    suffix) first, then numpy scalars, tensors, and builtins. Unsupported
    types yield None (the entry is dropped).
    """
    if suffix_stack and suffix_stack[-1] in self.torch_object_key:
        return self.torch_object_key[suffix_stack[-1]](element)

    converted_numpy, numpy_type = self._convert_numpy_to_builtin(element)
    if converted_numpy is not element:
        return self._analyze_numpy(converted_numpy, numpy_type)

    if isinstance(element, torch.Tensor):
        return self._analyze_tensor(element, Const.SEP.join(suffix_stack))

    if isinstance(element, (bool, int, float, str, slice)):
        return self._analyze_builtin(element)

def analyze_element(self, element):
    # Walk nested tuples/lists/dicts, serializing each leaf element.
    return recursive_apply_transform(element, self.analyze_single_element)

@staticmethod
def analyze_api_call_stack(name):
    """Capture the user-side call stack for this API as {name: [frame strings]}.

    The slice [5:] skips the hook/service wrapper frames between the user
    code and this helper — assumes that wrapper depth stays fixed; TODO
    confirm if the hook chain changes.
    """
    stack_str = []
    for (_, path, line, func, code, _) in inspect.stack()[5:]:
        if not code:
            continue
        stack_line = " ".join([
            "File", ", ".join([
                path,
                " ".join(["line", str(line)]),
                " ".join(["in", func]),
                " ".join(["\n", code[0].strip()])
            ])
        ])
        stack_str.append(stack_line)
    stack_info_struct = {name: stack_str}
    return stack_info_struct

def analyze_pre_forward(self, name, module,
                        module_input_output: ModuleForwardInputsOutputs):
    # Hook point before the API executes; only subclasses use it.
    pass

def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
    """Serialize a forward call into {name: {input_args, input_kwargs, output}},
    honoring the configured data_mode filters."""
    api_info_struct = {}
    if self.is_dump_for_data_mode(Const.FORWARD, Const.INPUT):  # check whether data_mode contains forward or input
        api_info_struct[name] = {}
        self.api_data_category = Const.INPUT
        args_info_list = self.analyze_element(module_input_output.args_tuple)
        api_info_struct[name][Const.INPUT_ARGS] = args_info_list

        self.api_data_category = Const.KWARGS
        kwargs_info_list = self.analyze_element(module_input_output.kwargs)
        api_info_struct[name][Const.INPUT_KWARGS] = kwargs_info_list

    if self.is_dump_for_data_mode(Const.FORWARD, Const.OUTPUT):  # check whether data_mode contains forward or output
        api_info_struct[name] = api_info_struct.get(name, {})
        self.api_data_category = Const.OUTPUT
        output_info_list = self.analyze_element(module_input_output.output_tuple)
        api_info_struct[name][Const.OUTPUT] = output_info_list

    return api_info_struct

def analyze_pre_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs):
    """For in-place APIs: capture args/kwargs BEFORE execution mutates them."""
    api_info_struct = {}
    if self.is_dump_for_data_mode(Const.FORWARD, Const.INPUT):
        api_info_struct[name] = {}
        self.api_data_category = Const.INPUT
        args_info_list = self.analyze_element(module_input_output.args_tuple)
        api_info_struct[name][Const.INPUT_ARGS] = args_info_list

        self.api_data_category = Const.KWARGS
        kwargs_info_list = self.analyze_element(module_input_output.kwargs)
        api_info_struct[name][Const.INPUT_KWARGS] = kwargs_info_list

    return api_info_struct

def analyze_forward_inplace(self, name, module_input_output: ModuleForwardInputsOutputs):
    """For in-place APIs: after execution, the (now mutated) args+kwargs are
    recorded as the 'output'."""
    concat_args = module_input_output.concat_args_and_kwargs()
    api_info_struct = {}
    if self.is_dump_for_data_mode(Const.FORWARD, Const.OUTPUT):
        api_info_struct[name] = {}
        self.api_data_category = Const.OUTPUT
        output_info_list = self.analyze_element(concat_args)
        api_info_struct[name][Const.OUTPUT] = output_info_list

    return api_info_struct

def analyze_backward(self, name, module, module_input_output: ModuleBackwardInputsOutputs):
    """Serialize a backward event.

    NOTE(review): the categories look intentionally crossed w.r.t. autograd
    naming — grad_input (gradients w.r.t. the forward inputs, i.e. what
    backward produces) is stored under GRAD_INPUT while tagged as OUTPUT
    data, and vice versa; confirm this matches the comparison tool.
    """
    api_info_struct = {}
    if self.is_dump_for_data_mode(Const.BACKWARD, Const.OUTPUT):
        api_info_struct[name] = {}
        self.api_data_category = Const.OUTPUT
        input_info_list = self.analyze_element(module_input_output.grad_input_tuple)
        api_info_struct[name][Const.GRAD_INPUT] = input_info_list

    if self.is_dump_for_data_mode(Const.BACKWARD, Const.INPUT):
        api_info_struct[name] = api_info_struct.get(name, {})
        self.api_data_category = Const.INPUT
        output_info_list = self.analyze_element(module_input_output.grad_output_tuple)
        api_info_struct[name][Const.GRAD_OUTPUT] = output_info_list

    return api_info_struct


class FullTensorDataProcessor(DataProcessor):
    """Summary statistics plus the full tensor saved to disk as a .pt file."""

    def _analyze_tensor(self, tensor, suffix):
        # NOTE(review): self.data_path is assigned but never read in this
        # hunk — looks vestigial.
        self.data_path = self.data_writer.dump_tensor_data_dir
        dump_data_name = (self.current_api_or_module_name + Const.SEP + self.api_data_category + Const.SEP
                          + suffix + ".pt")
        file_path = os.path.join(self.data_writer.dump_tensor_data_dir, dump_data_name)
        torch.save(tensor, file_path)
        single_arg = super()._analyze_tensor(tensor, suffix)
        single_arg.update({"data_name": dump_data_name})
        return single_arg


class OverflowTensorDataProcessor(DataProcessor):
    """Dump tensors only for APIs whose data overflowed (NaN/Inf or NPU flag)."""
    # NOTE(review): __slots__ is ineffective here — the base class has no
    # __slots__, so instances keep a __dict__, and __init__ sets attributes
    # (real_overflow_dump_times, overflow_nums) not listed in it.
    __slots__ = ["cached_tensors_and_file_paths"]

    def __init__(self, config, data_writer):
        super().__init__(config, data_writer)
        self.cached_tensors_and_file_paths = {}
        self.real_overflow_dump_times = 0  # overflow dumps written so far
        self.overflow_nums = config.overflow_num  # budget; -1 means unlimited

    def _analyze_tensor(self, tensor, suffix):
        """Like the base version, but defer the torch.save: tensors are cached
        and only written if this API turns out to have overflowed."""
        self.data_path = self.data_writer.dump_tensor_data_dir
        dump_data_name = (self.current_api_or_module_name + Const.SEP + self.api_data_category + Const.SEP
                          + suffix + ".pt")
        file_path = os.path.join(self.data_writer.dump_tensor_data_dir, dump_data_name)
        self.cached_tensors_and_file_paths.update({file_path: tensor})
        single_arg = super()._analyze_tensor(tensor, suffix)
        single_arg.update({"data_name": dump_data_name})
        return single_arg
# NOTE(review): remaining methods of OverflowTensorDataProcessor, then the head
# of FreeBenchmarkDataProcessor.

def analyze_forward(self, name, module,
                    module_input_output: ModuleForwardInputsOutputs):
    """Collect forward stats, then persist the cached tensors only if an
    overflow was flagged during analysis; returns None when no overflow."""
    self.has_overflow = False
    api_info_struct = super().analyze_forward(name, module, module_input_output)
    self.maybe_save_overflow_data_and_check_overflow_times()
    return api_info_struct if self.has_overflow else None

def analyze_backward(self, name, module,
                     module_input_output: ModuleBackwardInputsOutputs):
    """Backward counterpart of analyze_forward; dump only on overflow."""
    self.has_overflow = False
    api_info_struct = super().analyze_backward(name, module, module_input_output)
    self.maybe_save_overflow_data_and_check_overflow_times()
    return api_info_struct if self.has_overflow else None

def maybe_save_overflow_data_and_check_overflow_times(self):
    """Write the tensors cached during analysis to disk iff an overflow was
    detected, then enforce the configured overflow budget.

    The cache is cleared either way, so each API starts fresh.
    """
    if self.has_overflow:
        for file_path, tensor in self.cached_tensors_and_file_paths.items():
            torch.save(tensor, file_path)
        self.inc_and_check_overflow_times()
    self.cached_tensors_and_file_paths = {}

def inc_and_check_overflow_times(self):
    """Raise MsaccException once overflow_nums dumps were made (-1 = unlimited).

    The exception deliberately aborts the run once enough overflow evidence
    has been collected.
    """
    self.real_overflow_dump_times += 1
    if self.overflow_nums == -1:
        return
    if self.real_overflow_dump_times >= self.overflow_nums:
        raise MsaccException(MsaccException.OVERFLOW_NUMS_ERROR,
                             str(self.real_overflow_dump_times))


# --- FreeBenchmarkDataProcessor: drives the perturbation-based checker ---

def __init__(self, config, data_writer):
    super().__init__(config, data_writer)
    self.checker = FreeBenchmarkCheck(config=config)

def update_iter(self, current_iter):
    # Keep the checker's notion of the iteration in sync with the base field.
    self.current_iter = current_iter
    self.checker.update_iter(current_iter)

def update_unequal_rows(self, unequal_rows: List[UnequalRow]):
    """Append the checker's mismatch rows to the free-benchmark CSV file."""
    if len(unequal_rows) == 0:
        return
    for row in unequal_rows:
        data_dict = asdict(row)
        self.data_writer.write_data_to_csv(
            data_dict.values(),
            data_dict.keys(),
            self.data_writer.free_benchmark_file_path
        )
    return

def analyze_pre_forward(self, name, module,
                        module_input_output: ModuleForwardInputsOutputs):
    """Let the checker inspect/patch the call before the API executes."""
    args = module_input_output.args
    kwargs = module_input_output.kwargs
    self.checker.pre_forward(name, module, self, args, kwargs)
# NOTE(review): tail of FreeBenchmarkDataProcessor, then module-level NPU
# overflow helpers (torch, torch_npu, os and Const are imported at module top).

def analyze_forward(self, name, module, module_input_output: ModuleForwardInputsOutputs):
    """Run the free-benchmark perturbation check on this API's forward.

    Mismatch rows are written to CSV; when the checker is in "fix" mode the
    perturbed output is handed back to the service through the
    _return_forward_new_output flag. Always returns None: this task dumps no
    per-API statistics.
    """
    new_output, unequal_rows = self.checker.forward(
        name,
        module,
        module_input_output.args,
        module_input_output.kwargs,
        module_input_output.output,
    )
    self.update_unequal_rows(unequal_rows)
    if self.checker.if_fix():
        self._return_forward_new_output = True
        self._forward_new_output = new_output
    return None

def analyze_backward(self, name, module, module_input_output: ModuleBackwardInputsOutputs):
    """Feed the backward gradients to the checker; dumps nothing itself."""
    self.checker.backward(name, module, module_input_output.grad_output)
    return None


def overflow_debug_mode_enable():
    """True when the OVERFLOW_DEBUG_MODE_ENABLE env var holds the enable value."""
    overflow_mode = os.getenv(OverflowConst.OVERFLOW_DEBUG_MODE_ENABLE, Const.ENV_DISABLE)
    return overflow_mode == Const.ENV_ENABLE


def check_overflow_npu():
    """Query the NPU float-status register for a pending overflow flag.

    Debug mode reads an 8-slot float-status buffer via npu_get_float_status;
    otherwise it defers to torch_npu's private checker.
    """
    if overflow_debug_mode_enable():
        float_status = torch.zeros(bits_for_overflow).npu()
        result = torch_npu.npu_get_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE)
        if (result.cpu()[0] != 0):
            return True
        else:
            return False
    else:
        return torch_npu._C._check_overflow_npu()


def clear_overflow_npu():
    """Reset the NPU overflow status register (counterpart of check_overflow_npu)."""
    if overflow_debug_mode_enable():
        float_status = torch.zeros(bits_for_overflow).npu()
        torch_npu.npu_clear_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE)
    else:
        torch_npu._C._clear_overflow_npu()


class OverflowConst:
    """
    Class for Overflow
    """
    OVERFLOW_DEBUG_MODE_ENABLE = "OVERFLOW_DEBUG_MODE_ENABLE"  # env-var name
    OVERFLOW_ORIGINAL_MODE = 0
    OVERFLOW_DEBUG_MODE = 1
def module_dump(module, dump_name):
    """Manually register dump hooks on one nn.Module instance.

    While a module is dumped this way the wrapped torch APIs are restored to
    their originals (api_originality) so only module-level hooks fire; call
    module_dump_end() afterwards to re-enable API-level wrapping.

    Args:
        module: the nn.Module to instrument.
        dump_name: user-chosen tag; a per-name occurrence counter and
            separators are appended so repeated dumps stay distinct.

    Raises:
        DumpException: if module or dump_name has the wrong type.
    """
    if not isinstance(module, nn.Module):
        print_error_log("The parameter:module in module_dump is not a Module subclass.")
        raise DumpException(DumpException.INVALID_PARAM_ERROR)
    if not isinstance(dump_name, str):
        print_error_log("The parameter:dump_name in module_dump is not a str type.")
        raise DumpException(DumpException.INVALID_PARAM_ERROR)
    api_register.api_originality()
    if dump_name not in module_count:
        module_count[dump_name] = 0
    else:
        module_count[dump_name] += 1
    # Suffix with the occurrence index, e.g. "block" -> "block.0."
    dump_name = dump_name + Const.SEP + str(module_count.get(dump_name)) + Const.SEP

    pdg = PrecisionDebugger()
    _, forward_hook, backward_hook = pdg.service.build_hook(BaseScope.Module_Type_Module, dump_name)
    # NOTE(review): with_kwargs=True and register_full_backward_pre_hook need
    # torch >= 2.0 — confirm against the package's PyTorch 1.11 support claim.
    module.register_forward_hook(forward_hook, with_kwargs=True)
    module.register_full_backward_hook(backward_hook)

    # Node hooks bracket forward/backward so construct.json can record nesting.
    module.register_forward_pre_hook(pdg.service.module_processor.node_hook(dump_name + Const.FORWARD, Const.START))
    module.register_forward_hook(pdg.service.module_processor.node_hook(dump_name + Const.FORWARD, Const.STOP))
    module.register_full_backward_pre_hook(
        pdg.service.module_processor.node_hook(dump_name + Const.BACKWARD, Const.START))
    module.register_full_backward_hook(pdg.service.module_processor.node_hook(dump_name + Const.BACKWARD, Const.STOP))


def module_dump_end():
    """Restore API-level hooking after a manual module dump."""
    api_register.api_modularity()
# NOTE(review): methods of DataWriter (class header is in the previous hunk).

def __init__(self, init_json=None) -> None:
    self.dump_count = 0
    self.init_json = init_json
    # All paths are filled in later via update_dump_paths().
    self.dump_file_path = None
    self.stack_file_path = None
    self.construct_file_path = None
    self.free_benchmark_file_path = None
    self.dump_tensor_data_dir = None
    self.buffer_size = 1000  # flush threshold for cached data entries
    self.cache_data = {"data": {}}
    self.cache_stack = {}
    self.cache_construct = {}

def initialize_json_file(self, **kwargs):
    """Seed dump.json with the task metadata and truncate stack/construct files."""
    kwargs.update({"dump_data_dir": self.dump_tensor_data_dir, "data": {}})
    with open(self.dump_file_path, 'w') as f:
        json.dump(kwargs, f)

    if os.path.exists(self.stack_file_path):
        os.remove(self.stack_file_path)
    Path(self.stack_file_path).touch()

    if os.path.exists(self.construct_file_path):
        os.remove(self.construct_file_path)
    Path(self.construct_file_path).touch()

def update_dump_paths(self, dump_file_path, stack_file_path, construct_file_path,
                      dump_data_dir, free_benchmark_file_path):
    """Set all output locations for the current dump step."""
    self.dump_file_path = dump_file_path
    self.stack_file_path = stack_file_path
    self.construct_file_path = construct_file_path
    self.dump_tensor_data_dir = dump_data_dir
    self.free_benchmark_file_path = free_benchmark_file_path

def update_data(self, new_data):
    """Merge one {api_name: payload} entry into the data cache.

    new_data is expected to hold exactly one key; entries for an existing
    key are merged into its payload. Fixed: an empty dict previously made
    next(iter(...)) raise StopIteration — it is now ignored.
    """
    if not new_data:
        return
    key = next(iter(new_data.keys()))
    if key in self.cache_data["data"]:
        self.cache_data["data"][key].update(new_data[key])
    else:
        self.cache_data["data"].update(new_data)

def flush_data_when_buffer_is_full(self):
    """Spill the cache to dump.json once it reaches buffer_size entries."""
    if len(self.cache_data["data"]) >= self.buffer_size:
        self.write_data_json(self.dump_file_path)

def update_stack(self, new_data):
    """Merge {api_name: stack frames} into the stack cache."""
    self.cache_stack.update(new_data)

def update_construct(self, new_data):
    """Merge module-construct (nesting) info into its cache."""
    self.cache_construct.update(new_data)
# NOTE(review): remaining methods of DataWriter.

def write_data_json(self, file_path):
    """Merge the in-memory data cache into dump.json on disk, then clear it.

    fcntl.flock provides inter-process exclusion (POSIX-only; imported
    locally so the module stays importable where fcntl is absent).
    NOTE(review): the lock is released between the read and the rewrite, so
    two concurrent writers could interleave — confirm single-writer-per-file
    usage. Also: the tensor dir is stored under 'data_path' here but under
    'dump_data_dir' in initialize_json_file — confirm which key readers use.
    """
    import fcntl
    print_info_log_rank_0(f"dump.json is at {os.path.dirname(os.path.dirname(file_path))}. ")
    if Path(file_path).exists() and os.path.getsize(file_path) > 0:
        with open(file_path, "r+") as f:
            fcntl.flock(f, fcntl.LOCK_EX)
            data_to_write = json.load(f)
            fcntl.flock(f, fcntl.LOCK_UN)
    else:
        # First write: seed the file from init_json.
        self.init_json['data_path'] = self.dump_tensor_data_dir
        data_to_write = self.init_json
    data_to_write['data'].update(self.cache_data['data'])
    with open(file_path, 'w+') as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        json.dump(data_to_write, f, indent=1)
        fcntl.flock(f, fcntl.LOCK_UN)

    self.cache_data["data"].clear()

def write_stack_info_json(self, file_path):
    """Dump the cached call-stack info to stack.json under an exclusive lock."""
    import fcntl
    with open(file_path, 'w+') as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        json.dump(self.cache_stack, f, indent=1)
        fcntl.flock(f, fcntl.LOCK_UN)

def write_construct_info_json(self, file_path):
    """Dump the cached module-construct info to construct.json under a lock."""
    import fcntl
    with open(file_path, 'w+') as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        json.dump(self.cache_construct, f, indent=1)
        fcntl.flock(f, fcntl.LOCK_UN)

def write_json(self):
    """Flush all three caches (data, stack, construct) to their files."""
    self.write_data_json(self.dump_file_path)
    self.write_stack_info_json(self.stack_file_path)
    self.write_construct_info_json(self.construct_file_path)

@staticmethod
def write_data_to_csv(result: list, result_header: tuple, file_path: str):
    """Append one CSV row, writing the header only when creating the file.

    os.open with Const.WRITE_FLAGS/WRITE_MODES restricts the file's
    permissions on creation.
    """
    if len(result) == 0:
        return
    is_exists = os.path.exists(file_path)
    append = "a+" if is_exists else "w+"
    with os.fdopen(
        os.open(file_path, Const.WRITE_FLAGS, Const.WRITE_MODES), append, newline=""
    ) as csv_file:
        spawn_writer = csv.writer(csv_file)
        if not is_exists:
            spawn_writer.writerow(result_header)
        spawn_writer.writerows([result,])
def build_repair(config):
    """Factory for the API repair strategy configured in ``config.repair_type``.

    Returns None when no repair is configured.

    Raises:
        RepairException: for an unrecognized repair type.
    """
    if config.repair_type is None:
        return None
    elif config.repair_type == RepairAPI.ToCPU:
        return RepairAPI_toCPU(config)
    elif config.repair_type == RepairAPI.RaisePrecision:
        return RepairAPI_raise(config)
    else:
        # Fixed: the original message dropped the closing quote after
        # RaisePrecision, producing a malformed error text.
        raise RepairException(RepairException.InvalidRepairType, f"精度修复类型"
                              f"须配置为'{RepairAPI.ToCPU}'或'{RepairAPI.RaisePrecision}',"
                              f"实际配置为{config.repair_type}")


class RepairAPI(ABC):
    """Convert the inputs of scoped APIs before execution and convert the
    outputs back afterwards (device or dtype repair)."""
    ToCPU = "cpu"
    RaisePrecision = "raise"

    def __init__(self, config):
        self.config = config
        self.scope = build_scope(ListScope, config.repair_scope, config.repair_api_str)
        # Last seen source/target of the conversion, for logging.
        self.saved, self.towards = "None", "None"

    def check_name_and_module_type(self, name, module_type):
        """Only API events (not Module events) inside the scope are repaired."""
        if module_type == BaseScope.Module_Type_Module:
            return False
        if not self.scope.check(name):
            return False
        return True

    def convert(self, name, module_type, args, kwargs):
        """Apply fx() to every tensor in args/kwargs when the API is in scope."""
        is_target = self.check_name_and_module_type(name, module_type)
        if is_target:
            args = recursive_apply_transform(args, self.fx)
            kwargs = recursive_apply_transform(kwargs, self.fx)
            print_info_log_rank_0(f"[calibrator] convert inputs of {name} to "
                                  f"{self.towards}.")
        return args, kwargs

    def invert(self, name, module_type, out_feat):
        """Apply inv_fx() to the outputs to undo the conversion."""
        is_target = self.check_name_and_module_type(name, module_type)
        if is_target:
            out_feat = recursive_apply_transform(out_feat, self.inv_fx)
            print_info_log_rank_0(f"[calibrator] convert outputs of {name} back to "\
                                  f"{self.saved}.")
        return out_feat


class RepairAPI_toCPU(RepairAPI):
    """Repair strategy: run the API on CPU, then move outputs back."""

    def fx(self, arg, _):
        if isinstance(arg, torch.Tensor):
            self.saved = arg.device
            self.towards = torch.device("cpu")
            return arg.cpu()
        return arg

    def inv_fx(self, arg, _):
        if isinstance(arg, torch.Tensor):
            return arg.to(self.saved)
        return arg


class RepairAPI_raise(RepairAPI):
    """Repair strategy: raise half-precision inputs to float32."""
    raise_dtype_map = {
        torch.bfloat16: torch.float32,
        torch.float16: torch.float32
    }

    def fx(self, arg, _):
        if isinstance(arg, torch.Tensor):
            self.saved = arg.dtype
            self.towards = RepairAPI_raise.raise_dtype_map.get(self.saved)
            # bug: nested input may be of various dtypes. which to save and invert?
            return arg.to(self.towards)
        return arg

    def inv_fx(self, arg, _):
        if isinstance(arg, torch.Tensor):
            return arg.to(self.saved)
        return arg
def build_scope(scope_class, scope=None, api_list=None):
    """Build a scope filter, or return None when nothing is configured.

    Args:
        scope_class: explicit scope class to instantiate, or a falsy value to
            auto-select a range scope from the names in ``scope``.
        scope: str or list[str] naming the scoped modules/APIs.
        api_list: list[str] of API name substrings to match.

    Fixed: the defaults were shared mutable lists (``scope=[]``,
    ``api_list=[]``); None sentinels normalized to fresh lists replace them.
    """
    scope = [] if scope is None else scope
    api_list = [] if api_list is None else api_list
    if not scope and not api_list:
        return None
    if scope_class:
        return scope_class(scope, api_list)
    return build_range_scope_according_to_scope_name(scope, api_list)


def build_range_scope_according_to_scope_name(scope, api_list):
    """Choose API- or Module-range scope based on which one the names fit.

    Raises:
        ScopeException: when the names fit both kinds (ambiguous) or neither.
    """
    api_range_scope = APIRangeScope(scope, api_list)
    module_range_scope = ModuleRangeScope(scope, api_list)
    if not scope:  # without a scope both variants behave identically
        return api_range_scope
    if api_range_scope.is_valid and module_range_scope.is_valid:
        raise ScopeException(ScopeException.InvalidScope, f"scope={scope}.")
    elif api_range_scope.is_valid:
        return api_range_scope
    elif module_range_scope.is_valid:
        return module_range_scope
    else:
        raise ScopeException(ScopeException.InvalidScope, f"scope={scope}")
class BaseScope(ABC):
    """Common scope filtering: restrict dump/repair to selected modules/APIs."""
    Module_Type_Module = "Module"
    Module_Type_API = "api"

    @staticmethod
    def rectify_args(scope, api_list):
        """Validate and normalize the arguments.

        ``scope`` may be a str (wrapped into a one-element list) or list[str];
        ``api_list`` must be list[str].

        Raises:
            ScopeException: on any type violation.
        """
        if not isinstance(api_list, list):
            raise ScopeException(ScopeException.InvalidApiStr,
                                 f"api_list参数须配置为列表,实际类型为{type(api_list)}.")
        for api in api_list:
            if not isinstance(api, str):
                raise ScopeException(ScopeException.InvalidApiStr,
                                     f"api_list中的元素须配置为字符串,实际类型为{type(api)}.")
        if isinstance(scope, str):
            scope = [scope]
            return scope, api_list
        if not isinstance(scope, list):
            raise ScopeException(ScopeException.InvalidScope,
                                 f"scope参数须配置为字符串或列表,实际类型为{type(scope)}.")
        for s in scope:
            if not isinstance(s, str):
                raise ScopeException(ScopeException.InvalidScope,
                                     f"scope列表元素要求类型为字符串,实际类型为{type(s)}.")
        return scope, api_list

    def __init__(self, scope, api_list):
        scope, api_list = self.rectify_args(scope, api_list)
        self.scope = scope
        self.api_list = api_list

    def check_api_list(self, api_name):
        """True if api_name matches the api_list filter (empty list matches all).

        Fixed: previously fell through and returned None on no match; now an
        explicit bool is returned for callers that rely on the result type.
        """
        if not self.api_list:
            return True
        for api_str in self.api_list:
            if api_str in api_name:
                return True
        return False

    @abstractmethod
    def check(self, name):
        pass


class ListScope(BaseScope):
    """Exact-name scope: only names listed in ``scope`` (or matching api_list)."""

    @staticmethod
    def rectify_args(scope, api_list):
        # scope and api_list are mutually exclusive for a list scope.
        if scope and api_list:
            raise ScopeException(ScopeException.ArgConflict,
                                 f"scope和api_list不可以同时配置,实际配置为scope={scope}, api_list={api_list}.")
        return super(ListScope, ListScope).rectify_args(scope, api_list)

    def check(self, module_name):
        if not self.scope or module_name in self.scope:
            return self.check_api_list(module_name)
        return False
class RangeScope(BaseScope, ABC):
    """Interval scope: dump everything between two named break points."""

    @staticmethod
    def rectify_args(scope, api_list):
        scope, api_list = super(RangeScope, RangeScope).rectify_args(scope, api_list)
        if isinstance(scope, list):
            if len(scope) == 1:
                # A single break point means start == stop.
                scope.append(scope[0])
            elif len(scope) > 2:
                raise ScopeException(ScopeException.InvalidScope,
                                     f"scope参数指定区间断点,须传入长度为1或2的列表,实际长度为{len(scope)}.")

        return scope, api_list

    @abstractmethod
    def check_scope_is_valid(self):
        pass

    def __init__(self, *args):
        super().__init__(*args)
        self.in_scope = False  # True while between the start and stop markers
        self.is_valid = self.check_scope_is_valid()

    def begin_module(self, module_name):
        # Default no-op; ModuleRangeScope overrides to open the interval.
        pass

    def end_module(self, module_name):
        # Default no-op; ModuleRangeScope overrides to close the interval.
        pass


class APIRangeScope(RangeScope):
    """Range scope whose break points are API events (not Module events)."""

    def check_scope_is_valid(self):
        """Valid when neither break point names a Module."""
        if not self.scope:
            return True
        scope_start_type = self.scope[0].split(Const.SEP)[0]
        if scope_start_type == BaseScope.Module_Type_Module:
            return False
        scope_stop_type = self.scope[1].split(Const.SEP)[0]
        if scope_stop_type == BaseScope.Module_Type_Module:
            return False
        return True

    def check(self, api_name):
        # Opening marker: the start API itself is already inside the range.
        if self.scope and api_name == self.scope[0]:
            self.in_scope = True

        if not self.scope or self.in_scope:
            result = self.check_api_list(api_name)
        else:
            result = False

        # Closing marker: the stop API itself is still inside the range.
        if self.scope and api_name == self.scope[1]:
            self.in_scope = False
        return result


class ModuleRangeScope(RangeScope):
    """
    Unlike APIs, a module contains nested sub-structures that must also be
    dumped, so the interval is controlled precisely with forward pre-hooks
    and full backward hooks, which call begin_module/end_module when they
    fire.
    """

    def check_scope_is_valid(self):
        """Valid when both break points name Modules."""
        if not self.scope:
            return True
        scope_start_type = self.scope[0].split(Const.SEP)[0]
        scope_stop_type = self.scope[1].split(Const.SEP)[0]
        if scope_start_type == BaseScope.Module_Type_Module and \
                scope_stop_type == BaseScope.Module_Type_Module:
            return True
        return False

    def begin_module(self, module_name):
        if not self.scope:
            return
        if module_name == self.scope[0]:
            self.in_scope = True

    def end_module(self, module_name):
        if not self.scope:
            return
        if module_name == self.scope[1]:
            self.in_scope = False

    def check(self, module_name):
        if not self.scope or self.in_scope:
            return self.check_api_list(module_name)
        return False
def run_parallel_ut(config):
    """Placeholder: run the single-API accuracy re-check (UT) flow."""
    pass


def compare_distrbuted(bench_dump_path, dump_path):
    """Placeholder: compare a benchmark dump directory with this run's dump.

    Fixed: AutoCompare.run passes (bench_dump_path, dump_path), but this stub
    previously accepted a single ``config`` argument, so every call raised
    TypeError; the signature now matches its only call site. (The original
    'distrbuted' spelling is kept to preserve the public name.)
    """
    pass


def build_step_post_process(config):
    """Factory for the end-of-step post-processing action, or None if unset.

    Raises:
        StepException: for an unrecognized on_step_end value.
    """
    if not config.on_step_end:
        return None
    if config.on_step_end == StepPostProcess.SingleAPICheck:
        return SingleAPICheck(config)
    elif config.on_step_end == StepPostProcess.Compare:
        return AutoCompare(config)
    else:
        raise StepException(StepException.InvalidPostProcess, f"step后处理须配置为"
                            f"'{StepPostProcess.SingleAPICheck}'或'{StepPostProcess.Compare}',"
                            f"实际配置为{config.on_step_end}")


class StepPostProcess(ABC):
    """Names of the supported post-step actions."""
    SingleAPICheck = 'single_api_check'
    Compare = 'compare'


class SingleAPICheck:
    """Post-step action: re-run dumped APIs one by one for an accuracy check."""

    def __init__(self, config):
        self.config = config

    def run(self):
        run_parallel_ut(self.config)


class AutoCompare:
    """Post-step action: compare this step's dump against a benchmark dump."""

    def __init__(self, config):
        self.config = config

    def run(self):
        compare_distrbuted(self.config.bench_dump_path, self.config.dump_path)
# api_registry: captures the original torch / torch_npu API attributes and
# swaps them for hooked wrappers (and back) so API calls can be intercepted.

import torch
import torch.distributed as dist
from . import wrap_torch, wrap_functional, wrap_tensor, wrap_vf, wrap_distributed, wrap_aten
from .wrap_torch import get_torch_ops
from .wrap_functional import get_functional_ops
from .wrap_tensor import get_tensor_ops
from .wrap_vf import get_vf_ops
from .wrap_distributed import get_distributed_ops
from .wrap_aten import get_aten_ops
from ..common.utils import torch_without_guard_version, npu_distributed_api, is_gpu
# NOTE(review): lexicographic string comparison — works for current torch
# releases ('2.1' > '2.0') but would misorder e.g. '10.0' vs '2.0' and
# treats '2.0.1' as "above 2.0"; confirm this is the intended cut-off.
torch_version_above_2 = torch.__version__.split('+')[0] > '2.0'

if not is_gpu:
    # torch_npu and the NPU-custom wrappers only exist on Ascend environments.
    import torch_npu
    from . import wrap_npu_custom
    from .wrap_npu_custom import get_npu_ops


class ApiRegistry:
    """Registry of original and hooked attributes for every wrapped API family.

    For each family (torch.Tensor, torch, torch.nn.functional,
    torch.distributed, torch._VF, aten, torch_npu) two dicts are kept:
    ``*_ori_attr`` holding the untouched callables and ``*_hook_attr`` holding
    the wrapped versions, so the process can switch between them at any time
    via api_modularity() / api_originality().
    """

    def __init__(self):
        # Original (unwrapped) attributes, keyed by API name.
        self.tensor_ori_attr = {}
        self.torch_ori_attr = {}
        self.functional_ori_attr = {}
        self.distributed_ori_attr = {}
        self.npu_distributed_ori_attr = {}
        self.vf_ori_attr = {}
        self.aten_ori_attr = {}
        self.torch_npu_ori_attr = {}

        # Hooked (wrapped) attributes, filled in by initialize_hook().
        self.tensor_hook_attr = {}
        self.torch_hook_attr = {}
        self.functional_hook_attr = {}
        self.distributed_hook_attr = {}
        self.npu_distributed_hook_attr = {}
        self.vf_hook_attr = {}
        self.aten_hook_attr = {}
        self.torch_npu_hook_attr = {}

    @staticmethod
    def store_ori_attr(ori_api_group, api_list, api_ori_attr):
        """Record the current attribute of each name in *api_list* on
        *ori_api_group* into the *api_ori_attr* dict.

        Dotted names (e.g. 'linalg.norm') are resolved one level deep via the
        intermediate sub-module.
        """
        for api in api_list:
            if '.' in api:
                sub_module_name, sub_op = api.rsplit('.', 1)
                sub_module = getattr(ori_api_group, sub_module_name)
                api_ori_attr[api] = getattr(sub_module, sub_op)
            else:
                api_ori_attr[api] = getattr(ori_api_group, api)

    @staticmethod
    def set_api_attr(api_group, attr_dict):
        """Install every attribute in *attr_dict* onto *api_group*.

        Dotted names are set on the sub-module; a missing sub-module is
        silently skipped (getattr default None) rather than raising.
        """
        for api, api_attr in attr_dict.items():
            if '.' in api:
                sub_module_name, sub_op = api.rsplit('.', 1)
                sub_module = getattr(api_group, sub_module_name, None)
                if sub_module is not None:
                    setattr(sub_module, sub_op, api_attr)
            else:
                setattr(api_group, api, api_attr)

    def api_modularity(self):
        """Swap all registered API families to their hooked versions."""
        self.set_api_attr(torch.Tensor, self.tensor_hook_attr)
        self.set_api_attr(torch, self.torch_hook_attr)
        self.set_api_attr(torch.nn.functional, self.functional_hook_attr)
        self.set_api_attr(dist, self.distributed_hook_attr)
        # distributed_c10d re-exports the same ops, so patch both namespaces.
        self.set_api_attr(dist.distributed_c10d, self.distributed_hook_attr)
        if not is_gpu and not torch_without_guard_version:
            self.set_api_attr(torch_npu.distributed, self.npu_distributed_hook_attr)
            self.set_api_attr(torch_npu.distributed.distributed_c10d, self.npu_distributed_hook_attr)
        if torch_version_above_2:
            self.set_api_attr(torch.ops.aten, self.aten_hook_attr)
        self.set_api_attr(torch._VF, self.vf_hook_attr)
        if not is_gpu:
            self.set_api_attr(torch_npu, self.torch_npu_hook_attr)

    def api_originality(self):
        """Restore all registered API families to their original versions."""
        self.set_api_attr(torch.Tensor, self.tensor_ori_attr)
        self.set_api_attr(torch, self.torch_ori_attr)
        self.set_api_attr(torch.nn.functional, self.functional_ori_attr)
        self.set_api_attr(dist, self.distributed_ori_attr)
        self.set_api_attr(dist.distributed_c10d, self.distributed_ori_attr)
        if not is_gpu and not torch_without_guard_version:
            self.set_api_attr(torch_npu.distributed, self.npu_distributed_ori_attr)
            self.set_api_attr(torch_npu.distributed.distributed_c10d, self.npu_distributed_ori_attr)
        if torch_version_above_2:
            self.set_api_attr(torch.ops.aten, self.aten_ori_attr)
        self.set_api_attr(torch._VF, self.vf_ori_attr)
        if not is_gpu:
            self.set_api_attr(torch_npu, self.torch_npu_ori_attr)

    def initialize_hook(self, hook):
        """Capture the originals and build the hooked wrappers for every family.

        For each family this (1) stores the original attributes, (2) asks the
        matching wrap_* module to generate ``wrap_``-prefixed hooked versions
        bound to *hook*, and (3) collects those versions into the *_hook_attr
        dict under the un-prefixed API name (``attr_name[5:]`` strips "wrap_").
        """
        self.store_ori_attr(torch.Tensor, get_tensor_ops(), self.tensor_ori_attr)
        wrap_tensor.wrap_tensor_ops_and_bind(hook)
        for attr_name in dir(wrap_tensor.HOOKTensor):
            if attr_name.startswith("wrap_"):
                self.tensor_hook_attr[attr_name[5:]] = getattr(wrap_tensor.HOOKTensor, attr_name)

        self.store_ori_attr(torch, get_torch_ops(), self.torch_ori_attr)
        wrap_torch.wrap_torch_ops_and_bind(hook)
        for attr_name in dir(wrap_torch.HOOKTorchOP):
            if attr_name.startswith("wrap_"):
                self.torch_hook_attr[attr_name[5:]] = getattr(wrap_torch.HOOKTorchOP, attr_name)

        self.store_ori_attr(torch.nn.functional, get_functional_ops(), self.functional_ori_attr)
        wrap_functional.wrap_functional_ops_and_bind(hook)
        for attr_name in dir(wrap_functional.HOOKFunctionalOP):
            if attr_name.startswith("wrap_"):
                self.functional_hook_attr[attr_name[5:]] = getattr(wrap_functional.HOOKFunctionalOP, attr_name)

        self.store_ori_attr(dist, get_distributed_ops(), self.distributed_ori_attr)
        wrap_distributed.wrap_distributed_ops_and_bind(hook)
        if not is_gpu and not torch_without_guard_version:
            self.store_ori_attr(torch_npu.distributed, npu_distributed_api, self.npu_distributed_ori_attr)
        for attr_name in dir(wrap_distributed.HOOKDistributedOP):
            if attr_name.startswith("wrap_"):
                self.distributed_hook_attr[attr_name[5:]] = getattr(wrap_distributed.HOOKDistributedOP, attr_name)
                # NPU distributed ops are a subset of the distributed wrappers.
                if not is_gpu and not torch_without_guard_version and attr_name[5:] in npu_distributed_api:
                    self.npu_distributed_hook_attr[attr_name[5:]] = getattr(wrap_distributed.HOOKDistributedOP,
                                                                            attr_name)

        if torch_version_above_2:
            self.store_ori_attr(torch.ops.aten, get_aten_ops(), self.aten_ori_attr)
            wrap_aten.wrap_aten_ops_and_bind(hook)
            for attr_name in dir(wrap_aten.HOOKAtenOP):
                if attr_name.startswith("wrap_"):
                    self.aten_hook_attr[attr_name[5:]] = getattr(wrap_aten.HOOKAtenOP, attr_name)

        self.store_ori_attr(torch._VF, get_vf_ops(), self.vf_ori_attr)
        wrap_vf.wrap_vf_ops_and_bind(hook)
        for attr_name in dir(wrap_vf.HOOKVfOP):
            if attr_name.startswith("wrap_"):
                self.vf_hook_attr[attr_name[5:]] = getattr(wrap_vf.HOOKVfOP, attr_name)

        if not is_gpu:
            self.store_ori_attr(torch_npu, get_npu_ops(), self.torch_npu_ori_attr)
            wrap_npu_custom.wrap_npu_ops_and_bind(hook)
            for attr_name in dir(wrap_npu_custom.HOOKNpuOP):
                if attr_name.startswith("wrap_"):
                    self.torch_npu_hook_attr[attr_name[5:]] = getattr(wrap_npu_custom.HOOKNpuOP, attr_name)


# Module-level singleton shared by the whole hook machinery.
api_register = ApiRegistry()
import functools
import threading
import torch
import torch.nn as nn
import torch.utils.hooks as full_hooks
from ..common.utils import Const


class HOOKModule(nn.Module):
    """nn.Module shim wrapped around a single intercepted API call.

    Each instance represents one invocation site: it builds a unique dump
    prefix (api name + per-name call counter), registers the forward/backward
    hooks produced by *build_hook*, and forwards the actual call through a
    re-implementation of nn.Module.__call__ (_call_func). A per-thread
    re-entrancy flag (inner_stop_hook) prevents hooks from firing recursively
    when a hooked API is called from inside another hook.
    """

    # prefix -> number of times an API with this prefix has been constructed.
    module_count = {}
    # thread ident -> True while already inside a hooked call on that thread.
    inner_stop_hook = {}

    def __init__(self, build_hook) -> None:
        super(HOOKModule, self).__init__()
        self.has_overflow = False
        self.prefix = ""
        self.current_thread = threading.current_thread().ident
        if self.current_thread not in HOOKModule.inner_stop_hook:
            HOOKModule.inner_stop_hook[self.current_thread] = False
        self.stop_hook = HOOKModule.inner_stop_hook.get(self.current_thread, False)

        if not self.stop_hook:
            # prefix_op_name_ is set on the subclass by the wrap_* generators.
            if hasattr(self, "prefix_op_name_"):
                self.prefix = self.prefix_op_name_

            # Append a per-prefix call index so repeated calls to the same API
            # get distinct dump names (name0., name1., ...).
            if self.prefix not in HOOKModule.module_count:
                HOOKModule.module_count[self.prefix] = 1
                self.prefix += '0' + Const.SEP
            else:
                HOOKModule.module_count[self.prefix] += 1
                self.prefix = self.prefix + str(HOOKModule.module_count[self.prefix] - 1) + Const.SEP
            self.mindstudio_reserved_name = self.prefix
            forward_pre_hook, forward_hook, backward_hook = build_hook(self.prefix)
            # NOTE(review): with_kwargs=True requires torch >= 2.0 — confirm
            # against the supported-version matrix.
            self.register_forward_pre_hook(forward_pre_hook, with_kwargs=True)
            self.register_forward_hook(forward_hook, with_kwargs=True)
            # NOTE(review): register_backward_hook is deprecated in favor of
            # register_full_backward_hook; kept as-is intentionally?
            self.register_backward_hook(backward_hook)

    def __call__(self, *input, **kwargs):
        """Run the wrapped call, raising the re-entrancy flag for this thread
        so nested hooked calls skip their hooks, and lowering it afterwards."""
        changed = False
        if not self.stop_hook:
            HOOKModule.inner_stop_hook[self.current_thread] = True
            changed = True
        result = self._call_func(*input, **kwargs)
        if changed:
            HOOKModule.inner_stop_hook[self.current_thread] = False
        return result

    def _call_func(self, *input, **kwargs):
        """Replica of nn.Module.__call__'s hook dispatch.

        Runs forward pre-hooks (which may rewrite args/kwargs), the forward
        itself (slow path under tracing), forward hooks (which may rewrite the
        result), and wires full/non-full backward hooks onto the output's
        grad_fn. Statement order mirrors upstream torch and must be preserved.
        """
        full_backward_hooks, non_full_backward_hooks = [], []
        if len(self._backward_hooks) > 0:
            full_backward_hooks, non_full_backward_hooks = self._get_backward_hooks()
        for hook in self._forward_pre_hooks.values():
            result_input, result_kwargs = hook(self, input, kwargs)
            if result_input is not None:
                if not isinstance(result_input, tuple):
                    result_input = (result_input,)
                input = result_input
            if result_kwargs is not None:
                kwargs = result_kwargs
        bw_hook = None
        if len(full_backward_hooks) > 0:
            bw_hook = full_hooks.BackwardHook(self, full_backward_hooks)
            input = bw_hook.setup_input_hook(input)
        if torch._C._get_tracing_state():
            result = self._slow_forward(*input, **kwargs)
        else:
            result = self.forward(*input, **kwargs)
        for hook in self._forward_hooks.values():
            hook_result = hook(self, input, kwargs, result)
            if hook_result is not None:
                result = hook_result
        if bw_hook:
            result = bw_hook.setup_output_hook(result)
        if len(non_full_backward_hooks) > 0:
            # Walk into the result structure to find a Tensor whose grad_fn
            # the legacy (non-full) backward hooks can attach to.
            var = result
            while not isinstance(var, torch.Tensor):
                if isinstance(var, dict):
                    # NOTE(review): next() without a default raises
                    # StopIteration if the dict holds no Tensor — confirm
                    # callers never return tensor-free dicts.
                    var = next((v for v in var.values() if isinstance(v, torch.Tensor)))
                elif isinstance(var, (list, tuple)):
                    if var:
                        var = var[0]
                    else:
                        # Empty container: nothing to hook, return unchanged.
                        return result
                else:
                    # Non-tensor scalar/None result: nothing to hook.
                    return result
            grad_fn = var.grad_fn
            if grad_fn is not None:
                for hook in non_full_backward_hooks:
                    wrapper = functools.partial(hook, self)
                    functools.update_wrapper(wrapper, hook)
                    grad_fn.register_hook(wrapper)
                self._maybe_warn_non_full_backward_hook(input, result, grad_fn)
        return result
+# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# List of ops that register hooks + +functional: + - conv1d + - conv2d + - conv3d + - conv_transpose1d + - conv_transpose2d + - conv_transpose3d + - conv_tbc + - avg_pool1d + - avg_pool2d + - avg_pool3d + - fractional_max_pool2d_with_indices + - fractional_max_pool2d + - fractional_max_pool3d_with_indices + - fractional_max_pool3d + - max_pool1d_with_indices + - max_pool1d + - max_pool2d_with_indices + - max_pool2d + - max_pool3d_with_indices + - max_pool3d + - max_unpool1d + - max_unpool2d + - max_unpool3d + - lp_pool2d + - lp_pool1d + - adaptive_max_pool1d_with_indices + - adaptive_max_pool1d + - adaptive_max_pool2d_with_indices + - adaptive_max_pool2d + - adaptive_max_pool3d_with_indices + - adaptive_max_pool3d + - adaptive_avg_pool1d + - adaptive_avg_pool2d + - adaptive_avg_pool3d + - dropout + - alpha_dropout + - dropout2d + - dropout3d + - feature_alpha_dropout + - threshold + - threshold_ + - relu + - relu_ + - glu + - hardtanh + - hardtanh_ + - relu6 + - elu + - elu_ + - selu + - selu_ + - celu + - celu_ + - leaky_relu + - leaky_relu_ + - prelu + - rrelu + - rrelu_ + - logsigmoid + - gelu + - hardshrink + - tanhshrink + - softsign + - softplus + - softmin + - softmax + - gumbel_softmax + - log_softmax + - softshrink + - tanh + - sigmoid + - hardsigmoid + - linear + - bilinear + - silu + - hardswish + - embedding + - embedding_bag + - batch_norm + - instance_norm + - layer_norm + - group_norm + - local_response_norm + - ctc_loss + - nll_loss + - poisson_nll_loss + - gaussian_nll_loss + - kl_div + - cross_entropy + - binary_cross_entropy + - 
binary_cross_entropy_with_logits + - smooth_l1_loss + - l1_loss + - mse_loss + - margin_ranking_loss + - hinge_embedding_loss + - multilabel_margin_loss + - soft_margin_loss + - multilabel_soft_margin_loss + - cosine_embedding_loss + - multi_margin_loss + - pixel_shuffle + - pixel_unshuffle + - channel_shuffle + - upsample + - interpolate + - upsample_nearest + - upsample_bilinear + - grid_sample + - affine_grid + - pad + - pairwise_distance + - pdist + - cosine_similarity + - one_hot + - triplet_margin_loss + - triplet_margin_with_distance_loss + - normalize + - unfold + - fold + - multi_head_attention_forward + - scaled_dot_product_attention + +tensor: + - __add__ + - __and__ + - __bool__ + - __div__ + - __eq__ + - __ge__ + - __gt__ + - __getitem__ + - __iadd__ + - __iand__ + - __idiv__ + - __ifloordiv__ + - __ilshift__ + - __imod__ + - __imul__ + - __ior__ + - __irshift__ + - __isub__ + - __ixor__ + - __lshift__ + - __matmul__ + - __mod__ + - __mul__ + - __nonzero__ + - __or__ + - __radd__ + - __rmul__ + - __rshift__ + - __setitem__ + - __sub__ + - __truediv__ + - __xor__ + - abs + - abs_ + - absolute + - absolute_ + - acos + - acos_ + - acosh + - acosh_ + - add + - add_ + - addbmm + - addbmm_ + - addcdiv + - addcdiv_ + - addcmul + - addcmul_ + - addmm + - addmm_ + - addmv + - addmv_ + - addr + - addr_ + - align_as + - align_to + - all + - allclose + - amax + - amin + - angle + - any + - arccos + - arccos_ + - arccosh + - arccosh_ + - arcsin + - arcsin_ + - arcsinh + - arcsinh_ + - arctan + - arctan_ + - arctanh + - arctanh_ + - argmax + - argmin + - argsort + - asin + - asin_ + - asinh + - asinh_ + - atan + - atan2 + - atan2_ + - atan_ + - atanh + - atanh_ + - baddbmm + - baddbmm_ + - bernoulli + - bernoulli_ + - bincount + - bitwise_and + - bitwise_and_ + - bitwise_not + - bitwise_not_ + - bitwise_or + - bitwise_or_ + - bitwise_xor + - bitwise_xor_ + - bmm + - broadcast_to + - cauchy_ + - ceil + - ceil_ + - cholesky + - chunk + - clamp + - cholesky_solve + - 
cholesky_inverse + - clamp_ + - clamp_max + - clamp_max_ + - clip + - clamp_min + - clamp_min_ + - clip_ + - copysign + - copysign_ + - cos + - cos_ + - cosh + - cosh_ + - count_nonzero + - cummax + - cummin + - cumprod + - cumprod_ + - cumsum + - cumsum_ + - deg2rad + - deg2rad_ + - det + - diag + - diag_embed + - diagflat + - diagonal + - diff + - dist + - digamma + - digamma_ + - div + - div_ + - divide + - divide_ + - dot + - eig + - eq + - eq_ + - erf + - equal + - erf_ + - erfc + - erfc_ + - erfinv + - erfinv_ + - exp + - exp2 + - exp2_ + - expm1 + - exp_ + - expm1_ + - exponential_ + - fill_ + - fix + - fill_diagonal_ + - fix_ + - flip + - fliplr + - flatten + - flipud + - float_power + - float_power_ + - floor + - floor_ + - floor_divide + - floor_divide_ + - fmax + - fmin + - fmod + - fmod_ + - frac + - frac_ + - gather + - gcd + - gcd_ + - ge + - ge_ + - geometric_ + - geqrf + - ger + - greater + - greater_ + - gt + - gt_ + - greater_equal + - greater_equal_ + - hardshrink + - heaviside + - heaviside_ + - histc + - hypot + - hypot_ + - igamma + - igamma_ + - igammac + - igammac_ + - index_add + - index_add_ + - inverse + - index_copy + - index_copy_ + - index_fill + - index_fill_ + - index_put + - index_put_ + - inner + - index_select + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - isreal + - kron + - kthvalue + - lcm + - lcm_ + - ldexp + - ldexp_ + - le + - le_ + - lerp + - lerp_ + - where + - less + - less_ + - less_equal + - less_equal_ + - lgamma + - lgamma_ + - log + - log10 + - log10_ + - log1p + - log1p_ + - log2 + - log2_ + - log_ + - log_normal_ + - log_softmax + - logcumsumexp + - logdet + - logaddexp + - logaddexp2 + - logical_and + - logical_and_ + - logical_not + - logit + - logical_not_ + - logical_or + - logical_or_ + - logical_xor + - logical_xor_ + - logit_ + - logsumexp + - lstsq + - lt + - lt_ + - lu_solve + - map2_ + - map_ + - masked_fill + - matmul + - masked_fill_ + - masked_scatter + - masked_scatter_ + - 
masked_select + - matrix_exp + - max + - maximum + - mean + - matrix_power + - median + - min + - minimum + - mm + - mode + - msort + - mul + - mul_ + - multinomial + - multiply + - multiply_ + - mv + - mvlgamma + - mvlgamma_ + - nansum + - narrow + - narrow_copy + - ne + - ne_ + - neg + - neg_ + - negative + - negative_ + - nonzero + - norm + - normal_ + - not_equal + - not_equal_ + - permute + - pinverse + - polygamma + - pow + - pow_ + - polygamma_ + - prelu + - prod + - put_ + - rad2deg + - rad2deg_ + - ravel + - real + - reciprocal + - reciprocal_ + - relu + - relu_ + - remainder + - repeat_interleave + - reshape + - remainder_ + - renorm + - renorm_ + - repeat + - reshape_as + - resize_ + - resize_as_ + - roll + - rot90 + - round + - round_ + - rsqrt + - rsqrt_ + - scatter + - scatter_ + - scatter_add + - scatter_add_ + - select + - sgn + - sgn_ + - sigmoid + - sigmoid_ + - sign + - sign_ + - signbit + - sin + - sin_ + - sinc + - sinc_ + - sinh + - sinh_ + - slogdet + - smm + - softmax + - solve + - sort + - split_with_sizes + - sqrt + - sqrt_ + - square + - square_ + - squeeze + - squeeze_ + - sspaddmm + - std + - sub + - sub_ + - sum + - sum_to_size + - svd + - symeig + - t + - t_ + - take + - tan + - tan_ + - tanh + - tanh_ + - tensor_split + - tile + - topk + - transpose + - transpose_ + - triangular_solve + - tril + - tril_ + - triu + - true_divide + - triu_ + - true_divide_ + - trunc + - trunc_ + - type_as + - unbind + - unflatten + - unfold + - unsafe_chunk + - unsqueeze + - unsafe_split + - unsafe_split_with_sizes + - var + - vdot + - unsqueeze_ + - view_as + - xlogy + - xlogy_ + +torch: + - linalg.norm + - linalg.vector_norm + - linalg.matrix_norm + - linalg.diagonal + - linalg.det + - linalg.slogdet + - linalg.cond + - linalg.matrix_rank + - linalg.qr + - linalg.lu + - linalg.lu_factor + - linalg.svd + - linalg.svdvals + - linalg.solve + - linalg.lstsq + - linalg.inv + - linalg.pinv + - linalg.matrix_exp + - linalg.matrix_power + - linalg.cross + - 
linalg.matmul + - linalg.vecdot + - linalg.multi_dot + - linalg.householder_product + - linalg.tensorsolve + - linalg.vander + - linalg.cholesky_ex + - linalg.inv_ex + - linalg.solve_ex + - linalg.lu_factor_ex + - linalg.ldl_factor + - linalg.ldl_factor_ex + - _adaptive_avg_pool2d + - _add_relu + - _add_relu_ + - _aminmax + - _batch_norm_impl_index + - _convolution + - _foreach_norm + - _softmax_backward_data + - abs + - abs_ + - absolute + - acos + - acos_ + - acosh + - acosh_ + - adaptive_avg_pool1d + - adaptive_max_pool1d + - add + - addbmm + - addcdiv + - addcmul + - addmm + - addmv + - addmv_ + - addr + - amax + - affine_grid_generator + - align_tensors + - all + - alpha_dropout + - amin + - alpha_dropout_ + - angle + - any + - arange + - arccos + - arccos_ + - arccosh + - arccosh_ + - arcsin + - arcsin_ + - arcsinh + - arcsinh_ + - arctan + - arctan_ + - arctanh + - arctanh_ + - argmax + - argmin + - argsort + - asin + - asin_ + - asinh + - asinh_ + - atan + - atan2 + - atan_ + - atanh + - atanh_ + - atleast_1d + - atleast_2d + - atleast_3d + - avg_pool1d + - baddbmm + - bartlett_window + - batch_norm_backward_elemt + - batch_norm_backward_reduce + - batch_norm_elemt + - batch_norm_gather_stats + - batch_norm_gather_stats_with_counts + - bernoulli + - batch_norm_stats + - batch_norm_update_stats + - bilinear + - bincount + - binomial + - binary_cross_entropy_with_logits + - bitwise_and + - bitwise_not + - bitwise_or + - bitwise_xor + - blackman_window + - block_diag + - bmm + - broadcast_tensors + - broadcast_to + - bucketize + - cartesian_prod + - cat + - cdist + - ceil + - ceil_ + - celu + - celu_ + - chain_matmul + - channel_shuffle + - cholesky + - cholesky_inverse + - cholesky_solve + - choose_qparams_optimized + - chunk + - clamp + - clamp_ + - clamp_max + - clamp_max_ + - clamp_min + - clamp_min_ + - clip + - clip_ + - clone + - column_stack + - combinations + - concat + - concatenate + - constant_pad_nd + - conv1d + - conv2d + - conv3d + - conv_tbc + 
- conv_transpose1d + - conv_transpose2d + - conv_transpose3d + - cos + - convolution + - copysign + - cos_ + - cosh + - cosh_ + - cosine_embedding_loss + - cosine_similarity + - count_nonzero + - cov + - cross + - ctc_loss + - cummax + - cummin + - cumprod + - cumsum + - deg2rad + - deg2rad_ + - det + - diag + - diag_embed + - diff + - diagflat + - diagonal + - digamma + - dist + - div + - divide + - dot + - dropout + - dropout_ + - dsmm + - dstack + - eig + - einsum + - embedding + - embedding_bag + - embedding_renorm_ + - eq + - equal + - erf + - erf_ + - erfc + - erfc_ + - erfinv + - exp + - exp2 + - exp2_ + - exp_ + - expm1 + - expm1_ + - eye + - feature_dropout + - feature_alpha_dropout + - feature_alpha_dropout_ + - feature_dropout_ + - fix + - fill_ + - fix_ + - flatten + - flip + - fliplr + - flipud + - float_power + - floor + - floor_ + - floor_divide + - fmax + - fmin + - fmod + - frac + - frac_ + - full + - frobenius_norm + - full_like + - gather + - gcd + - gcd_ + - ge + - geqrf + - ger + - greater + - greater_equal + - grid_sampler + - grid_sampler_2d + - group_norm + - grid_sampler_3d + - gru + - gru_cell + - gt + - hamming_window + - hann_window + - hardshrink + - heaviside + - hinge_embedding_loss + - histc + - hsmm + - hspmm + - hstack + - hypot + - igamma + - igammac + - index_add + - index_copy + - inner + - index_fill + - index_put + - index_put_ + - index_select + - instance_norm + - inverse + - isclose + - isfinite + - isinf + - isnan + - isneginf + - isposinf + - istft + - kaiser_window + - kl_div + - kron + - kthvalue + - layer_norm + - lcm + - lcm_ + - ldexp + - ldexp_ + - le + - lerp + - less + - less_equal + - lgamma + - linspace + - log + - log10 + - log10_ + - log1p + - log1p_ + - log2 + - log2_ + - log_softmax + - log_ + - logaddexp + - logaddexp2 + - logcumsumexp + - logdet + - logical_and + - logical_not + - logical_or + - logical_xor + - logit + - logit_ + - logspace + - logsumexp + - lstm + - lstm_cell + - lstsq + - lt + - lu_solve 
+ - lu_unpack + - masked_fill + - margin_ranking_loss + - masked_scatter + - masked_select + - matrix_exp + - matmul + - matrix_power + - matrix_rank + - max + - max_pool1d + - max_pool2d + - max_pool1d_with_indices + - max_pool3d + - maximum + - mean + - median + - min + - minimum + - mm + - mode + - moveaxis + - movedim + - msort + - mul + - multinomial + - multiply + - mv + - mvlgamma + - nan_to_num + - nan_to_num_ + - nanmedian + - nansum + - narrow + - native_batch_norm + - native_group_norm + - narrow_copy + - native_layer_norm + - native_norm + - ne + - neg + - negative + - neg_ + - negative_ + - nextafter + - nonzero + - norm + - norm_except_dim + - normal + - not_equal + - nuclear_norm + - ones_like + - outer + - pairwise_distance + - pdist + - permute + - pinverse + - pixel_shuffle + - pixel_unshuffle + - poisson + - poisson_nll_loss + - polar + - polygamma + - pow + - prelu + - prod + - qr + - quantile + - rad2deg + - rad2deg_ + - range + - ravel + - real + - reciprocal + - relu + - reciprocal_ + - relu_ + - remainder + - renorm + - repeat_interleave + - reshape + - resize_as_ + - roll + - rot90 + - round + - round_ + - rrelu + - rrelu_ + - rsqrt + - row_stack + - rsqrt_ + - rsub + - saddmm + - scalar_tensor + - scatter + - select + - scatter_add + - searchsorted + - selu + - selu_ + - sgn + - sigmoid + - sigmoid_ + - sign + - signbit + - sin + - sin_ + - sinc + - sinc_ + - sinh + - sinh_ + - slogdet + - smm + - softmax + - solve + - sort + - sparse_coo_tensor + - square + - split + - split_with_sizes + - spmm + - sqrt + - sqrt_ + - square_ + - squeeze + - sspaddmm + - stack + - std + - std_mean + - stft + - sub + - subtract + - sum + - svd + - swapaxes + - swapdims + - symeig + - t + - take + - take_along_dim + - tan + - tan_ + - tanh + - tanh_ + - tensordot + - tensor_split + - threshold + - threshold_ + - tile + - topk + - transpose + - trapz + - triangular_solve + - tril + - tril_indices + - triplet_margin_loss + - triu + - triu_indices + - 
true_divide + - trunc + - trunc_ + - unique_consecutive + - xlogy + - unbind + - unsafe_chunk + - unsafe_split + - vander + - var + - vdot + - unsafe_split_with_sizes + - unsqueeze + - var_mean + - vstack + - where + - xlogy_ + +_VF: + - lstm + +torch_npu: + - one_ + - npu_sort_v2 + - npu_transpose + - npu_broadcast + - npu_dtype_cast + - empty_with_format + - npu_one_hot + - npu_stride_add + - npu_ps_roi_pooling + - npu_roi_align + - npu_nms_v4 + - npu_iou + - npu_nms_with_mask + - npu_pad + - npu_bounding_box_encode + - npu_bounding_box_decode + - npu_batch_nms + - npu_slice + - _npu_dropout + - npu_indexing + - npu_ifmr + - npu_max + - npu_scatter + - npu_layer_norm_eval + - npu_alloc_float_status + - npu_confusion_transpose + - npu_bmmV2 + - fast_gelu + - npu_sub_sample + - npu_deformable_conv2d + - npu_mish + - npu_anchor_response_flags + - npu_yolo_boxes_encode + - npu_grid_assign_positive + - npu_normalize_batch + - npu_masked_fill_range + - npu_linear + - npu_bert_apply_adam + - npu_giou + - npu_ciou + - npu_diou + - npu_sign_bits_pack + - npu_sign_bits_unpack + - npu_flash_attention + - npu_scaled_masked_softmax + - npu_rotary_mul + - npu_roi_align + - npu_roi_alignbk + - npu_ptiou + - npu_fusion_attention + - npu_dropout_with_add_softmax + - npu_random_choice_with_mask + - npu_rotated_iou + - npu_conv2d + - npu_conv3d + - npu_softmax_cross_entropy_with_logits + - npu_all_gather_base_mm + - npu_swiglu + - npu_rms_norm + - npu_mm_reduce_scatter_base + - npu_mm_all_reduce_base + - npu_conv_transpose2d + - npu_convolution + - npu_convolution_transpose + - npu_min + - npu_nms_rotated + - npu_reshape + - npu_rotated_box_decode + - npu_rotated_box_encode + - npu_rotated_overlaps + - npu_silu + - npu_fused_attention_score + - npu_multi_head_attention + - npu_gru + - npu_incre_flash_attention + - npu_prompt_flash_attention + - npu_lstm + - npu_apply_adam + +aten: + - signbit + - logical_not_ + - _foreach_copy_ + - clamp + - hardswish_ + - arcsin_ + - logsumexp + - 
native_group_norm + - special_i1e + - bitwise_and + - new_full + - fft_ihfft + - _adaptive_avg_pool2d + - scatter_add + - abs + - selu + - exponential + - silu + - _native_batch_norm_legit_functional + - special_hermite_polynomial_h + - tanh_ + - log_sigmoid_forward + - _fft_c2c + - heaviside_ + - sigmoid_backward + - zeros_like + - as_strided_scatter + - trace + - _assert_async + - avg_pool2d_backward + - exp2 + - binary_cross_entropy_backward + - geometric + - fft_ihfftn + - smooth_l1_loss + - multiply + - __lshift__ + - binary_cross_entropy_with_logits + - _embedding_bag + - arange + - linalg_qr + - _embedding_bag_forward_only + - _unsafe_view + - remainder + - cholesky_inverse + - sub_ + - zero + - fix + - xlogy + - __doc__ + - rsqrt_ + - cummin + - __xor__ + - eye + - _fused_adam + - ceil + - nll_loss2d_backward + - replication_pad3d_backward + - fill_ + - logaddexp2 + - _thnn_fused_lstm_cell_backward_impl + - native_dropout + - fft_ifft + - expand + - _cdist_backward + - avg_pool3d_backward + - round_ + - topk + - max_unpool3d + - xlogy_ + - reflection_pad2d_backward + - addcdiv_ + - relu6 + - multilabel_margin_loss_forward + - prelu + - logaddexp + - _cholesky_solve_helper + - _foreach_addcdiv + - arctan_ + - fft_irfftn + - logical_or + - bitwise_or_ + - hardtanh_backward + - uniform + - less_equal + - _foreach_sub + - linalg_cholesky_ex + - hardswish + - fft_fft2 + - sign + - min + - norm + - asin + - addcmul_ + - stft + - col2im + - special_chebyshev_polynomial_u + - adaptive_max_pool3d + - __ilshift__ + - _resize_output + - gather + - lu_unpack + - native_batch_norm_backward + - sigmoid + - sqrt + - new_empty_strided + - _foreach_lerp_ + - mean + - scatter_add_ + - _fft_c2r + - rand_like + - true_divide_ + - gcd_ + - multinomial + - permute + - index_put_ + - arcsinh_ + - log1p_ + - index_add + - atan + - glu_backward + - searchsorted + - fill + - _unsafe_index + - index_reduce_ + - replication_pad2d + - expm1_ + - hardsigmoid + - addmm + - fft_fftn + - 
fft_ifftshift + - special_modified_bessel_k1 + - fft_rfft + - ge + - _adaptive_avg_pool2d_backward + - argmin + - linalg_lu_factor_ex + - atanh_ + - addmv + - _foreach_sqrt_ + - huber_loss_backward + - empty_like + - softshrink + - subtract_ + - bitwise_left_shift_ + - special_modified_bessel_i0 + - _nested_tensor_from_tensor_list + - slice_backward + - special_modified_bessel_i1 + - special_chebyshev_polynomial_t + - conj_physical + - _cdist_forward + - margin_ranking_loss + - max_pool3d_with_indices_backward + - _foreach_reciprocal_ + - lcm + - transpose_ + - cudnn_batch_norm_backward + - reciprocal + - copysign_ + - _foreach_pow + - rad2deg + - _foreach_sqrt + - negative + - replication_pad3d + - atanh + - _linalg_eigh + - igamma_ + - special_i0e + - linalg_ldl_factor_ex + - special_ndtri + - logit + - diagonal_copy + - triu + - silu_ + - polygamma + - square_ + - nextafter_ + - special_scaled_modified_bessel_k0 + - bitwise_not + - var + - mkldnn_rnn_layer_backward + - upsample_bilinear2d + - arctan2 + - clone + - arcsin + - new_ones + - soft_margin_loss + - nan_to_num + - huber_loss + - linalg_lu_solve + - elu_backward + - acosh + - __ior__ + - _unsafe_index_put + - __or__ + - _linalg_slogdet + - arcsinh + - select_scatter + - less_ + - reflection_pad1d + - istft + - reflection_pad2d + - diagonal_backward + - special_entr + - _softmax_backward_data + - randn + - celu + - embedding + - igammac_ + - new_zeros + - native_layer_norm_backward + - nonzero_static + - diagonal_scatter + - grid_sampler_2d + - smooth_l1_loss_backward + - _to_copy + - fft_irfft2 + - relu_ + - fmod + - log1p + - i0 + - mse_loss_backward + - copy + - special_laguerre_polynomial_l + - addmv_ + - quantized_gru + - diag_embed + - acos + - fmod_ + - linalg_cross + - mvlgamma_ + - _foreach_mul + - cummax + - less_equal_ + - ne + - to + - _pdist_forward + - special_xlog1py + - digamma + - lgamma + - mv + - softplus + - special_bessel_y1 + - pin_memory + - logical_xor_ + - cat + - 
grid_sampler_2d_backward + - frac_ + - dropout + - unsafe_chunk + - masked_fill_ + - log + - negative_ + - _scaled_dot_product_flash_attention + - _amp_foreach_non_finite_check_and_unscale_ + - randn_like + - add + - roll + - threshold + - gcd + - asinh + - round + - t_ + - unfold_backward + - scatter_reduce + - softplus_backward + - bitwise_right_shift_ + - pdist + - select_backward + - relu + - special_bessel_j1 + - asinh_ + - pow + - fft_fftshift + - clamp_max_ + - logical_xor + - index_reduce + - _foreach_add_ + - adaptive_max_pool2d + - adaptive_max_pool3d_backward + - tan + - addbmm_ + - cosh_ + - __rshift__ + - _foreach_maximum + - fft_ifftn + - special_spherical_bessel_j0 + - split_with_sizes + - divide_ + - neg_ + - nll_loss + - _euclidean_dist + - pairwise_distance + - _adaptive_avg_pool3d + - slice + - absolute_ + - gelu_backward + - arccos + - sin + - tril_ + - triu_ + - fft_irfft + - flip + - _foreach_sign + - linalg_householder_product + - _list_to_tensor + - cumprod + - randint_like + - item + - narrow_copy + - tanh + - linalg_vector_norm + - _cudnn_rnn + - _scaled_dot_product_efficient_attention + - _reshape_alias + - _linalg_det + - constant_pad_nd + - _linalg_svd + - sinh_ + - view + - nll_loss_backward + - greater + - sqrt_ + - avg_pool3d + - arctan + - le_ + - _pdist_backward + - _adaptive_avg_pool3d_backward + - log_ + - logical_or_ + - mse_loss + - rrelu_with_noise_backward + - _native_batch_norm_legit + - log10 + - scatter_ + - atan2_ + - greater_equal + - index_select + - __iand__ + - digamma_ + - eq + - divide + - cholesky_solve + - _prelu_kernel + - fft_ifft2 + - _foreach_neg_ + - alias + - erfc_ + - not_equal + - mul + - gru + - _dir + - glu + - clip + - lt + - rsqrt + - avg_pool2d + - conj_physical_ + - quantized_lstm + - erfinv_ + - log10_ + - float_power_ + - _functional_assert_async + - hardtanh + - logical_and_ + - _resize_output_ + - clamp_min + - _functional_sym_constrain_range_for_size + - _addmm_activation + - bucketize + - 
_thnn_fused_lstm_cell + - zeros + - reflection_pad1d_backward + - tan_ + - bitwise_not_ + - addmm_ + - absolute + - as_strided + - special_ndtr + - gt_ + - baddbmm + - special_log_ndtr + - hardshrink + - fft_hfft + - hypot + - native_layer_norm + - _scaled_dot_product_flash_attention_backward + - floor_divide + - is_same_size + - std + - floor_divide_ + - clamp_min_ + - _foreach_sign_ + - std_mean + - tanh_backward + - _foreach_addcmul + - binary_cross_entropy + - threshold_backward + - deg2rad_ + - masked_fill + - linspace + - reflection_pad3d + - mish + - index_copy + - scatter_reduce_ + - _sparse_coo_tensor_with_dims_and_tensors + - __loader__ + - _foreach_div_ + - cosh + - _foreach_maximum_ + - neg + - lift_fresh + - logspace + - selu_ + - leaky_relu_ + - matmul + - _foreach_sub_ + - bitwise_or + - unfold + - fmin + - convolution + - argmax + - maximum + - reflection_pad3d_backward + - fft_fft + - mode + - remainder_ + - _foreach_neg + - erf_ + - special_zeta + - index_add_ + - arccos_ + - lgamma_ + - unsqueeze_ + - gelu_ + - bmm + - _add_relu + - unfold_copy + - not_equal_ + - subtract + - true_divide + - max_pool2d_with_indices_backward + - _native_batch_norm_legit_no_training + - replication_pad1d + - name + - greater_ + - log_normal + - minimum + - alpha_dropout + - rnn_tanh + - _functional_sym_constrain_range + - sum + - _prelu_kernel_backward + - cumsum_ + - ne_ + - _linalg_solve_ex + - native_batch_norm + - igammac + - hypot_ + - exp + - leaky_relu + - new_empty + - cudnn_batch_norm + - resize_as_ + - mm + - triangular_solve + - sign_ + - clamp_max + - bitwise_right_shift + - logical_and + - special_i0 + - index_copy_ + - arctanh_ + - elu + - index + - isposinf + - linalg_solve_triangular + - logcumsumexp + - arccosh + - nan_to_num_ + - nll_loss_forward + - convolution_backward + - sub + - special_scaled_modified_bessel_k1 + - mish_ + - diagonal + - median + - tril + - sgn + - native_group_norm_backward + - stack + - take + - linalg_lu + - log2 + - 
hardsigmoid_ + - erfc + - max + - native_dropout_backward + - logit_ + - addr + - clip_ + - _foreach_minimum_ + - atan_ + - repeat + - cumprod_ + - bitwise_xor_ + - less + - index_put + - rrelu_with_noise + - addbmm + - special_bessel_y0 + - __and__ + - bernoulli_ + - uniform_ + - log2_ + - mul_ + - adaptive_max_pool2d_backward + - _foreach_addcmul_ + - slice_scatter + - isneginf + - pow_ + - renorm_ + - arccosh_ + - replication_pad1d_backward + - bitwise_and_ + - heaviside + - renorm + - special_modified_bessel_k0 + - le + - is_pinned + - __ixor__ + - leaky_relu_backward + - count_nonzero + - _fused_adam_ + - repeat_interleave + - upsample_bicubic2d + - rsub + - arctan2_ + - frac + - scalar_tensor + - rrelu_with_noise_ + - rot90 + - erf + - lerp_ + - expm1 + - full + - sym_constrain_range_for_size + - prod + - normal_ + - elu_ + - special_airy_ai + - nextafter + - split + - addcdiv + - fft_rfft2 + - max_pool3d_with_indices + - positive + - transpose + - mish_backward + - clamp_ + - exp_ + - _foreach_reciprocal + - linalg_matrix_exp + - unsqueeze + - upsample_nearest2d + - sinc_ + - select + - rad2deg_ + - trunc_ + - _make_dep_token + - nanmedian + - fft_hfftn + - hardtanh_ + - sym_constrain_range + - index_fill_ + - deg2rad + - rand + - sinc + - pixel_shuffle + - tril_indices + - copy_ + - _int_mm + - greater_equal_ + - celu_ + - div + - igamma + - exp2_ + - cos + - log_normal_ + - _log_softmax_backward_data + - im2col + - reciprocal_ + - amax + - broadcast_tensors + - erfinv + - __spec__ + - _fused_dropout + - special_hermite_polynomial_he + - aminmax + - rnn_relu + - meshgrid + - var_mean + - eq_ + - upsample_nearest3d + - dot + - zero_ + - floor_ + - fft_rfftn + - special_erfcx + - _foreach_div + - fft_hfft2 + - _upsample_bilinear2d_aa + - sort + - log_sigmoid_backward + - add_ + - copysign + - bernoulli + - special_bessel_j0 + - max_pool2d_with_indices + - _scaled_dot_product_efficient_attention_backward + - t + - _softmax + - arctanh + - hinge_embedding_loss 
+ - hardswish_backward + - fmax + - multiply_ + - floor + - lstm + - i0_ + - cholesky + - where + - __irshift__ + - addcmul + - embedding_dense_backward + - sigmoid_ + - fix_ + - ormqr + - exponential_ + - __name__ + - fft_ihfft2 + - logical_not + - ones + - sgn_ + - sinh + - any + - _foreach_addcdiv_ + - asin_ + - gt + - lift + - squeeze + - grid_sampler_3d_backward + - atan2 + - _fft_r2c + - angle + - silu_backward + - acosh_ + - abs_ + - lerp + - special_i1 + - complex + - ceil_ + - _foreach_minimum + - hardsigmoid_backward + - upsample_nearest1d + - mvlgamma + - acos_ + - lt_ + - grid_sampler_3d + - max_unpool2d + - ones_like + - soft_margin_loss_backward + - _fused_moving_avg_obs_fq_helper + - isnan + - nansum + - baddbmm_ + - amin + - isinf + - bitwise_left_shift + - unsafe_split_with_sizes + - full_like + - sin_ + - bitwise_xor + - linalg_ldl_solve + - cos_ + - div_ + - polar + - randint + - trunc + - __package__ + - nll_loss2d_forward + - diag + - argsort + - _foreach_mul_ + - square + - detach + - affine_grid_generator + - _pin_memory + - geometric_ + - unbind + - randperm + - upsample_nearest2d_backward + - all + - threshold_ + - unsafe_split + - cauchy + - normal + - linalg_inv_ex + - multi_margin_loss + - cumsum + - gelu + - index_fill + - scatter + - mkldnn_rnn_layer + - ge_ + - dist + - _foreach_add + - logit_backward + - triu_indices + - lcm_ + - empty_strided + - replication_pad2d_backward + - cauchy_ + - _log_softmax + - vdot + +distributed: + - send + - recv + - broadcast + - all_reduce + - reduce + - all_gather + - gather + - isend + - irecv + - scatter + - reduce_scatter + - _reduce_scatter_base + - _all_gather_base + - all_to_all_single \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_aten.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_aten.py new file mode 100644 index 0000000000000000000000000000000000000000..8666287095bbe12f7e9d5f314cff1db75d74a108 --- /dev/null +++ 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
# Copyright (C) 2023-2023. Huawei Technologies Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import os
import torch

import yaml

from .hook_module import HOOKModule
from ..common.utils import torch_device_guard, Const
from ..common.file_check import FileOpen


# The aten operators eligible for wrapping are declared in the YAML file
# shipped next to this module, under the 'aten' key.
cur_path = os.path.dirname(os.path.realpath(__file__))
yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml")
with FileOpen(yaml_path, 'r') as f:
    WrapAtenOps = yaml.safe_load(f).get('aten')


# Snapshot every attribute exposed by torch.ops.aten, keyed by name.
aten_func = {name: getattr(torch.ops.aten, name) for name in dir(torch.ops.aten)}


def get_aten_ops():
    """Return the configured op names that actually exist on torch.ops.aten."""
    return set(WrapAtenOps) & set(dir(torch.ops.aten))


class HOOKAtenOP(object):
    """Attribute namespace onto which the wrapped aten ops are bound."""
    pass


class AtenOPTemplate(HOOKModule):
    """Hookable wrapper around a single aten OpOverload / OpOverloadPacket."""

    def __init__(self, op, hook):
        if isinstance(op, torch._ops.OpOverloadPacket):
            op_name_ = op._qualified_op_name.split("::")[-1]
        else:
            # An OpOverload carries an overload suffix, e.g. "add.Tensor";
            # append it unless the qualified name already contains it.
            op_name_ = op.name().split("::")[-1]
            overload_name = op._overloadname
            if '.' + overload_name not in op_name_:
                op_name_ = op_name_ + '.' + overload_name
        self.op = op
        self.prefix_op_name_ = "Aten" + Const.SEP + str(op_name_) + Const.SEP
        super().__init__(hook)

    @torch_device_guard
    def forward(self, *args, **kwargs):
        return self.op(*args, **kwargs)


class AtenOPPacketTemplate():
    """Proxy for an OpOverloadPacket that hooks every overload resolved from it."""

    def __init__(self, opPacket, hook):
        self.opPacket = opPacket
        self.hook = hook

    def __getattr__(self, key):
        try:
            attr = getattr(self.opPacket, key)
        except AttributeError as e:
            raise AttributeError(
                f"AtenOPPacketTemplate or OpOverloadPacket does not have attribute '{key}'.") from e
        # Wrap resolved overloads so their calls are hooked; pass any other
        # attribute (metadata, helpers) through untouched.
        if isinstance(attr, torch._ops.OpOverload):
            return AtenOPTemplate(attr, self.hook)
        return attr

    def overloads(self):
        return self.opPacket.overloads()

    @torch_device_guard
    def __call__(self, *args, **kwargs):
        return AtenOPTemplate(self.opPacket, self.hook)(*args, **kwargs)


def wrap_aten_op(op, hook):
    return AtenOPPacketTemplate(op, hook)


def wrap_aten_ops_and_bind(hook):
    """Bind a hooked proxy for every wrappable OpOverloadPacket onto HOOKAtenOP."""
    for op_name in get_aten_ops():
        if isinstance(aten_func.get(op_name), torch._ops.OpOverloadPacket):
            setattr(HOOKAtenOP, "wrap_" + str(op_name),
                    wrap_aten_op(aten_func.get(op_name), hook))
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os +from functools import wraps +import torch.distributed as dist +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, Const +from ..common.file_check import FileOpen + + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapDistributedOps = yaml.safe_load(f).get('distributed') + + +distributed_func = {} +for f in dir(dist): + distributed_func[f] = getattr(dist, f) + + +def get_distributed_ops(): + global WrapDistributedOps + _all_distributed_ops = dir(dist) + return set(WrapDistributedOps) & set(_all_distributed_ops) + + +class HOOKDistributedOP(object): + pass + + +class DistributedOPTemplate(HOOKModule): + def __init__(self, op_name, build_hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Distributed" + Const.SEP + str(op_name) + Const.SEP + super().__init__(build_hook) + if not self.stop_hook and self.op_name_ in Const.INPLACE_LIST: + self.op_is_inplace = True + + @torch_device_guard + def forward(self, *args, **kwargs): + return distributed_func.get(self.op_name_)(*args, **kwargs) + + +def wrap_distributed_op(op_name, hook): + @wraps(DistributedOPTemplate) + def distributed_op_template(*args, **kwargs): + return DistributedOPTemplate(op_name, hook)(*args, **kwargs) + + distributed_op_template.__name__ = op_name + return distributed_op_template + + +def wrap_distributed_ops_and_bind(hook): + _distributed_ops = get_distributed_ops() + for op_name in _distributed_ops: + 
setattr(HOOKDistributedOP, "wrap_" + str(op_name), wrap_distributed_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_functional.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..46f25efe664fca2bff917b93e3e0632398bdc74e --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_functional.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, Const +from ..common.log import print_info_log_rank_0 +from ..common.file_check import FileOpen + + +def remove_dropout(): + if torch.__version__ > "1.8": + print_info_log_rank_0("For precision comparison, the probability p in the dropout method is set to 0.") + import torch.nn.functional as F + from torch import _VF + from torch.overrides import has_torch_function_unary, handle_torch_function + + def function_dropout(input: torch.Tensor, p: float = 0.5, training: bool = True, + inplace: bool = False) -> torch.Tensor: + if has_torch_function_unary(input): + return handle_torch_function(function_dropout, (input,), input, p=0., training=training, inplace=inplace) + if p < 0.0 or p > 1.0: + raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) + return _VF.dropout_(input, 0., training) if inplace else _VF.dropout(input, 0., training) + + + def function_dropout2d(input: torch.Tensor, p: float = 0.5, training: bool = True, + inplace: bool = False) -> torch.Tensor: + if has_torch_function_unary(input): + return handle_torch_function(function_dropout2d, (input,), input, p=0., training=training, inplace=inplace) + if p < 0.0 or p > 1.0: + raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) + return _VF.feature_dropout_(input, 0., training) if inplace else _VF.feature_dropout(input, 0., training) + + + def function_dropout3d(input: torch.Tensor, p: float = 0.5, training: bool = True, + inplace: bool = False) -> torch.Tensor: + if has_torch_function_unary(input): + return handle_torch_function(function_dropout3d, (input,), input, p=0., training=training, inplace=inplace) + if p < 0.0 or p > 1.0: + raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p)) + return _VF.feature_dropout_(input, 0., training) if inplace else 
_VF.feature_dropout(input, 0., training) + + F.dropout = function_dropout + F.dropout2d = function_dropout2d + F.dropout3d = function_dropout3d + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapFunctionalOps = yaml.safe_load(f).get('functional') + + +def get_functional_ops(): + global WrapFunctionalOps + _all_functional_ops = dir(torch.nn.functional) + return set(WrapFunctionalOps) & set(_all_functional_ops) + + +TorchFunctions = {func: getattr(torch.nn.functional, func) for func in get_functional_ops()} + + +class HOOKFunctionalOP(object): + pass + + +class FunctionalOPTemplate(HOOKModule): + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Functional" + Const.SEP + str(op_name) + Const.SEP + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return TorchFunctions[str(self.op_name_)](*args, **kwargs) + + +def wrap_functional_op(op_name, hook): + def functional_op_template(*args, **kwargs): + return FunctionalOPTemplate(op_name, hook)(*args, **kwargs) + + return functional_op_template + + +def wrap_functional_ops_and_bind(hook): + _functional_ops = get_functional_ops() + for op_name in _functional_ops: + setattr(HOOKFunctionalOP, "wrap_" + op_name, wrap_functional_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_npu_custom.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_npu_custom.py new file mode 100644 index 0000000000000000000000000000000000000000..e910e609c8379e0c66239755c3ec2a44953ef1ec --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_npu_custom.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os +import torch +import torch_npu +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, torch_without_guard_version, Const +from ..common.file_check import FileOpen + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapNpuOps = yaml.safe_load(f).get('torch_npu') + + +def get_npu_ops(): + global WrapNpuOps + if torch_without_guard_version: + _npu_ops = dir(torch.ops.npu) + else: + _npu_ops = dir(torch_npu._C._VariableFunctionsClass) + return set(WrapNpuOps) & set(_npu_ops) + + +class HOOKNpuOP(object): + pass + + +class NpuOPTemplate(HOOKModule): + + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "NPU" + Const.SEP + str(op_name) + Const.SEP + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + if torch_without_guard_version: + return getattr(torch.ops.npu, str(self.op_name_))(*args, **kwargs) + else: + return getattr(torch_npu._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) + + +def wrap_npu_op(op_name, hook): + + def npu_op_template(*args, **kwargs): + return NpuOPTemplate(op_name, hook)(*args, **kwargs) + + return npu_op_template + + +def wrap_npu_ops_and_bind(hook): + _npu_ops = get_npu_ops() + for op_name in _npu_ops: + setattr(HOOKNpuOP, "wrap_" + str(op_name), 
wrap_npu_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_tensor.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..6b49826ab4712d440b4933651eb6b7eab950d023 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_tensor.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, parameter_adapter, Const +from ..common.file_check import FileOpen + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapTensorOps = yaml.safe_load(f).get('tensor') + + +def get_tensor_ops(): + global WrapTensorOps + _tensor_ops = dir(torch.Tensor) + return set(WrapTensorOps) & set(_tensor_ops) + + +TensorOps = {op: getattr(torch.Tensor, op) for op in get_tensor_ops()} + + +class HOOKTensor(object): + pass + + +class TensorOPTemplate(HOOKModule): + + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Tensor" + Const.SEP + str(op_name) + Const.SEP + super().__init__(hook) + + @torch_device_guard + @parameter_adapter + def forward(self, *args, **kwargs): + return TensorOps[str(self.op_name_)](*args, **kwargs) + + +def wrap_tensor_op(op_name, hook): + + def tensor_op_template(*args, **kwargs): + return TensorOPTemplate(op_name, hook)(*args, **kwargs) + + return tensor_op_template + + +def wrap_tensor_ops_and_bind(hook): + _tensor_ops = get_tensor_ops() + for op_name in _tensor_ops: + setattr(HOOKTensor, "wrap_" + str(op_name), wrap_tensor_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_torch.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..889512e9c0c64d9d05dc19cbc30e542c6e5b577c --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_torch.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, Const +from ..common.file_check import FileOpen + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapTorchOps = yaml.safe_load(f).get('torch') + + +def get_torch_ops(): + global WrapTorchOps + _torch_ops = [] + for operation in WrapTorchOps: + if '.' in operation: + operation_sub_module_name, operation_sub_op = operation.rsplit('.', 1) + operation_sub_module = getattr(torch, operation_sub_module_name) + if operation_sub_op in dir(operation_sub_module): + _torch_ops.append(operation) + else: + if hasattr(torch, operation): + _torch_ops.append(operation) + return set(_torch_ops) + + +TorchOps = {} +for op in get_torch_ops(): + if '.' 
in op: + sub_module_name, sub_op = op.rsplit('.', 1) + sub_module = getattr(torch, sub_module_name) + TorchOps[op] = getattr(sub_module, sub_op) + else: + TorchOps[op] = getattr(torch, op) + + + +class HOOKTorchOP(object): + pass + + +class TorchOPTemplate(HOOKModule): + + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "Torch" + Const.SEP + str(op_name) + Const.SEP + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return TorchOps[str(self.op_name_)](*args, **kwargs) + + +def wrap_torch_op(op_name, hook): + + def torch_op_template(*args, **kwargs): + return TorchOPTemplate(op_name, hook)(*args, **kwargs) + + return torch_op_template + + +def wrap_torch_ops_and_bind(hook): + _torch_ops = get_torch_ops() + for op_name in _torch_ops: + setattr(HOOKTorchOP, "wrap_" + op_name, wrap_torch_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/hook_module/wrap_vf.py b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_vf.py new file mode 100644 index 0000000000000000000000000000000000000000..08d47308e077981e65193eea71874d4f9432c6c0 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/hook_module/wrap_vf.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +# Copyright (C) 2019-2020. Huawei Technologies Co., Ltd. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import os + +import torch +import yaml + +from .hook_module import HOOKModule +from ..common.utils import torch_device_guard, Const +from ..common.file_check import FileOpen + +cur_path = os.path.dirname(os.path.realpath(__file__)) +yaml_path = os.path.join(cur_path, "support_wrap_ops.yaml") +with FileOpen(yaml_path, 'r') as f: + WrapVfOps = yaml.safe_load(f).get('_VF') + + +def get_vf_ops(): + global WrapVfOps + # _all_functional_ops = dir(torch.nn.functional) + # assert set(WrapFunctionalOps) <= set(_all_functional_ops) + return WrapVfOps + + +class HOOKVfOP(object): + pass + + +class VfOPTemplate(HOOKModule): + def __init__(self, op_name, hook): + self.op_name_ = op_name + self.prefix_op_name_ = "VF" + Const.SEP + str(op_name) + Const.SEP + super().__init__(hook) + + @torch_device_guard + def forward(self, *args, **kwargs): + return getattr(torch._C._VariableFunctionsClass, str(self.op_name_))(*args, **kwargs) + + +def wrap_vf_op(op_name, hook): + def vf_op_template(*args, **kwargs): + return VfOPTemplate(op_name, hook)(*args, **kwargs) + + return vf_op_template + + +def wrap_vf_ops_and_bind(hook): + _vf_ops = get_vf_ops() + for op_name in _vf_ops: + setattr(HOOKVfOP, "wrap_" + op_name, wrap_vf_op(op_name, hook)) diff --git a/debug/accuracy_tools/atat/pytorch/module_processer.py b/debug/accuracy_tools/atat/pytorch/module_processer.py new file mode 100644 index 0000000000000000000000000000000000000000..434f95910dafd57587d93ad22cb0a0c825083aba --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/module_processer.py @@ -0,0 +1,85 @@ +from functools import wraps +import torch +from torch.utils.hooks import BackwardHook +from .functional.scope import ModuleRangeScope +from .common.utils import Const + + +class ModuleProcesser: + module_stack = [] + api_parent_node = "" + module_node = {} + current_module_name = "" + + def __init__(self, scope): + if isinstance(scope, ModuleRangeScope): + self.scope = scope + else: + self.scope = None + 
BackwardHook.setup_input_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_input_hook) + BackwardHook.setup_output_hook = ModuleProcesser.clone_return_value(BackwardHook.setup_output_hook) + self.module_count = {} + + @staticmethod + def clone_return_value(func): + @wraps(func) + def clone_return_value_func(*args, **kwargs): + result = func(*args, **kwargs) + return ModuleProcesser.clone_if_tensor(result) + + return clone_return_value_func + + @staticmethod + def clone_if_tensor(result): + if isinstance(result, torch.Tensor): + return result.clone() + elif isinstance(result, tuple): + return tuple(ModuleProcesser.clone_if_tensor(x) for x in result) + elif isinstance(result, list): + return list(ModuleProcesser.clone_if_tensor(x) for x in result) + elif isinstance(result, dict): + return {k: ModuleProcesser.clone_if_tensor(v) for k, v in result.items()} + else: + return result + + def node_hook(self, name_prefix, start_or_stop, **kwargs): + + def pre_hook(module, input, output=None): + try: + index = self.module_count_func(name_prefix) + except IndexError as e: + index = None + pass + module.mindstudio_reserved_name = full_name = name_prefix + Const.SEP + str(index) + if self.module_stack: + ModuleProcesser.module_node[full_name] = self.module_stack[-1] + else: + ModuleProcesser.module_node[full_name] = None + + ModuleProcesser.module_stack.append(full_name) + if self.module_stack: + ModuleProcesser.api_parent_node = self.module_stack[-1] + if self.scope: + self.scope.begin_module(full_name) + + def end_hook(module, input, output=None): + if self.module_stack: + ModuleProcesser.module_stack.pop() + if self.module_stack: + ModuleProcesser.api_parent_node = self.module_stack[-1] + else: + ModuleProcesser.api_parent_node = None + if self.scope: + self.scope.end_module(module.mindstudio_reserved_name) + + if "start" in start_or_stop: + return pre_hook + else: + return end_hook + + def module_count_func(self, module_name): + if module_name not in 
import inspect
import fcntl
import os
import threading

import json
import numpy as np
import torch

from atat.core.file_check_util import FileOpen, FileCheckConst, change_mode
from atat.core.utils import get_time
from ..common.utils import print_error_log


# kwargs keys that are never serialized (not JSON-representable, not useful).
special_torch_object = ["memory_format"]
# Serializes in-process writers; fcntl.flock below covers cross-process ones.
lock = threading.Lock()


def write_npy(file_path, tensor):
    """Save a tensor to file_path as .npy and return the absolute path.

    Raises ValueError instead of overwriting an existing file.
    """
    saved_tensor = tensor.contiguous().cpu().detach()
    if tensor.dtype == torch.bfloat16:
        # numpy has no bfloat16; widen to float32 before conversion.
        saved_numpy = saved_tensor.to(torch.float32).numpy()
    else:
        saved_numpy = saved_tensor.numpy()
    if os.path.exists(file_path):
        raise ValueError(f"File {file_path} already exists")
    np.save(file_path, saved_numpy)
    full_path = os.path.abspath(file_path)
    return full_path


class APIInfo:
    """Serializes one API call's arguments into a JSON-friendly structure."""

    def __init__(self, api_name, is_forward, save_real_data=False):
        # NOTE(review): rank is taken from the PID here, not the device or
        # distributed rank — confirm that is intended for file naming.
        self.rank = os.getpid()
        self.api_name = api_name
        self.save_real_data = save_real_data
        self.torch_object_key = {'device': self.analyze_device_in_kwargs, 'dtype': self.analyze_dtype_in_kwargs}
        self.is_forward = is_forward
        self.args_num = 0

    def analyze_element(self, element):
        """Recursively convert an argument structure into serializable form.

        Raises NotImplementedError for unsupported element types.
        """
        if isinstance(element, (list, tuple)):
            return [self.analyze_element(item) for item in element]
        elif isinstance(element, dict):
            out_dict = {}
            for key, value in element.items():
                if key in self.torch_object_key.keys():
                    fun = self.torch_object_key[key]
                    out_dict[key] = fun(value)
                elif key in special_torch_object:
                    continue
                else:
                    out_dict[key] = self.analyze_element(value)
            return out_dict
        elif isinstance(element, torch.Tensor):
            return self.analyze_tensor(element, self.save_real_data)
        elif self.is_builtin_class(element):
            return self.analyze_builtin(element)
        else:
            msg = f"Type {type(element)} is unsupported at analyze_element"
            print_error_log(msg)
            raise NotImplementedError(msg)

    def analyze_tensor(self, arg, save_real_data):
        """Describe a tensor: statistics only, or saved to .npy when requested."""
        single_arg = {}
        if not save_real_data:
            single_arg.update({'type': 'torch.Tensor'})
            single_arg.update({'dtype': str(arg.dtype)})
            single_arg.update({'shape': arg.shape})
            single_arg.update({'Max': self.transfer_types(self.get_tensor_extremum(arg, 'max'), str(arg.dtype))})
            single_arg.update({'Min': self.transfer_types(self.get_tensor_extremum(arg, 'min'), str(arg.dtype))})
            single_arg.update({'requires_grad': arg.requires_grad})
        else:
            dump_path = "./"
            api_args = self.api_name + '.' + str(self.args_num)
            rank = arg.device.index
            if self.is_forward:
                forward_real_data_path = os.path.join(dump_path, "forward_real_data_" + get_time(), f"rank{rank}")
                if not os.path.exists(forward_real_data_path):
                    os.makedirs(forward_real_data_path, 0o755)
                file_path = os.path.join(forward_real_data_path, f'{api_args}.npy')
            else:
                backward_real_data_path = os.path.join(dump_path, "backward_real_data_" + get_time(), f"rank{rank}")
                if not os.path.exists(backward_real_data_path):
                    os.makedirs(backward_real_data_path, 0o755)
                file_path = os.path.join(backward_real_data_path, f'{api_args}.npy')
            self.args_num += 1
            npy_path = write_npy(file_path, arg)
            single_arg.update({'type': 'torch.Tensor'})
            single_arg.update({'datapath': npy_path})
            single_arg.update({'requires_grad': arg.requires_grad})
        return single_arg

    def analyze_builtin(self, arg):
        """Describe a builtin scalar/slice value."""
        single_arg = {}
        if isinstance(arg, slice):
            single_arg.update({'type': "slice"})
            single_arg.update({'value': [arg.start, arg.stop, arg.step]})
        else:
            single_arg.update({'type': self.get_type_name(str(type(arg)))})
            single_arg.update({'value': arg})
        return single_arg

    def transfer_types(self, data, dtype):
        """Cast a tensor extremum to the python type matching its dtype name."""
        if 'int' in dtype or 'bool' in dtype:
            return int(data)
        else:
            return float(data)

    def is_builtin_class(self, element):
        """Return True for None and the scalar/slice types serialized as-is."""
        if element is None or isinstance(element, (bool, int, float, str, slice)):
            return True
        return False

    def analyze_device_in_kwargs(self, element):
        """Describe a device kwarg (string or torch.device-like object)."""
        single_arg = {}
        single_arg.update({'type': 'torch.device'})
        if not isinstance(element, str):
            if hasattr(element, "index"):
                device_value = element.type + ":" + str(element.index)
            else:
                # BUGFIX: the original computed element.type here but never
                # stored it, so index-less devices serialized with no 'value'.
                device_value = element.type
            single_arg.update({'value': device_value})
        else:
            single_arg.update({'value': element})
        return single_arg

    def analyze_dtype_in_kwargs(self, element):
        """Describe a dtype kwarg by its string representation."""
        single_arg = {}
        single_arg.update({'type': 'torch.dtype'})
        single_arg.update({'value': str(element)})
        return single_arg

    def get_tensor_extremum(self, data, operator):
        """Return the max/min of a tensor; bool tensors reduce via membership."""
        if data.dtype is torch.bool:
            if operator == 'max':
                return True in data
            elif operator == 'min':
                return False not in data
        if operator == 'max':
            return torch._C._VariableFunctionsClass.max(data).item()
        else:
            return torch._C._VariableFunctionsClass.min(data).item()

    def get_type_name(self, name):
        """Extract X from "<class 'X'>"."""
        left = name.index("'")
        right = name.rindex("'")
        return name[left + 1: right]


class ForwardAPIInfo(APIInfo):
    """Serialized record of a forward call: args, kwargs and call stack."""

    def __init__(self, name, save_real_data, args, kwargs):
        super().__init__(name, is_forward=True, save_real_data=save_real_data)
        self.analyze_api_input(args, kwargs)
        self.analyze_api_call_stack()

    def analyze_api_input(self, args, kwargs):
        args_info_list = self.analyze_element(args)
        kwargs_info_dict = self.analyze_element(kwargs)
        self.api_info_struct = {self.api_name: {"args": args_info_list, "kwargs": kwargs_info_dict}}

    def analyze_api_call_stack(self):
        # Skip the innermost frames belonging to this tooling itself.
        stack_str = []
        for (_, path, line, func, code, _) in inspect.stack()[3:]:
            if not code:
                continue
            stack_line = " ".join([
                "File", ", ".join([path, " ".join(["line", str(line)]), " ".join(["in", func]),
                                   " ".join(["\n", code[0].strip()])])])
            stack_str.append(stack_line)
        self.stack_info_struct = {self.api_name: stack_str}


class BackwardAPIInfo(APIInfo):
    """Serialized record of a backward call's gradients."""

    def __init__(self, name, grads):
        super().__init__(name, is_forward=False)
        self.analyze_api_input(grads)

    def analyze_api_input(self, grads):
        grads_info_list = self.analyze_element(grads)
        self.grad_info_struct = {self.api_name: grads_info_list}


def write_api_info_json(api_info):
    """Append an APIInfo record to the per-rank JSON file(s) in cwd.

    Raises ValueError for unknown api_info types.
    """
    dump_path = "./"
    rank = api_info.rank
    if isinstance(api_info, ForwardAPIInfo):
        file_path = os.path.join(dump_path, f'forward_info_{rank}.json')
        stack_file_path = os.path.join(dump_path, f'stack_info_{rank}.json')
        write_json(file_path, api_info.api_info_struct)
        write_json(stack_file_path, api_info.stack_info_struct, indent=4)
    elif isinstance(api_info, BackwardAPIInfo):
        file_path = os.path.join(dump_path, f'backward_info_{rank}.json')
        write_json(file_path, api_info.grad_info_struct)
    else:
        raise ValueError(f"Invalid api_info type {type(api_info)}")


def write_json(file_path, data, indent=None):
    """Merge `data`'s top-level keys into the JSON object stored at file_path.

    The file is kept as a single JSON object; each call strips the trailing
    brace and splices the new entries in. Guarded by the module lock
    (threads) and fcntl.flock (processes).
    """
    if not os.path.exists(file_path):
        with FileOpen(file_path, 'w') as f:
            f.write("{\n}")
        change_mode(file_path, FileCheckConst.DATA_FILE_AUTHORITY)
    lock.acquire()
    with FileOpen(file_path, 'a+') as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        try:
            # Drop the closing newline+brace; for a non-empty object also
            # drop one more char and emit a separating comma.
            f.seek(0, os.SEEK_END)
            f.seek(f.tell() - 1, os.SEEK_SET)
            f.truncate()
            if f.tell() > 3:
                f.seek(f.tell() - 1, os.SEEK_SET)
                f.truncate()
                f.write(',\n')
            f.write(json.dumps(data, indent=indent)[1:-1] + '\n}')
        except Exception as e:
            raise ValueError(f"Json save failed:{e}") from e
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)
            lock.release()


def initialize_output_json():
    """Create the real-data directories and verify no stale dump files exist.

    Raises ValueError when a previous dump's directories or files are found.
    """
    dump_path = os.path.realpath("./")
    files = ['forward_info.json', 'backward_info.json', 'stack_info.json']

    forward_real_data_path = os.path.join(dump_path, 'forward_real_data')
    if os.path.exists(forward_real_data_path):
        raise ValueError(f"file {forward_real_data_path} already exists, please remove it first")
    else:
        os.mkdir(forward_real_data_path, mode=0o750)

    backward_real_data_path = os.path.join(dump_path, 'backward_real_data')
    if os.path.exists(backward_real_data_path):
        raise ValueError(f"file {backward_real_data_path} already exists, please remove it first")
    else:
        os.mkdir(backward_real_data_path, mode=0o750)
    for file in files:
        file_path = os.path.join(dump_path, file)
        if os.path.exists(file_path):
            raise ValueError(f"file {file_path} already exists, please remove it first or use a new dump path")
0000000000000000000000000000000000000000..f8f9926b6cd2bab4a347260e0126f551297aec8b --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/overflow_check/overflow_check.py @@ -0,0 +1,190 @@ +import os +from pathlib import Path + +import torch + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +from atat.core.file_check_util import FileCheckConst +from atat.core.utils import print_warn_log, get_time, print_info_log +from ..dump.dump import forward_init_status, forward_acl_dump +from .utils import OverFlowUtil, dump_overflow, check_overflow_npu, clear_overflow_npu +from ..dump.utils import DumpUtil, Const, get_tensor_rank, create_dirs_if_not_exist, check_single_rank_folder +from .info_dump import write_api_info_json, ForwardAPIInfo, BackwardAPIInfo +from ..dump import dump + +backward_init_status = False +api_overflow = [] +forward_api_info = {} +backward_api_info = {} +FORWARD_REAL_DATA_PATH = os.path.join('./', 'forward_real_data') +BACKWARD_REAL_DATA_PATH = os.path.join('./', 'backward_real_data') +rank = os.getpid() +pkl_name = '' + + +def check_overflow_environment(pid): + if not OverFlowUtil.get_overflow_check_switch(): + return False + if pid != os.getpid(): + return False + if is_gpu: + print_warn_log("Overflow detection is not supported in the GPU environment.") + return False + global backward_init_status + if backward_init_status or forward_init_status: + return False + return True + + +def check_data_overflow(x): + if isinstance(x, (tuple, list)) and x: + for i, item in enumerate(x): + if True == check_data_overflow(item): + return True + return False + else: + if isinstance(x, torch.Tensor) and x.numel() != 0 and x.dtype != torch.bool: + if x.is_meta: + return False + if len(x.shape) == 0: + tensor_max = x.cpu().detach().float().numpy().tolist() + tensor_min = tensor_max + else: + tensor_max = torch._C._VariableFunctionsClass.max(x).cpu().detach().float().numpy().tolist() + tensor_min = 
torch._C._VariableFunctionsClass.min(x).cpu().detach().float().numpy().tolist() + # inf + if tensor_max == float('inf') or tensor_min == float('-inf'): + return True + if x.dtype in [torch.float16, torch.float32, torch.bfloat16] and \ + (tensor_max == torch.finfo(x.dtype).max or tensor_min == torch.finfo(x.dtype).min): + return True + # nan + elif tensor_max != tensor_max or tensor_min != tensor_min: + return True + else: + return False + elif isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): + if x == float('inf') or x == float('-inf') or x != x: + return True + else: + return False + else: + return False + + +def check_path(apis, path): + return any(api in path for api in apis) + + +def overflow_check(name, **kwargs): + overflow_nums = OverFlowUtil.overflow_nums + pid = kwargs.get('pid') + dump_mode = DumpUtil.dump_switch_mode + if not pid: + return RuntimeError("Not get the specified process pid.") + + def overflowcheck_hook(module, in_feat, out_feat=None): + if not check_overflow_environment(pid): + return + dump_file = DumpUtil.get_dump_path() + global rank + dump_dir, dump_filename = os.path.split(dump_file) + dump_dir = os.path.join(dump_dir, "step{}".format(DumpUtil.iter_num)) + if not os.path.exists(dump_dir): + Path(dump_dir).mkdir(mode=FileCheckConst.DATA_DIR_AUTHORITY, exist_ok=True) + if DumpUtil.is_single_rank is None: + DumpUtil.is_single_rank = check_single_rank_folder(dump_dir) + dump_file = os.path.join(dump_dir, dump_filename) + rank_this = get_tensor_rank(in_feat, out_feat) + DumpUtil.dump_root = os.path.dirname(DumpUtil.dump_path) + if rank_this is not None and rank != rank_this: + rank = rank_this + dump.rename_() + if DumpUtil.target_rank is not None: + if rank != DumpUtil.target_rank: + return + dump_path = create_dirs_if_not_exist(rank, dump_file) + global pkl_name + pkl_name = dump_path + dump_dir = os.path.split(dump_path)[0] + global api_overflow + global forward_api_info + global backward_api_info + + module_name = name 
+ if hasattr(torch_npu._C, '_npu_is_support_inf_nan') and torch_npu._C._npu_is_support_inf_nan(): + # backward API endwith backward + if module_name.endswith(Const.BACKWARD): + check_feat = in_feat + else: + check_feat = out_feat + module.has_overflow = check_data_overflow(check_feat) + else: + module.has_overflow = check_overflow_npu() + if not module.has_overflow: + if hasattr(module, 'input_args'): + del module.input_args + if hasattr(module, 'input_kwargs'): + del module.input_kwargs + if module.has_overflow and OverFlowUtil.check_overflow_dump_times(overflow_nums): + if overflow_type_judge(in_feat, out_feat, module_name) and DumpUtil.need_replicate: + if module_name.endswith(Const.FORWARD): + forward_api_info.update({name: ForwardAPIInfo(name, True, module.input_args, module.input_kwargs)}) + api_overflow.append(module_name) + else: + api_overflow.append(module_name.replace("backward", "forward")) + backward_api_info.update({name: BackwardAPIInfo(name, out_feat)}) + OverFlowUtil.inc_overflow_dump_times() + dump_file_name = os.path.join(dump_dir, + "{}_{}.pkl".format(module_name, OverFlowUtil.real_overflow_dump_times)) + dump_overflow(module_name, in_feat, out_feat, dump_file_name) + dump.pkl_name = dump_file_name + + print_warn_log("[overflow {} times]: module name :'{}' is overflow and dump file is saved in '{}'." + .format(OverFlowUtil.real_overflow_dump_times, module_name, + os.path.realpath(dump_file_name))) + if dump_mode == "acl": + acl_dump(module, module_name) + dump.write_to_disk() + # clear overflow flag for the next check + clear_overflow_npu() + if not OverFlowUtil.check_overflow_dump_times(overflow_nums): + for key in forward_api_info: + write_api_info_json(forward_api_info[key]) + for key in backward_api_info: + write_api_info_json(backward_api_info[key]) + raise ValueError("[overflow {} times]: dump file is saved in '{}'." 
+ .format(OverFlowUtil.real_overflow_dump_times, os.path.realpath(dump_file_name))) + + def overflow_type_judge(in_feat, out_feat, module_name): + if module_name.endswith(Const.BACKWARD): + check_feat = out_feat + else: + check_feat = in_feat + if check_data_overflow(check_feat): + print_warn_log("module name :'{}' is overflow and its inputs already has an overflow, so you need " + "to go back to find where the overflow started.".format(module_name)) + return False + elif not check_data_overflow(in_feat) and not check_data_overflow(out_feat): + print_warn_log("module name :'{}' is overflow and its inputs and outputs do not overflow, " + "so this is a process overflow".format(module_name)) + return False + else: + print_warn_log("module name :'{}' is overflow. Its input is normal and its output " + "is overflow.".format(module_name)) + return True + + def acl_dump(module, module_name): + if "forward" in module_name: + forward_acl_dump(module, module_name) + if "backward" in module_name: + print_info_log("The overflow is caused by backward operator {}. 
" + "You can use reverse acl dump(mode='acl') to get operator dump data.".format(module_name)) + + return overflowcheck_hook diff --git a/debug/accuracy_tools/atat/pytorch/overflow_check/utils.py b/debug/accuracy_tools/atat/pytorch/overflow_check/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d254d5845505fb2ae0c41c56ac9a0e1d9225ba87 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/overflow_check/utils.py @@ -0,0 +1,114 @@ +import os +import torch + +try: + import torch_npu +except ImportError: + is_gpu = True +else: + is_gpu = False + +from atat.core.utils import check_switch_valid, check_inplace_op, OverflowConst +from ..common.utils import Const +from ..dump.dump import dump_stack_info, get_scalar_data_info, dump_data_by_rank_count, \ + get_not_float_tensor_info, get_float_tensor_info +from ..dump.utils import DumpUtil, make_dump_data_dir + + +class OverFlowUtil(object): + overflow_check_switch = None + overflow_filter_switch = Const.OFF + real_overflow_dump_times = 0 + overflow_nums = 1 + + @staticmethod + def set_overflow_check_switch(switch, filter_switch): + OverFlowUtil.overflow_check_switch = switch + OverFlowUtil.overflow_filter_switch = filter_switch + + @staticmethod + def get_overflow_check_switch(): + if OverFlowUtil.overflow_check_switch is None: + return True + return OverFlowUtil.overflow_check_switch == "ON" + + @staticmethod + def inc_overflow_dump_times(): + OverFlowUtil.real_overflow_dump_times += 1 + + @staticmethod + def check_overflow_dump_times(need_dump_times): + if need_dump_times == -1: + return True + return OverFlowUtil.real_overflow_dump_times < need_dump_times + + +def set_overflow_check_switch(switch, filter_switch=Const.OFF): + check_switch_valid(switch) + check_switch_valid(filter_switch) + + OverFlowUtil.set_overflow_check_switch(switch, filter_switch) + + +def dump_overflow(module_name, in_feat, out_feat, dump_file): + name_template = f"{module_name}" + "_{}" + DumpUtil.dump_data_dir = 
make_dump_data_dir(dump_file) + dump_stack_info(name_template) + if check_inplace_op(name_template): + if Const.PRE_FORWARD in name_template: + name_template = name_template.replace(Const.PRE_FORWARD, Const.FORWARD) + else: + _dump_tensor_completely(in_feat, name_template.format("output")) + return + + if "forward" in name_template: + _dump_tensor_completely(in_feat, name_template.format("input")) + _dump_tensor_completely(out_feat, name_template.format("output")) + else: + _dump_tensor_completely(in_feat, name_template.format("output")) + _dump_tensor_completely(out_feat, name_template.format("input")) + + +def _dump_tensor_completely(x, prefix): + dump_flag = Const.DUMP_RATIO_MAX + 1 + if isinstance(x, (tuple, list)) and x: + for i, item in enumerate(x): + _dump_tensor_completely(item, "{}.{}".format(prefix, i)) + elif isinstance(x, torch.Tensor): + if x.numel() == 0 or len(x.shape) == 0 or not x.is_floating_point(): + if OverFlowUtil.overflow_filter_switch == Const.OFF: + data_info = get_not_float_tensor_info(x) + dump_data_by_rank_count(dump_flag, prefix, data_info) + else: + data_info = get_float_tensor_info(x) + dump_data_by_rank_count(dump_flag, prefix, data_info) + + elif OverFlowUtil.overflow_filter_switch == Const.OFF: + if isinstance(x, bool) or isinstance(x, int) or isinstance(x, float): + data_info = get_scalar_data_info(x) + dump_data_by_rank_count(dump_flag, prefix, data_info) + + +def overflow_debug_mode_enalbe(): + overflow_mode = os.getenv(OverflowConst.OVERFLOW_DEBUG_MODE_ENABLE, Const.ENV_DISABLE) + return overflow_mode == Const.ENV_ENABLE + + +def check_overflow_npu(): + if overflow_debug_mode_enalbe(): + float_status = torch.zeros(8).npu() + result = torch_npu.npu_get_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE) + if (result.cpu()[0] != 0): + return True + else: + return False + else: + return torch_npu._C._check_overflow_npu() + + +def clear_overflow_npu(): + if overflow_debug_mode_enalbe(): + float_status = 
torch.zeros(8).npu() + torch_npu.npu_clear_float_status(float_status, OverflowConst.OVERFLOW_DEBUG_MODE) + else: + torch_npu._C._clear_overflow_npu() \ No newline at end of file diff --git a/debug/accuracy_tools/atat/pytorch/pt_config.py b/debug/accuracy_tools/atat/pytorch/pt_config.py new file mode 100644 index 0000000000000000000000000000000000000000..a0691915cffc93b4a4505b2453560043b44cdc40 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/pt_config.py @@ -0,0 +1,90 @@ +import os +import json +from ..core.common_config import CommonConfig, BaseConfig +from ..core.utils import Const +from ..core.file_check_util import FileOpen + + +#特定任务配置类 +class TensorConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.check_config() + self._check_file_format() + + def _check_file_format(self): + if self.file_format is not None and self.file_format not in ["npy", "bin"]: + raise Exception("file_format is invalid") + + +class StatisticsConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.check_config() + self._check_summary_mode() + + def _check_summary_mode(self): + if self.summary_mode and self.summary_mode not in ["statistics", "md5"]: + raise Exception("summary_mode is invalid") + + +class OverflowCheckConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.overflow_num = json_config.get("overflow_nums") + self.check_mode = json_config.get("check_mode") + self.check_overflow_config() + + def check_overflow_config(self): + if self.overflow_num is not None and not isinstance(self.overflow_num, int): + raise Exception("overflow_num is invalid") + if self.check_mode is not None and self.check_mode not in ["all", "aicore", "atomic"]: + raise Exception("check_mode is invalid") + +class FreeBenchmarkCheckConfig(BaseConfig): + def __init__(self, json_config): + super().__init__(json_config) + self.fuzz_device = json_config.get("fuzz_device") + self.pert_mode = 
json_config.get("pert_mode") + self.handler_type = json_config.get("handler_type") + self.fuzz_level = json_config.get("fuzz_level") + self.fuzz_stage = json_config.get("fuzz_stage") + self.if_preheat = json_config.get("if_preheat") + self.preheat_step = json_config.get("preheat_step") + self.max_sample = json_config.get("max_sample") + self.check_freebenchmark_config() + + def check_freebenchmark_config(self): + if self.if_preheat and self.handler_type == "fix": + raise Exception("Preheating is not supported in fix handler type") + +def parse_task_config(task, json_config): + default_dic = {} + if task == Const.TENSOR: + config_dic = json_config.get(Const.TENSOR) if json_config.get(Const.TENSOR) else default_dic + return TensorConfig(config_dic) + elif task == Const.STATISTICS: + config_dic = json_config.get(Const.STATISTICS) if json_config.get(Const.STATISTICS) else default_dic + return StatisticsConfig(config_dic) + elif task == Const.OVERFLOW_CHECK: + config_dic = json_config.get(Const.OVERFLOW_CHECK) if json_config.get(Const.OVERFLOW_CHECK) else default_dic + return OverflowCheckConfig(config_dic) + elif task == Const.FREE_BENCHMARK: + config_dic = json_config.get(Const.FREE_BENCHMARK) if json_config.get(Const.FREE_BENCHMARK) else default_dic + return FreeBenchmarkCheckConfig(config_dic) + else: + return StatisticsConfig(default_dic) + + +def parse_json_config(json_file_path, task): + if not json_file_path: + config_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + json_file_path = os.path.join(os.path.join(config_dir, "config"), "config.json") + with FileOpen(json_file_path, 'r') as file: + json_config = json.load(file) + common_config = CommonConfig(json_config) + if task and task in Const.TASK_LIST: + task_config = parse_task_config(task, json_config) + else: + task_config = parse_task_config(common_config.task, json_config) + return common_config, task_config \ No newline at end of file diff --git 
a/debug/accuracy_tools/atat/pytorch/service.py b/debug/accuracy_tools/atat/pytorch/service.py new file mode 100644 index 0000000000000000000000000000000000000000..35da18ae91bc8a3510d1f9212f02a853d52f24a6 --- /dev/null +++ b/debug/accuracy_tools/atat/pytorch/service.py @@ -0,0 +1,169 @@ +import os +from pathlib import Path +import functools +import torch +from .functional import build_repair, build_collect_data, build_step_post_process +from .functional.scope import BaseScope +from .common.utils import get_rank_if_initialized, is_gpu, Const +from .common.file_check import FileChecker, FileCheckConst, check_path_before_create +from .common import print_info_log_rank_0 +from .hook_module.api_registry import api_register +from .hook_module import remove_dropout +from .functional.data_processor import ModuleForwardInputsOutputs, ModuleBackwardInputsOutputs +from .module_processer import ModuleProcesser + + +class Service: + make_dir_flag = True + REGISTER_HOOK_KWARGS = ["overflow_nums", "dump_mode", "dump_config"] + + def __init__(self, config): + self.model = None + self.config = config + self.collect_data = build_collect_data(config) + self.module_processor = ModuleProcesser(self.collect_data.scope) + self.repair = build_repair(config) + self.step_post_process = build_step_post_process(config) + self.switch = False + self.current_iter = 0 + self.first_start = True + self.current_rank = None + self.first_touch_dir = True + + def build_hook(self, module_type, name): + def pre_hook(repair, api_or_module_name, module, args, kwargs): + self.collect_data.visit_and_clear_overflow_status(module.mindstudio_reserved_name) + nonlocal module_type, pid + if not self.switch: + return args, kwargs + if repair: + args, kwargs = repair.convert(api_or_module_name, module_type, args, kwargs) + if self.collect_data: + module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=None) + self.collect_data.pre_forward(api_or_module_name, module_type, module, pid, 
module_input_output) + return args, kwargs + + def forward_hook(repair, api_or_module_name, module, args, kwargs, output): + self.collect_data.visit_and_clear_overflow_status(module.mindstudio_reserved_name) + nonlocal module_type, pid + if not self.switch: + return + + if self.collect_data: + module_input_output = ModuleForwardInputsOutputs(args=args, kwargs=kwargs, output=output) + self.collect_data(api_or_module_name, module_type, module, pid, module_input_output) + if self.collect_data.if_return_forward_new_output(): + return self.collect_data.get_forward_new_output() + if repair: + output = repair.invert(api_or_module_name, module_type, output) + + return output + + def backward_hook(repair, api_or_module_name, module, grad_input, grad_output): + nonlocal module_type, pid + if not self.switch: + return + if self.collect_data: + module_input_output = ModuleBackwardInputsOutputs(grad_input=grad_input, grad_output=grad_output) + self.collect_data(api_or_module_name, module_type, module, pid, module_input_output) + + pid = os.getpid() + if module_type == BaseScope.Module_Type_Module: + forward_name_template = name + Const.SEP + "{}" + Const.SEP + "forward" + backward_name_template = name + Const.SEP + "{}" + Const.SEP + "backward" + else: + forward_name_template = name + "forward" + backward_name_template = name + "backward" + pre_forward_hook = functools.partial(pre_hook, self.repair, forward_name_template) + forward_hook = functools.partial(forward_hook, self.repair, forward_name_template) + backward_hook = functools.partial(backward_hook, None, backward_name_template) + return pre_forward_hook, forward_hook, backward_hook + + def step(self): + self.current_iter += 1 + if self.step_post_process: + self.step_post_process() + self.collect_data.update_iter(self.current_iter) + + def start(self, model): + self.model = model + if self.config.step and self.current_iter > max(self.config.step): + self.stop() + raise Exception("atat: exit after iteration 
{}".format(max(self.config.step))) + if self.config.step and self.current_iter not in self.config.step: + return + if self.first_start: + self.current_rank = get_rank_if_initialized() + if self.config.rank and self.current_rank not in self.config.rank: + return + self.register_hook_new() + self.first_start = False + self.switch = True + self.create_dirs() + print_info_log_rank_0(f"Dump switch is turned on at step {self.current_iter}. " + f"Dump data will be saved in {self.dump_iter_dir}.") + + def stop(self): + if self.config.step and self.current_iter not in self.config.step: + return + if self.config.rank and self.current_rank not in self.config.rank: + return + self.switch = False + self.collect_data.write_json() + + + def create_dirs(self): + check_path_before_create(self.config.dump_path) + if not os.path.exists(self.config.dump_path): + Path(self.config.dump_path).mkdir(mode=0o750, exist_ok=True) + file_check = FileChecker(self.config.dump_path, FileCheckConst.DIR) + file_check.common_check() + self.dump_iter_dir = os.path.join(self.config.dump_path, f"step{self.current_iter}") + cur_rank = self.current_rank if self.current_rank is not None else '' + dump_dir = os.path.join(self.dump_iter_dir, f"rank{cur_rank}") + if not os.path.exists(dump_dir): + Path(dump_dir).mkdir(mode=0o750, parents=True, exist_ok=True) + if self.config.task in self.collect_data.tasks_need_tensor_data: + dump_data_dir = os.path.join(dump_dir, "dump_tensor_data") + Path(dump_data_dir).mkdir(mode=0o750, exist_ok=True) + else: + dump_data_dir = None + + dump_file_path = os.path.join(dump_dir, "dump.json") + stack_file_path = os.path.join(dump_dir, "stack.json") + construct_file_path = os.path.join(dump_dir, "construct.json") + free_benchmark_file_path = os.path.join(self.config.dump_path, "free_benchmark.csv") + self.collect_data.update_dump_paths(dump_file_path, stack_file_path, construct_file_path, dump_data_dir, free_benchmark_file_path) + + def register_hook_new(self): + hook_name = 
self.config.task + + if "overflow_check" in hook_name and not is_gpu: + pass + + print_info_log_rank_0("The {} hook function is successfully mounted to the model.".format(hook_name)) + if self.config.level in ["L0", "mix"]: + assert self.model is not None + print_info_log_rank_0("The init dump mode is enabled, and the module dump function will not be available") + for name, module in self.model.named_modules(): + if module == self.model: + continue + prefix = BaseScope.Module_Type_Module + Const.SEP + name + Const.SEP +\ + module.__class__.__name__ + Const.SEP + + pre_forward_hook, forward_hook, backward_hook = self.build_hook(BaseScope.Module_Type_Module, prefix) + module.register_forward_hook(forward_hook, with_kwargs=True) + module.register_full_backward_hook(backward_hook) + + module.register_forward_pre_hook(self.module_processor.node_hook(prefix + "forward", "start")) + module.register_forward_hook(self.module_processor.node_hook(prefix + "forward", "stop")) + module.register_full_backward_pre_hook(self.module_processor.node_hook(prefix + "backward", "start")) + module.register_full_backward_hook(self.module_processor.node_hook(prefix + "backward", "stop")) + + if self.config.level in ["mix", "L1"]: + api_register.initialize_hook(functools.partial(self.build_hook, BaseScope.Module_Type_API)) + api_register.api_modularity() + + if "acc_cmp_dump" in hook_name: + remove_dropout() + diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py index 6b2b62dbac1472058bb389113d83c8dbd6bea362..eccd6b4afe0918f2ab6f44a566a05b9f5eed1ec3 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/common/utils.py @@ -41,7 +41,7 @@ except ImportError: else: is_gpu = False -torch_without_guard_version_list = ['2.1'] +torch_without_guard_version_list = ['2.1', '2.2'] for version in 
torch_without_guard_version_list: if torch.__version__.startswith(version): torch_without_guard_version = True @@ -77,6 +77,7 @@ class Const: BACKWARD = 'backward' FORWARD = 'forward' PRE_FORWARD = "pre_forward" + DELIMITER = '.' # dump mode ALL = "all" @@ -113,10 +114,12 @@ class Const: ENV_ENABLE = "1" ENV_DISABLE = "0" - MAX_SEED_VALUE = 2**32 - 1 + MAX_SEED_VALUE = 2 ** 32 - 1 - INPLACE_LIST = ["broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter", - "_reduce_scatter_base", "_all_gather_base"] + INPLACE_LIST = [ + "broadcast", "all_reduce", "reduce", "all_gather", "gather", "scatter", "reduce_scatter", + "_reduce_scatter_base", "_all_gather_base", "send", "recv", "irecv", "isend", "all_to_all_single" + ] class CompareConst: @@ -201,6 +204,7 @@ class VersionCheck: V1_11 = "1.11" V2_0 = "2.0" V2_1 = "2.1" + V2_2 = "2.2" @staticmethod def check_torch_version(version): @@ -319,11 +323,18 @@ def check_mode_valid(mode, scope=None, api_list=None): raise ValueError("api_list param set invalid, it's must be a list.") mode_check = { Const.ALL: lambda: None, - Const.RANGE: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end].") if len(scope) != 2 else None, - Const.LIST: lambda: ValueError("set_dump_switch, scope param set invalid, it's should not be an empty list.") if len(scope) == 0 else None, - Const.STACK: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end] or [].") if len(scope) > 2 else None, - Const.ACL: lambda: ValueError("set_dump_switch, scope param set invalid, only one api name is supported in acl mode.") if len(scope) != 1 else None, - Const.API_LIST: lambda: ValueError("Current dump mode is 'api_list', but the content of api_list parameter is empty or valid.") if len(api_list) < 1 else None, + Const.RANGE: lambda: ValueError("set_dump_switch, scope param set invalid, it's must be [start, end].") if len( + scope) != 2 else None, + Const.LIST: lambda: 
ValueError( + "set_dump_switch, scope param set invalid, it's should not be an empty list.") if len(scope) == 0 else None, + Const.STACK: lambda: ValueError( + "set_dump_switch, scope param set invalid, it's must be [start, end] or [].") if len(scope) > 2 else None, + Const.ACL: lambda: ValueError( + "set_dump_switch, scope param set invalid, only one api name is supported in acl mode.") if len( + scope) != 1 else None, + Const.API_LIST: lambda: ValueError( + "Current dump mode is 'api_list', but the content of api_list parameter is empty or valid.") if len( + api_list) < 1 else None, Const.API_STACK: lambda: None, } if mode not in Const.DUMP_MODE: @@ -346,7 +357,8 @@ def check_dump_mode_valid(dump_mode): print_warn_log("Please set dump_mode as a list.") dump_mode = [dump_mode] if not all(mode in ["all", "forward", "backward", "input", "output"] for mode in dump_mode): - raise ValueError("Please set dump_mode as a list containing one or more of the following: 'all', 'forward', 'backward', 'input', 'output'.") + raise ValueError( + "Please set dump_mode as a list containing one or more of the following: 'all', 'forward', 'backward', 'input', 'output'.") if 'input' not in dump_mode and 'output' not in dump_mode: dump_mode.extend(['input', 'output']) if 'forward' not in dump_mode and 'backward' not in dump_mode: @@ -380,7 +392,7 @@ def check_compare_param(input_parma, output_path, stack_mode=False, summary_comp check_file_or_directory_path(input_parma.get("bench_dump_data_dir"), True) check_file_or_directory_path(output_path, True) with FileOpen(input_parma.get("npu_pkl_path"), "r") as npu_pkl, \ - FileOpen(input_parma.get("bench_pkl_path"), "r") as bench_pkl: + FileOpen(input_parma.get("bench_pkl_path"), "r") as bench_pkl: check_pkl_file(input_parma, npu_pkl, bench_pkl, stack_mode) @@ -451,14 +463,16 @@ def is_starts_with(string, prefix_list): def check_stack_mode(pkl_fp): api_prefix = "" - api_pattern = r'\[\"([0-9a-zA-Z_.]+_(for|back)ward)_(in|out)put(\.[0-9]+)?' 
+ api_match = "" + api_pattern = r'\[\"([0-9a-zA-Z_.]+.(for|back)ward).(in|out)put(\.[0-9]+)?' is_stack_mode = False for index, line in enumerate(pkl_fp): - if index == 0: - api_match = re.search(api_pattern, line) - api_prefix = api_match.group(1) + if not api_match: + if re.search(api_pattern, line): + api_match = re.search(api_pattern, line) + api_prefix = api_match.group(1) elif api_prefix and line.startswith(f'["{api_prefix}'): - if line.startswith(f'["{api_prefix}_stack_info'): + if line.startswith(f'["{api_prefix}.stack_info'): is_stack_mode = True break else: @@ -636,11 +650,13 @@ def format_value(value): def torch_device_guard(func): if is_gpu or torch_without_guard_version: return func + # Parse args/kwargs matched torch.device objects @torch_npu_device_guard def wrapper(*args, **kwargs): return func(*args, **kwargs) + return wrapper @@ -684,16 +700,15 @@ def get_process_rank(model): return 0, False if local_device.type == 'cpu': print_warn_log("Warning: the debugger is unable to get the rank id. " - "This may cause the dumpped data to be corrupted in the " - "case of distributed training. (You may ignore this if you are using only one card.) " - "Transfer the model to npu or gpu before register_hook() to avoid this warning.") + "This may cause the dumpped data to be corrupted in the " + "case of distributed training. (You may ignore this if you are using only one card.) 
" + "Transfer the model to npu or gpu before register_hook() to avoid this warning.") return 0, False else: return local_device.index, True def parameter_adapter(func): - @wraps(func) def inner(self, *args, **kwargs): if self.op_name_ == "__getitem__" and len(args) > 1 and isinstance(args[1], torch.Tensor): @@ -719,6 +734,7 @@ def parameter_adapter(func): if self.op_name_ == "__eq__" and args[1] is None: return False return func(self, *args, **kwargs) + return inner @@ -730,7 +746,7 @@ def generate_compare_script(dump_path, pkl_file_path, dump_switch_mode): try: with FileOpen(template_path, 'r') as ftemp, \ - os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout: + os.fdopen(os.open(compare_script_path, Const.WRITE_FLAGS, Const.WRITE_MODES), 'w+') as fout: code_temp = ftemp.read() fout.write(code_temp % (pkl_file_path, dump_path, is_api_stack)) except OSError: diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py index be1e0dae76e27bb63dab6016101cd84584cafd62..d9ce305b4496eaa0e706b883584292e0b994edc1 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/acc_compare.py @@ -20,6 +20,7 @@ import multiprocessing import os.path import stat import sys +import torch import numpy as np import pandas as pd @@ -29,7 +30,7 @@ from ..advisor.advisor import Advisor from ..common.utils import check_compare_param, add_time_as_suffix, \ print_info_log, print_warn_log, print_error_log, CompareException, Const, \ CompareConst, format_value, check_file_not_exists, check_configuration_param, \ - is_summary_compare, is_md5_compare + task_dumppath_get from ..common.file_check_util import FileChecker, FileCheckConst, change_mode, FileOpen @@ -227,67 +228,47 @@ def rename_api(npu_name, process): return torch_func -def 
merge_tensor(tensor_list): +def merge_tensor(tensor_list, summary_compare, md5_compare): op_dict = {} op_dict["op_name"] = [] op_dict["input_struct"] = [] + op_dict["kwargs_struct"] = [] op_dict["output_struct"] = [] op_dict["summery"] = [] op_dict["stack_info"] = [] + all_mode_bool = summary_compare == False and md5_compare == False + if all_mode_bool: + op_dict["data_name"] = [] + for tensor in tensor_list: - if tensor[0].find("stack_info") != -1: - if len(tensor) != Const.STACK_COLUMN_NUM: - print_error_log(f"This stack_info data is not complete. {tensor}") - raise CompareException(CompareException.INVALID_DATA_ERROR) - op_dict["stack_info"].append(tensor[1]) + if len(tensor) == 2: + op_dict['stack_info'].append(tensor['full_info']) break - op_dict["op_name"].append(tensor[0]) - if len(tensor) != Const.SUMMARY_COLUMN_NUM: - print_error_log(f"This summary data is not complete. {tensor}") - raise CompareException(CompareException.INVALID_DATA_ERROR) - if tensor[0].find("input") != -1: - op_dict["input_struct"].append((tensor[3], tensor[4], tensor[2])) - elif tensor[0].find("output") != -1: - op_dict["output_struct"].append((tensor[3], tensor[4], tensor[2])) - - if tensor[1] <= Const.DUMP_RATIO_MAX: - op_dict["summery"].append(tensor[5]) - - return op_dict - + op_dict["op_name"].append(tensor['full_op_name']) + if not md5_compare: + if tensor['full_op_name'].find("input") != -1: + op_dict["input_struct"].append((tensor['dtype'], tensor['shape'])) + elif tensor['full_op_name'].find("kwarg") != -1: + op_dict["kwargs_struct"].append((tensor['dtype'], tensor['shape'])) + elif tensor['full_op_name'].find("output") != -1: + op_dict["output_struct"].append((tensor['dtype'], tensor['shape'])) + else: + if tensor['full_op_name'].find("input") != -1: + op_dict["input_struct"].append((tensor['dtype'], tensor['shape'], tensor['md5'])) + elif tensor['full_op_name'].find("kwarg") != -1: + op_dict["kwargs_struct"].append((tensor['dtype'], tensor['shape'], tensor['md5'])) + elif 
tensor['full_op_name'].find("output") != -1: + op_dict["output_struct"].append((tensor['dtype'], tensor['shape'], tensor['md5'])) -def read_op(ops_queue, pkl_file_handle, stack_mode): - tensor_list = [] - read_err = False - read_output_flag = {"last_line": False, "curr_line": False} - end_flag = "stack_info" if stack_mode is True else "output" + op_dict["summery"].append([tensor['Max'], tensor['Min'], tensor['Mean'], tensor['Norm']]) - while True: - curr_pos = pkl_file_handle.tell() - tensor_line = pkl_file_handle.readline() - if len(tensor_line) == 0 and not read_output_flag.get("curr_line"): - read_err = True - break - if tensor_line == '\n': - continue - if len(tensor_line) != 0: - tensor_data = json.loads(tensor_line) - if not isinstance(tensor_data, list): - print_error_log(f"This data is not a list, please check the dump data pkl file. {tensor_data}") - raise CompareException(CompareException.INVALID_DATA_ERROR) - read_output_flag["last_line"] = read_output_flag.get("curr_line") - read_output_flag["curr_line"] = True if tensor_data[0].find(end_flag) != -1 else False - - if (read_output_flag.get("last_line") and not read_output_flag.get("curr_line")) \ - or (len(tensor_line) == 0 and read_output_flag.get("curr_line")): # end of file scenario - ops_queue.append(merge_tensor(tensor_list)) - # the pos of the handle needs to restore to the start of the next api. 
- pkl_file_handle.seek(curr_pos, 0) - break - tensor_list.append(tensor_data) + if all_mode_bool: + op_dict["data_name"].append(tensor['data_name']) - return not read_err + if not op_dict["kwargs_struct"]: + del op_dict["kwargs_struct"] + return op_dict def match_op(npu_queue, bench_queue, fuzzy_match): @@ -308,7 +289,17 @@ def get_accuracy(result, n_dict, b_dict, summary_compare=False, md5_compare=Fals npu_stack_info = n_dict.get("stack_info", None) bench_stack_info = b_dict.get("stack_info", None) has_stack = npu_stack_info and bench_stack_info + + all_mode_bool = summary_compare == False and md5_compare == False + if all_mode_bool: + npu_data_name = n_dict.get("data_name", None) + bench_data_name = b_dict.get("data_name", None) + has_data_name = False + for index in range(min_len): + if all_mode_bool: + has_data_name = npu_data_name[n_start + index] and bench_data_name[b_start + index] + n_name = n_dict['op_name'][n_start + index] b_name = b_dict['op_name'][b_start + index] n_struct = n_dict[key][index] @@ -319,6 +310,10 @@ def get_accuracy(result, n_dict, b_dict, summary_compare=False, md5_compare=Fals n_struct[2], b_struct[2], CompareConst.PASS if n_struct[2] == b_struct[2] else CompareConst.DIFF] if has_stack and index == 0 and key == "input_struct": result_item.extend(npu_stack_info) + else: + result_item.append(CompareConst.NONE) + if all_mode_bool and has_data_name: + result_item.append(npu_data_name[n_start + index]) result.append(result_item) continue @@ -345,7 +340,7 @@ def get_accuracy(result, n_dict, b_dict, summary_compare=False, md5_compare=Fals if magnitude_diff > 0.5: warning_flag = True else: - result_item[start_idx + i] = CompareConst.NAN + result_item[start_idx + i] = CompareConst.NONE accuracy_check = CompareConst.WARNING if warning_flag else "" err_msg += "Need double check api accuracy." 
if warning_flag else "" result_item[start_idx:] = [f'{str(x)}\t' if str(x) in ('inf', '-inf', 'nan') else x for x in result_item[start_idx:]] @@ -354,11 +349,18 @@ def get_accuracy(result, n_dict, b_dict, summary_compare=False, md5_compare=Fals result_item.append(err_msg) if has_stack and index == 0 and key == "input_struct": result_item.extend(npu_stack_info) + else: + result_item.append(CompareConst.NONE) + if all_mode_bool and has_data_name: + result_item.append(npu_data_name[n_start + index]) result.append(result_item) if n_len > b_len: for index in range(b_len, n_len): + if all_mode_bool: + has_data_name = npu_data_name[n_start + index] and bench_data_name[b_start + index] + n_name = n_dict['op_name'][n_start + index] n_struct = n_dict[key][index] if md5_compare: @@ -379,6 +381,10 @@ def get_accuracy(result, n_dict, b_dict, summary_compare=False, md5_compare=Fals if has_stack and index == 0 and key == "input_struct": result_item.extend(npu_stack_info) + else: + result_item.append(CompareConst.NONE) + if all_mode_bool and has_data_name: + result_item.append(npu_data_name[n_start + index]) result.append(result_item) @@ -386,10 +392,13 @@ def get_accuracy(result, n_dict, b_dict, summary_compare=False, md5_compare=Fals b_num = len(b_dict['op_name']) n_num_input = len([name for name in n_dict['op_name'] if 'input' in name]) b_num_input = len([name for name in b_dict['op_name'] if 'input' in name]) - n_num_output = n_num - n_num_input - b_num_output = b_num - b_num_input + n_num_kwarg = len([name for name in n_dict['op_name'] if 'kwarg' in name]) + b_num_kwarg = len([name for name in b_dict['op_name'] if 'kwarg' in name]) + n_num_output = n_num - n_num_input - n_num_kwarg + b_num_output = b_num - b_num_input - b_num_kwarg get_accuracy_core(0, n_num_input, 0, b_num_input, 'input_struct') - get_accuracy_core(n_num_input, n_num_output, b_num_input, b_num_output, 'output_struct') + get_accuracy_core(n_num_input, n_num_kwarg, b_num_input, b_num_kwarg, "kwargs_struct") + 
get_accuracy_core(n_num_input + n_num_kwarg, n_num_output, b_num_input + b_num_kwarg, b_num_output, 'output_struct') def _do_multi_process(input_parma, result_path): @@ -407,12 +416,14 @@ def read_dump_path(result_path): try: csv_pd = pd.read_csv(result_path) npu_dump_name_list = csv_pd.iloc[0:, 0].tolist() - bench_dump_name_list = csv_pd.iloc[0:, 1].tolist() + npu_dump_tensor_list = csv_pd.iloc[0:, -1].tolist() + # bench_dump_name_list = csv_pd.iloc[0:, 1].tolist() op_name_mapping_dict = {} for index, _ in enumerate(npu_dump_name_list): npu_dump_name = npu_dump_name_list[index] - bench_dump_name = bench_dump_name_list[index] - op_name_mapping_dict[npu_dump_name] = [npu_dump_name, bench_dump_name] + npu_dump_tensor = npu_dump_tensor_list[index] + # bench_dump_name = bench_dump_name_list[index] + op_name_mapping_dict[npu_dump_name] = [npu_dump_tensor, npu_dump_tensor] return op_name_mapping_dict except FileNotFoundError as e: print_error_log('{} file is not found.'.format(result_path)) @@ -464,7 +475,12 @@ def compare_ops(idx, fusion_op_names, dump_path_dict, result_path, lock, input_p for i, op_name in enumerate(fusion_op_names): if is_print_compare_log: print("start compare: {}".format(op_name)) - cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio = compare_by_op(op_name, dump_path_dict, input_parma) + + if op_name == '-1': + cos_sim = max_abs_err = max_relative_err = err_msg = one_thousand_err_ratio = five_thousand_err_ratio = CompareConst.NONE + else: + cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio = compare_by_op(op_name, dump_path_dict, input_parma) + if is_print_compare_log: print("[{}] Compare result: cosine {}, max_abs_err {}, max_relative_err {}, {}, one_thousand_err_ratio {}, five_thousand_err_ratio {}".format(op_name, cos_sim, max_abs_err, max_relative_err, err_msg, one_thousand_err_ratio, five_thousand_err_ratio)) cos_result.append(cos_sim) @@ -506,15 
+522,15 @@ def _save_cmp_result(idx, cos_result, max_err_result, max_relative_err_result, e def check_accuracy(cos, max_abs_err): if cos == CompareConst.SHAPE_UNMATCH: return CompareConst.ACCURACY_CHECK_UNMATCH - if cos == CompareConst.NAN or max_abs_err == CompareConst.NAN: - return CompareConst.NAN + if cos == CompareConst.NONE or max_abs_err == CompareConst.NONE: + return CompareConst.NONE if cos == "N/A" or max_abs_err == "N/A": return CompareConst.ACCURACY_CHECK_NO try: cos, max_abs_err = float(cos), float(max_abs_err) except ValueError: print_warn_log("Cosine or MaxAbsErr can not get float value.") - return CompareConst.NAN + return CompareConst.NONE if cos < CompareConst.COS_THRESHOLD and max_abs_err > CompareConst.MAX_ABS_ERR_THRESHOLD: return CompareConst.ACCURACY_CHECK_NO if cos < CompareConst.COS_MAX_THRESHOLD or max_abs_err > CompareConst.MAX_ABS_ERR_MAX_THRESHOLD: @@ -524,19 +540,20 @@ def check_accuracy(cos, max_abs_err): def compare_by_op(op_name, op_name_mapping_dict, input_parma): npu_bench_name_list = op_name_mapping_dict[op_name] - if npu_bench_name_list[1] == CompareConst.NAN: - return CompareConst.NAN, CompareConst.NAN, CompareConst.NAN, CompareConst.NO_BENCH, CompareConst.NAN, CompareConst.NAN + data_name = npu_bench_name_list[1] + if data_name == '-1' or data_name == -1: + return CompareConst.NONE, CompareConst.NONE, CompareConst.NONE, CompareConst.NO_BENCH, CompareConst.NONE, CompareConst.NONE try: - n_path = os.path.join(input_parma.get("npu_dump_data_dir"), npu_bench_name_list[0] + ".npy") - b_path = os.path.join(input_parma.get("bench_dump_data_dir"), npu_bench_name_list[1] + ".npy") + n_path = os.path.join(input_parma.get("npu_dump_data_dir"), npu_bench_name_list[0]) + b_path = os.path.join(input_parma.get("bench_dump_data_dir"), npu_bench_name_list[1]) n_path_checker = FileChecker(n_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.NUMPY_SUFFIX, False) + FileCheckConst.PT_SUFFIX, False) b_path_checker = 
FileChecker(b_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.NUMPY_SUFFIX, False) + FileCheckConst.PT_SUFFIX, False) n_path = n_path_checker.common_check() b_path = b_path_checker.common_check() - n_value = np.load(n_path) - b_value = np.load(b_path) + n_value = torch.load(n_path).detach().numpy() + b_value = torch.load(b_path).detach().numpy() except IOError as error: return CompareConst.NAN, CompareConst.NAN, CompareConst.NAN, "Dump file: {} not found.".format(error.filename), CompareConst.NAN, CompareConst.NAN relative_err = get_relative_err(n_value, b_value) @@ -601,10 +618,9 @@ def handle_inf_nan(n_value, b_value): def compare(input_parma, output_path, stack_mode=False, auto_analyze=True, fuzzy_match=False): try: - summary_compare = is_summary_compare(input_parma) - md5_compare = is_md5_compare(input_parma) + summary_compare, md5_compare = task_dumppath_get(input_parma) check_configuration_param(stack_mode, auto_analyze, fuzzy_match) - check_compare_param(input_parma, output_path, stack_mode, summary_compare) + check_compare_param(input_parma, output_path, stack_mode, summary_compare, md5_compare) except CompareException as error: print_error_log('Compare failed. 
Please check the arguments and do it again!') sys.exit(error.code) @@ -620,11 +636,12 @@ def compare_core(input_parma, output_path, stack_mode=False, auto_analyze=True, file_path = os.path.join(os.path.realpath(output_path), file_name) check_file_not_exists(file_path) - with FileOpen(input_parma.get("npu_pkl_path"), "r") as npu_pkl, \ - FileOpen(input_parma.get("bench_pkl_path"), "r") as bench_pkl, \ + with FileOpen(input_parma.get("npu_json_path"), "r") as npu_json, \ + FileOpen(input_parma.get("bench_json_path"), "r") as bench_json, \ + FileOpen(input_parma.get("stack_json_path"), "r") as stack_json, \ os.fdopen(os.open(file_path, os.O_RDWR | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP), 'w+') \ as fout: - compare_process([npu_pkl, bench_pkl, fout], stack_mode, fuzzy_match, summary_compare, md5_compare) + compare_process([npu_json, bench_json, stack_json, fout], stack_mode, fuzzy_match, summary_compare, md5_compare) if summary_compare: print_info_log(f"Summary compare result is {file_path}") @@ -671,19 +688,150 @@ def parse(pkl_file, module_name_prefix): print(summery_info) +def op_item_parse(item, op_name, index, item_list=[], top_bool=True): + if item == None or (isinstance(item, dict) and len(item) == 0): + if not top_bool: + tmp = {'full_op_name': op_name + '.' + str(index), 'Max': None, 'Min': None, 'Mean': None, 'Norm': None, 'dtype': None, 'shape': None, 'md5': None, 'data_name': '-1'} + else: + tmp = {'full_op_name': op_name + '.0', 'Max': None, 'Min': None, 'Mean': None, 'Norm': None, 'dtype': None, 'shape': None, 'md5': None, 'data_name': '-1'} + item_list.append(tmp) + return item_list + if index == None: + if isinstance(item, dict): + full_op_name = op_name + '.0' + else: + full_op_name = op_name + else: + full_op_name = op_name + '.' 
+ str(index) + if isinstance(item, dict): + if 'dtype' in item: + parsed_item = item + parsed_item['full_op_name'] = full_op_name + item_list.append(parsed_item) + else: + parsed_item = {} + if item['type'] == 'slice': + parsed_item['full_op_name'] = full_op_name + parsed_item['dtype'] = 'slice' + parsed_item['shape'] = str(np.shape(np.array(item['value']))) + parsed_item['md5'] = None + parsed_item['Max'] = None + parsed_item['Min'] = None + parsed_item['Mean'] = None + parsed_item['Norm'] = None + parsed_item['data_name'] = '-1' + item_list.append(parsed_item) + else: + parsed_item['full_op_name'] = full_op_name + parsed_item['dtype'] = str(type(item['value'])) + parsed_item['shape'] = '[]' + parsed_item['md5'] = None + parsed_item['Max'] = item['value'] + parsed_item['Min'] = item['value'] + parsed_item['Mean'] = item['value'] + parsed_item['Norm'] = item['value'] + parsed_item['data_name'] = '-1' + item_list.append(parsed_item) + else: + for j in range(len(item)): + op_item_parse(item[j], full_op_name, j, top_bool=False) + return item_list + + +def read_op(op_data, op_name): + op_parsed_list = [] + if 'forward' in op_name: + if 'input_args' in op_data: + input_item = op_data['input_args'] + input_parsed_list = op_item_parse(input_item, op_name + '_input', None) + op_parsed_list = input_parsed_list.copy() + input_parsed_list.clear() + if 'input_kwargs' in op_data: + kwargs_item = op_data['input_kwargs'] + if isinstance(kwargs_item, dict) and "type" in kwargs_item or isinstance(kwargs_item, list): + kwarg_parsed_list = op_item_parse(kwargs_item, op_name + '_input', None) + op_parsed_list += kwarg_parsed_list + kwarg_parsed_list.clear() + elif kwargs_item: + for kwarg in kwargs_item: + kwarg_parsed_list = op_item_parse(kwargs_item[kwarg], op_name + '_input.' 
+ kwarg, None) + op_parsed_list += kwarg_parsed_list + kwarg_parsed_list.clear() + if 'output' in op_data: + output_item = op_data['output'] + output_parsed_list = op_item_parse(output_item, op_name + '_output', None) + op_parsed_list += output_parsed_list + output_parsed_list.clear() + if 'backward' in op_name: + if 'grad_input' in op_data: + input_item = op_data['grad_input'] + input_parsed_list = op_item_parse(input_item, op_name + '_input', None) + op_parsed_list = input_parsed_list.copy() + input_parsed_list.clear() + if 'grad_output' in op_data: + output_item = op_data['grad_output'] + output_parsed_list = op_item_parse(output_item, op_name + '_output', None) + op_parsed_list += output_parsed_list + output_parsed_list.clear() + return op_parsed_list + + def compare_process(file_handles, stack_mode, fuzzy_match, summary_compare=False, md5_compare=False): - npu_pkl_handle, bench_pkl_handle, output_csv_handle = file_handles + npu_json_handle, bench_json_handle, stack_json_handle, output_csv_handle = file_handles + npu_json_data = json.load(npu_json_handle) + bench_json_data = json.load(bench_json_handle) + stack_json_data = json.load(stack_json_handle) + if fuzzy_match: print_warn_log("This task uses fuzzy matching, which may affect the accuracy of the comparison.") + npu_ops_queue = [] bench_ops_queue = [] result = [] + + ops_npu_iter = iter(npu_json_data['data']) + ops_bench_iter = iter(bench_json_data['data']) + read_err_npu = True + read_err_bench = True + while True: - npu_file_flag = read_op(npu_ops_queue, npu_pkl_handle, stack_mode) - bench_file_flag = read_op(bench_ops_queue, bench_pkl_handle, stack_mode) - if (not npu_file_flag and not bench_file_flag) \ - or (len(npu_ops_queue) == 0 or len(bench_ops_queue) == 0): + if not read_err_npu or not read_err_bench: break + try: + op_name_npu = next(ops_npu_iter) + read_err_npu = True + + npu_op_data = npu_json_data['data'][op_name_npu] + npu_op_parsed_list = read_op(npu_op_data, op_name_npu) + if op_name_npu 
in stack_json_data: + npu_op_parsed_list.append({'full_op_name': op_name_npu, 'full_info': stack_json_data[op_name_npu]}) + else: + npu_op_parsed_list.append({'full_op_name': op_name_npu, 'full_info': None}) + + npu_ops_queue.append(merge_tensor(npu_op_parsed_list, summary_compare, md5_compare)) + except StopIteration: + read_err_npu = False + continue + try: + op_name_bench = next(ops_bench_iter) + read_err_bench = True + + bench_op_data = bench_json_data['data'][op_name_bench] + bench_op_parsed_list = read_op(bench_op_data, op_name_bench) + if op_name_bench in stack_json_data: + bench_op_parsed_list.append( + {'full_op_name': op_name_bench, 'full_info': stack_json_data[op_name_bench]}) + else: + bench_op_parsed_list.append({'full_op_name': op_name_bench, 'full_info': None}) + + bench_ops_queue.append(merge_tensor(bench_op_parsed_list, summary_compare, md5_compare)) + except StopIteration: + read_err_bench = False + continue + + if len(npu_ops_queue) == 0 or len(bench_ops_queue) == 0: + break + n_match_point, b_match_point = match_op(npu_ops_queue, bench_ops_queue, fuzzy_match) if n_match_point == -1 and b_match_point == -1: continue @@ -706,8 +854,23 @@ def compare_process(file_handles, stack_mode, fuzzy_match, summary_compare=False header = CompareConst.SUMMARY_COMPARE_RESULT_HEADER[:] else: header = CompareConst.COMPARE_RESULT_HEADER[:] + + all_mode_bool = summary_compare == False and md5_compare == False if stack_mode: - header.append(CompareConst.STACK) + if all_mode_bool: + header.append(CompareConst.STACK) + header.append(CompareConst.DATA_NAME) + else: + header.append(CompareConst.STACK) + else: + if all_mode_bool: + for row in result: + del row[-2] + header.append(CompareConst.DATA_NAME) + else: + for row in result: + del row[-1] + result_df = pd.DataFrame(result, columns=header) result_df.to_csv(output_csv_handle, index=False) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py 
b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py index 85f9bb95ffa954f64390f9f418d37cf55c00d41b..e6bce19a6203e0a2644e08c825139c4a569ea69d 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/compare/distributed_compare.py @@ -18,7 +18,7 @@ import os import sys import re from ..common.utils import print_error_log, CompareException, check_compare_param, check_file_or_directory_path, \ - check_configuration_param, is_summary_compare, is_md5_compare + check_configuration_param, task_dumppath_get from .acc_compare import compare_core @@ -36,28 +36,23 @@ def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): raise CompareException(CompareException.INVALID_PATH_ERROR) return contents - def extract_pkl_and_data_dir(dirname): - pkl_path, dump_data_dir, pkl_name, dump_data_dirname = '', '', '', '' + + def extract_json(dirname, stack_json=False): + json_path = '' for fname in os.listdir(dirname): full_path = os.path.join(dirname, fname) - if os.path.isdir(full_path): - dump_data_dir = full_path - dump_data_dirname = fname - elif full_path.endswith('.pkl'): - pkl_path = full_path - pkl_name = fname + if full_path.endswith('.json'): + json_path = full_path + if not stack_json and 'stack' not in json_path: + break + if stack_json and 'stack' in json_path: + break + # Provide robustness on invalid directory inputs - if not pkl_path: + if not json_path: print_error_log(f'No file is found in dump dir {dirname}. ') raise CompareException(CompareException.NO_DUMP_FILE_ERROR) - name_body, ext = os.path.splitext(pkl_name) - pattern = re.compile(f'{name_body}$') - match = pattern.match(dump_data_dirname) - if dump_data_dir and match is None: - print_error_log('The names of pkl and directory do not match! ' - f'Please check the names and remove irrelevant files in {dirname}. 
') - raise CompareException(CompareException.INVALID_FILE_ERROR) - return pkl_path, dump_data_dir + return json_path if kwargs.get('suffix'): @@ -77,18 +72,19 @@ def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): for nr, br in zip(npu_ranks, bench_ranks): n_dir = os.path.join(npu_dump_dir, nr) b_dir = os.path.join(bench_dump_dir, br) - npu_pkl_path, npu_dump_data_dir = extract_pkl_and_data_dir(n_dir) - bench_pkl_path, bench_dump_data_dir = extract_pkl_and_data_dir(b_dir) + s_dir = b_dir + npu_json_path = extract_json(n_dir, stack_json=False) + bench_json_path = extract_json(b_dir, stack_json=False) + stack_json_path = extract_json(s_dir, stack_json=True) + dump_result_param = { - 'npu_pkl_path': npu_pkl_path, - 'bench_pkl_path': bench_pkl_path, - 'npu_dump_data_dir': npu_dump_data_dir, - 'bench_dump_data_dir': bench_dump_data_dir, + 'npu_json_path': npu_json_path, + 'bench_json_path': bench_json_path, + 'stack_json_path': stack_json_path, 'is_print_compare_log': True } try: - summary_compare = is_summary_compare(dump_result_param) - md5_compare = is_md5_compare(dump_result_param) + summary_compare, md5_compare = task_dumppath_get(dump_result_param) check_configuration_param(stack_mode, auto_analyze, fuzzy_match) check_compare_param(dump_result_param, output_path, stack_mode=stack_mode, summary_compare=summary_compare) except CompareException as error: diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/compare/test_acc_compare.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/compare/test_acc_compare.py index d51dbfb93d04378866f887f9873e7f99279cb495..5580de5790d9b2ccef132ea6981ba9969bbe9af0 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/compare/test_acc_compare.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/compare/test_acc_compare.py @@ -6,24 +6,14 @@ from ptdbg_ascend.compare import acc_compare as compare from ptdbg_ascend.common.utils import CompareConst -npu_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 
'Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'],\ - 'input_struct': [('torch.float32', [1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), ('torch.float32', [16])],\ - 'output_struct': [('torch.float32', [1, 16, 28, 28])], 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], \ - [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} -bench_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_output'],\ - 'input_struct': [('torch.float32', [1, 1, 28, 28]), ('torch.float32', [16, 1, 5, 5]), ('torch.float32', [16])],\ - 'output_struct': [('torch.float32', [1, 16, 28, 28])], 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], \ - [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], [0.19734230637550354, -0.18177609145641327, 0.007903944700956345], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} -tensor_list = [['Functional_conv2d_0_forward_input.0', 1, [], 'torch.float32', [1, 1, 28, 28], [3.029174327850342, -2.926689624786377, -0.06619918346405029]],\ - ['Functional_conv2d_0_forward_input.1', 1, [], 'torch.float32', [16, 1, 5, 5], [0.19919930398464203, -0.19974489510059357, 0.006269412115216255]], \ - ['Functional_conv2d_0_forward_input.2', 1, [], 'torch.float32', [16], [0.19734230637550354, -0.18177609145641327, 0.007903944700956345]],\ - ['Functional_conv2d_0_forward_output', 1, [], 'torch.float32', [1, 16, 28, 28], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]]] -result_op_dict = {'op_name': ['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.2', 
'Functional_conv2d_0_forward_output'], \ -'input_struct': [('torch.float32', [1, 1, 28, 28], []), ('torch.float32', [16, 1, 5, 5], []), ('torch.float32', [16], [])], \ -'output_struct': [('torch.float32', [1, 16, 28, 28], [])], 'summery': [[3.029174327850342, -2.926689624786377, -0.06619918346405029], [0.19919930398464203, -0.19974489510059357, 0.006269412115216255], \ -[0.19734230637550354, -0.18177609145641327, 0.007903944700956345], [2.1166646480560303, -2.190781354904175, -0.003579073818400502]], 'stack_info': []} - -o_result = [['Functional_conv2d_0_forward_input.0', 'Functional_conv2d_0_forward_input.0', 'torch.float32', 'torch.float32', [1, 1, 28, 28], [1, 1, 28, 28], ' ', ' ', ' ', ' ', ' ', 3.029174327850342, -2.926689624786377, -0.06619918346405029, 3.029174327850342, -2.926689624786377, -0.06619918346405029, 'Yes', ''], ['Functional_conv2d_0_forward_input.1', 'Functional_conv2d_0_forward_input.1', 'torch.float32', 'torch.float32', [16, 1, 5, 5], [16, 1, 5, 5], ' ', ' ', ' ', ' ', ' ', 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 'Yes', ''], ['Functional_conv2d_0_forward_input.2', 'Functional_conv2d_0_forward_input.2', 'torch.float32', 'torch.float32', [16], [16], ' ', ' ', ' ', ' ', ' ', 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 'Yes', ''], ['Functional_conv2d_0_forward_output', 'Functional_conv2d_0_forward_output', 'torch.float32', 'torch.float32', [1, 16, 28, 28], [1, 16, 28, 28], ' ', ' ', ' ', ' ', ' ', 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 'Yes', '']] +npu_dict = {'op_name': ['modulemodel.linear.Linear.0.forward_input.0', 'modulemodel.linear.Linear.0.forward_input.1', 'modulemodel.linear.Linear.0.forward_input.2.0', 'modulemodel.linear.Linear.0.forward_input.2.1', 
'modulemodel.linear.Linear.0.forward_output.0'], 'input_struct': [('torch.float32', [10, 10], '7f84caad'), (None, None, None), ("", '[]', None), ("", '[]', None)], 'output_struct': [('torch.float32', [10, 10], '3e8354f5')], 'summery': [[2.8386683464050293, -2.158618688583374, 0.11464785784482956, 10.07983684539795], [None, None, None, None], [2, 2, 2, 2], [2, 2, 2, 2], [1.1663073301315308, -1.6045000553131104, -0.1430426388978958, 6.108779430389404]], 'stack_info': [['File run_sample.py, line 11, in forward, \n return self.relu(self.linear(x))', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)', 'File run_sample.py, line 21, in forward, \n return self.linear(self.model(x))', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1528, in _call_impl, \n return forward_call(*args, **kwargs)', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)', 'File run_sample.py, line 30, in , \n y = model(x)']]} +bench_dict = {'op_name': ['modulemodel.linear.Linear.0.forward_input.0', 'modulemodel.linear.Linear.0.forward_input.1', 'modulemodel.linear.Linear.0.forward_input.2.0', 'modulemodel.linear.Linear.0.forward_input.2.1', 'modulemodel.linear.Linear.0.forward_output.0'], 'input_struct': [('torch.float32', [10, 10], '7f84caad'), (None, None, None), ("", '[]', None), ("", '[]', None)], 'output_struct': [('torch.float32', [10, 10], '3e8354f5')], 'summery': [[2.8386683464050293, -2.158618688583374, 0.11464785784482956, 10.07983684539795], [None, None, None, None], [2, 2, 2, 2], [2, 2, 2, 2], 
[1.1663073301315308, -1.6045000553131104, -0.1430426388978958, 6.108779430389404]], 'stack_info': [['File run_sample.py, line 11, in forward, \n return self.relu(self.linear(x))', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)', 'File run_sample.py, line 21, in forward, \n return self.linear(self.model(x))', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1528, in _call_impl, \n return forward_call(*args, **kwargs)', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)', 'File run_sample.py, line 30, in , \n y = model(x)']]} +tensor_list = [{'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [10, 10], 'Max': 2.8386683464050293, 'Min': -2.158618688583374, 'Mean': 0.11464785784482956, 'Norm': 10.07983684539795, 'requires_grad': False, 'md5': '7f84caad', 'full_op_name': 'modulemodel.linear.Linear.0.forward_input.0'}, {'full_op_name': 'modulemodel.linear.Linear.0.forward_input.1', 'Max': None, 'Min': None, 'Mean': None, 'Norm': None, 'dtype': None, 'shape': None, 'md5': None, 'data_name': '-1'}, {'full_op_name': 'modulemodel.linear.Linear.0.forward_input.2.0', 'dtype': "", 'shape': '[]', 'md5': None, 'Max': 2, 'Min': 2, 'Mean': 2, 'Norm': 2, 'data_name': '-1'}, {'full_op_name': 'modulemodel.linear.Linear.0.forward_input.2.1', 'dtype': "", 'shape': '[]', 'md5': None, 'Max': 2, 'Min': 2, 'Mean': 2, 'Norm': 2, 'data_name': '-1'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [10, 10], 'Max': 1.1663073301315308, 'Min': -1.6045000553131104, 'Mean': 
-0.1430426388978958, 'Norm': 6.108779430389404, 'requires_grad': True, 'md5': '3e8354f5', 'full_op_name': 'modulemodel.linear.Linear.0.forward_output.0'}, {'full_op_name': 'modulemodel.linear.Linear.0.forward', 'full_info': ['File run_sample.py, line 11, in forward, \n return self.relu(self.linear(x))', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)', 'File run_sample.py, line 21, in forward, \n return self.linear(self.model(x))', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1528, in _call_impl, \n return forward_call(*args, **kwargs)', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)', 'File run_sample.py, line 30, in , \n y = model(x)']}] +result_op_dict = {'op_name': ['modulemodel.linear.Linear.0.forward_input.0', 'modulemodel.linear.Linear.0.forward_input.1', 'modulemodel.linear.Linear.0.forward_input.2.0', 'modulemodel.linear.Linear.0.forward_input.2.1', 'modulemodel.linear.Linear.0.forward_output.0'], 'input_struct': [('torch.float32', [10, 10], '7f84caad'), (None, None, None), ("", '[]', None), ("", '[]', None)], 'output_struct': [('torch.float32', [10, 10], '3e8354f5')], 'summery': [[2.8386683464050293, -2.158618688583374, 0.11464785784482956, 10.07983684539795], [None, None, None, None], [2, 2, 2, 2], [2, 2, 2, 2], [1.1663073301315308, -1.6045000553131104, -0.1430426388978958, 6.108779430389404]], 'stack_info': [['File run_sample.py, line 11, in forward, \n return self.relu(self.linear(x))', 'File 
/home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)', 'File run_sample.py, line 21, in forward, \n return self.linear(self.model(x))', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1528, in _call_impl, \n return forward_call(*args, **kwargs)', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)', 'File run_sample.py, line 30, in , \n y = model(x)']]} +o_result = [['modulemodel.linear.Linear.0.forward_input.0', 'modulemodel.linear.Linear.0.forward_input.0', 'torch.float32', 'torch.float32', [10, 10], [10, 10], '7f84caad', '7f84caad', 'Pass', ['File run_sample.py, line 11, in forward, \n return self.relu(self.linear(x))', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)', 'File run_sample.py, line 21, in forward, \n return self.linear(self.model(x))', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1528, in _call_impl, \n return forward_call(*args, **kwargs)', 'File /home/louyujing/miniconda3/envs/pytorch21/lib/python3.8/site-packages/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)', 'File run_sample.py, line 30, in , \n y = model(x)']], 
['modulemodel.linear.Linear.0.forward_input.1', 'modulemodel.linear.Linear.0.forward_input.1', None, None, None, None, None, None, 'Pass', 'None'], ['modulemodel.linear.Linear.0.forward_input.2.0', 'modulemodel.linear.Linear.0.forward_input.2.0', "", "", '[]', '[]', None, None, 'Pass', 'None'], ['modulemodel.linear.Linear.0.forward_input.2.1', 'modulemodel.linear.Linear.0.forward_input.2.1', "", "", '[]', '[]', None, None, 'Pass', 'None'], ['modulemodel.linear.Linear.0.forward_output.0', 'modulemodel.linear.Linear.0.forward_output.0', 'torch.float32', 'torch.float32', [10, 10], [10, 10], '3e8354f5', '3e8354f5', 'Pass', 'None']] +npu_op_data = {'input_args': [{'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [10, 10], 'Max': 2.8386683464050293, 'Min': -2.158618688583374, 'Mean': 0.11464785784482956, 'Norm': 10.07983684539795, 'requires_grad': False, 'md5': '7f84caad'}, None, [{'type': 'int', 'value': 2}, {'type': 'int', 'value': 2}]], 'input_kwargs': {}, 'output': [{'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [10, 10], 'Max': 1.1663073301315308, 'Min': -1.6045000553131104, 'Mean': -0.1430426388978958, 'Norm': 6.108779430389404, 'requires_grad': True, 'md5': '3e8354f5'}]} +result_1 = [{'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [10, 10], 'Max': 2.8386683464050293, 'Min': -2.158618688583374, 'Mean': 0.11464785784482956, 'Norm': 10.07983684539795, 'requires_grad': False, 'md5': '7f84caad', 'full_op_name': 'modulemodel.linear.Linear.0.forward_input.0'}, {'full_op_name': 'modulemodel.linear.Linear.0.forward_input.1', 'Max': None, 'Min': None, 'Mean': None, 'Norm': None, 'dtype': None, 'shape': None, 'md5': None, 'data_name': '-1'}, {'full_op_name': 'modulemodel.linear.Linear.0.forward_input.2.0', 'dtype': "", 'shape': '[]', 'md5': None, 'Max': 2, 'Min': 2, 'Mean': 2, 'Norm': 2, 'data_name': '-1'}, {'full_op_name': 'modulemodel.linear.Linear.0.forward_input.2.1', 'dtype': "", 'shape': '[]', 'md5': None, 'Max': 2, 'Min': 2, 'Mean': 
2, 'Norm': 2, 'data_name': '-1'}, {'type': 'torch.Tensor', 'dtype': 'torch.float32', 'shape': [10, 10], 'Max': 1.1663073301315308, 'Min': -1.6045000553131104, 'Mean': -0.1430426388978958, 'Norm': 6.108779430389404, 'requires_grad': True, 'md5': '3e8354f5', 'full_op_name': 'modulemodel.linear.Linear.0.forward_output.0'}] +aten_result = [['Aten__native_batch_norm_legit_functional.default_0_forward_input.0', 'Functional_batch_norm_0_forward_input.0', 'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 136.56337118148804, -124.33742618560791, -0.010397066915174946, ' ', 139.625, -127.5625, -0.0103607177734375, 3.061628818511963, -3.22507381439209, 3.634914173744619e-05, 'Warning', 'Need double check api accuracy.', 'None'], ['Aten__native_batch_norm_legit_functional.default_0_forward_input.1', 'Functional_batch_norm_0_forward_input.1', 'torch.float32', 'torch.float32', [256], [256], 2.527024927258026, -2.1782388387364335, -0.0008296193100250093, ' ', 2.5276029109954834, -2.1788690090179443, -0.0008259844034910202, 0.0005779837374575436, -0.0006301702815108001, 3.634906533989124e-06, 'Warning', 'Need double check api accuracy.', 'None'], ['Aten__native_batch_norm_legit_functional.default_0_forward_input.2', 'Functional_batch_norm_0_forward_input.2', 'torch.float32', 'torch.float32', [256], [256], 1.5384095311164856, -3.7736878395080566, -0.9390918612480164, ' ', 2.472219944000244, -2.845968723297119, -0.008756577968597412, 0.9338104128837585, 0.9277191162109375, 0.930335283279419, 'Warning', 'Need double check api accuracy.', 'None'], ['Aten__native_batch_norm_legit_functional.default_0_forward_input.3', 'Functional_batch_norm_0_forward_input.3', 'torch.float32', 'torch.float32', [256], [256], 1.763145923614502, -4.398397922515869, -1.0521326325833797, ' ', 2.763145923614502, -3.398397922515869, -0.052132632583379745, 1.0, 1.0, 1.0, 'Warning', 'Need double check api accuracy.', 'None'], 
['Aten__native_batch_norm_legit_functional.default_0_forward_input.4', 'Functional_batch_norm_0_forward_input.4', 'torch.float32', 'torch.float32', [256], [256], 2.673110008239746, -3.149275064468384, 0.01613386906683445, ' ', 2.673110008239746, -3.149275064468384, 0.01613386906683445, 0.0, 0.0, 0.0, 'Warning', 'Need double check api accuracy.', 'None'], ['Aten__native_batch_norm_legit_functional.default_0_forward_output.0', 'Functional_batch_norm_0_forward_output', 'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], 8.156781196594238, -4.843813419342041, -0.008758545174714527, ' ', 13.5546875, -10.640625, -0.008758544921875, 5.397906303405762, -5.796811580657959, 2.5283952709287405e-10, 'Warning', 'Need double check api accuracy.', 'None'], ['Aten__native_batch_norm_legit_functional.default_0_forward_output.1', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 0.30550330877304077, -0.24485322833061218, -0.010361209511756897, 'Nan', 'Nan', 'Nan', 'Yes', '', 'None'], ['Aten__native_batch_norm_legit_functional.default_0_forward_output.2', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 623.9192504882812, 432.96826171875, 520.2276611328125, 'Nan', 'Nan', 'Nan', 'Yes', '', 'None'], ['Aten__native_batch_norm_legit_functional.default_0_forward_output.3', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 2.4797861576080322, -3.055997371673584, -0.04795549064874649, 'Nan', 'Nan', 'Nan', 'Yes', '', 'None'], ['Aten__native_batch_norm_legit_functional.default_0_forward_output.4', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 61.7945556640625, 42.59713363647461, 52.03831481933594, 'Nan', 'Nan', 'Nan', 'Yes', '', 'None']] npu_dict_aten = {'op_name': ['Aten__native_batch_norm_legit_functional.default_0_forward_input.0', 'Aten__native_batch_norm_legit_functional.default_0_forward_input.1', @@ -61,18 +51,6 @@ bench_dict_functional = {'op_name': 
['Functional_batch_norm_0_forward_input.0', [5.397906303405762, -5.796811580657959, 2.5283952709287405e-10]] } -aten_result = [['Aten__native_batch_norm_legit_functional.default_0_forward_input.0', 'Functional_batch_norm_0_forward_input.0', 'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], ' ', ' ', ' ', ' ', ' ', 139.625, -127.5625, -0.0103607177734375, 3.061628818511963, -3.22507381439209, 3.634914173744619e-05, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_input.1', 'Functional_batch_norm_0_forward_input.1', 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.5276029109954834, -2.1788690090179443, -0.0008259844034910202, 0.0005779837374575436, -0.0006301702815108001, 3.634906533989124e-06, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_input.2', 'Functional_batch_norm_0_forward_input.2', 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.472219944000244, -2.845968723297119, -0.008756577968597412, 0.9338104128837585, 0.9277191162109375, 0.930335283279419, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_input.3', 'Functional_batch_norm_0_forward_input.3', 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.763145923614502, -3.398397922515869, -0.052132632583379745, 1.0, 1.0, 1.0, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_input.4', 'Functional_batch_norm_0_forward_input.4', 'torch.float32', 'torch.float32', [256], [256], ' ', ' ', ' ', ' ', ' ', 2.673110008239746, -3.149275064468384, 0.01613386906683445, 0.0, 0.0, 0.0, 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_output.0', 'Functional_batch_norm_0_forward_output', 'torch.float16', 'torch.float32', [256, 256, 14, 14], [256, 256, 14, 14], ' ', ' ', ' ', ' ', ' ', 13.5546875, -10.640625, -0.008758544921875, 5.397906303405762, -5.796811580657959, 2.5283952709287405e-10, 'Yes', ''], 
- ['Aten__native_batch_norm_legit_functional.default_0_forward_output.1', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 0.30550330877304077, -0.24485322833061218, -0.010361209511756897, 'Nan', 'Nan', 'Nan', 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_output.2', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 623.9192504882812, 432.96826171875, 520.2276611328125, 'Nan', 'Nan', 'Nan', 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_output.3', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 2.4797861576080322, -3.055997371673584, -0.04795549064874649, 'Nan', 'Nan', 'Nan', 'Yes', ''], - ['Aten__native_batch_norm_legit_functional.default_0_forward_output.4', 'Nan', 'torch.float32', 'Nan', [256], 'Nan', ' ', ' ', ' ', ' ', ' ', 61.7945556640625, 42.59713363647461, 52.03831481933594, 'Nan', 'Nan', 'Nan', 'Yes', ''] - ] - class TestUtilsMethods(unittest.TestCase): def test_correct_data(self): @@ -164,19 +142,13 @@ class TestUtilsMethods(unittest.TestCase): self.assertEqual(result, True) def test_merge_tensor(self): - op_dict = compare.merge_tensor(tensor_list) + op_dict = compare.merge_tensor(tensor_list, False, True) self.assertEqual(op_dict, result_op_dict) def test_read_op(self): - base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - - pkl_dir = os.path.join(base_dir, "resources/compare/npu_test.pkl") - - npu_ops_queue = [] - npu_pkl_handle = open(pkl_dir, "r") - stack_mode = False - result = compare.read_op(npu_ops_queue, npu_pkl_handle, stack_mode) - self.assertEqual(result, True) + op_name_npu = 'modulemodel.linear.Linear.0.forward' + result = compare.read_op(npu_op_data, op_name_npu) + self.assertEqual(result, result_1) def test_match_op(self): @@ -187,11 +159,11 @@ class TestUtilsMethods(unittest.TestCase): def test_get_accuracy(self): result = [] - compare.get_accuracy(result, npu_dict, bench_dict) + 
compare.get_accuracy(result, npu_dict, bench_dict, False, True) self.assertEqual(result, o_result) def test_get_accuracy_graph_mode(self): result = [] - compare.get_accuracy(result, npu_dict_aten, bench_dict_functional) + compare.get_accuracy(result, npu_dict_aten, bench_dict_functional, True, False) self.assertEqual(result, aten_result) diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py index 9ae980102121314205446bcd4e4d80fadbd74dad..b550954236f3e6c494efd4d69593da085965b9c5 100644 --- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_utils.py @@ -3,7 +3,7 @@ import torch import pytest import ptdbg_ascend.common.utils as utils -from ptdbg_ascend.common.utils import CompareException, is_md5_compare, get_md5_for_tensor +from ptdbg_ascend.common.utils import CompareException, get_md5_for_tensor from ptdbg_ascend.common.file_check_util import FileCheckException @@ -32,10 +32,6 @@ class TestUtilsMethods(unittest.TestCase): utils.check_file_size(file, 0) self.assertEqual(error.value.code, CompareException.INVALID_FILE_ERROR) - def test_is_md5_compare(self): - input_param = {"npu_pkl_path": "resources/compare/npu_test.pkl"} - result = is_md5_compare(input_param) - self.assertFalse(result) def test_get_md5_for_tensor(self): data = [[1, 2], [3, 4]] diff --git a/debug/accuracy_tools/setup.py b/debug/accuracy_tools/setup.py index 886d230906476909b7e88eade5424e8d20aa883a..f1579a7e416e946e7f76ae2f78cc05d112cfc22d 100644 --- a/debug/accuracy_tools/setup.py +++ b/debug/accuracy_tools/setup.py @@ -19,7 +19,7 @@ from setuptools import setup, find_packages setup( name='ascend_training_accuracy_tools', - version='0.0.1', + version='0.0.3', description='This is a pytorch precision comparison tools', long_description='This is a pytorch precision comparison tools, include ptdbg and api accuracy checker', packages=find_packages(),