From d1e8e7a9a48087548a09c4ff4e2fa9bf8b795c88 Mon Sep 17 00:00:00 2001 From: fuchao Date: Sat, 17 Aug 2024 16:12:22 +0800 Subject: [PATCH 1/7] =?UTF-8?q?=E8=AE=AD=E7=BB=83=E8=BF=87=E7=A8=8B?= =?UTF-8?q?=E7=9A=84=E6=BA=A2=E5=87=BADump=E4=B8=AA=E6=95=B0=E5=8F=AF?= =?UTF-8?q?=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/mindspore/source_en/model_train/debug/dump.md | 8 ++++++-- docs/mindspore/source_zh_cn/model_train/debug/dump.md | 10 +++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/mindspore/source_en/model_train/debug/dump.md b/docs/mindspore/source_en/model_train/debug/dump.md index 9c6fcdfbef..483bd8177a 100644 --- a/docs/mindspore/source_en/model_train/debug/dump.md +++ b/docs/mindspore/source_en/model_train/debug/dump.md @@ -181,7 +181,8 @@ The support for Asynchronous Dump on Ascend backend is shown in the table below "input_output": 0, "kernels": ["Default/Conv-op12"], "support_device": [0,1,2,3,4,5,6,7], - "statistic_category": ["max", "min", "l2norm"] + "statistic_category": ["max", "min", "l2norm"], + "overflow_number": 0 }, "e2e_dump_settings": { "enable": true, @@ -222,6 +223,7 @@ The support for Asynchronous Dump on Ascend backend is shown in the table below Except for those marked as supporting device statistics, other statistics can be collected only on the host. This field is optional, with default values of ["max", "min", "l2norm"]. + - `overflow_number`:Specify the number of data to overflow dump. This field is required only when `op_debug_mode` is set to 3 and only the overflow operator is saved. It can control the overflow data to be dumped in chronological order until the specified value is reached, and the overflow data will no longer be dumped. The default value is 0, which means dumping all overflow data. - `enable`: When set to true, enable Synchronous Dump. 
When set to false, asynchronous dump will be used on Ascend and synchronous dump will still be used on GPU. - `trans_flag`: Enable trans flag. Transform the device data format into NCHW. If it is `True`, the data will be saved in the 4D format (NCHW) format on the Host side; if it is `False`, the data format on the Device side will be retained. Default: `True`. - `stat_calc_mode`: Select the backend for statistical calculations. Options are "host" and "device". Choosing "device" enables device computation of statistics, currently only effective on Ascend, and supports only min/max/avg/l2norm statistics. @@ -517,7 +519,8 @@ MindSpore provides debugging capabilities for large networks through asynchronou "kernels": ["Default/Conv-op12"], "support_device": [0,1,2,3,4,5,6,7], "statistic_category": ["max", "min", "l2norm"], - "file_format": "npy" + "file_format": "npy", + "overflow_number": 0 } } ``` @@ -551,6 +554,7 @@ MindSpore provides debugging capabilities for large networks through asynchronou This field is optional, with default values of ["max", "min", "l2norm"]. - `file_format`: Dump file type. It can be either `npy` and `bin`. `npy`: data will be dumped in npy files as host format. `bin`: data will be dumped in protobuf file as device format and need to be transformed to parse using the provided data analysis tool. Please refer to [Asynchronous Dump Data Analysis Sample](#data-analysis-sample-1) for details. The default value is `bin`. + - `overflow_number`:Specify the number of data to overflow dump. This field is required only when `op_debug_mode` is set to 3 and `file_format` is set to `npy`. It can control the overflow data to be dumped in chronological order until the specified value is reached, and the overflow data will no longer be dumped. The default value is 0, which means dumping all overflow data. 2. Set Dump environment variable. 
diff --git a/docs/mindspore/source_zh_cn/model_train/debug/dump.md b/docs/mindspore/source_zh_cn/model_train/debug/dump.md index 981d847304..2a1ab9ac1b 100644 --- a/docs/mindspore/source_zh_cn/model_train/debug/dump.md +++ b/docs/mindspore/source_zh_cn/model_train/debug/dump.md @@ -181,7 +181,8 @@ Ascend后端异步Dump支持情况如下表(GPU/CPU后端不支持)。 "input_output": 0, "kernels": ["Default/Conv-op12"], "support_device": [0,1,2,3,4,5,6,7], - "statistic_category": ["max", "min", "l2norm"] + "statistic_category": ["max", "min", "l2norm"], + "overflow_number": 0 }, "e2e_dump_settings": { "enable": true, @@ -222,8 +223,9 @@ Ascend后端异步Dump支持情况如下表(GPU/CPU后端不支持)。 以上除了标记了支持device统计的,其它都仅支持在host统计。 该字段为可选,默认值为["max", "min", "l2norm"]。 + - `overflow_number`:指定溢出dump的数据个数。该字段仅在`op_debug_mode`设置为3,只保存溢出算子时需要配置,可控制溢出数据按时间序dump,到指定数值后溢出数据不再dump。默认值为0,表示dump全部溢出数据。 - `enable`:设置成true,表示开启同步Dump;设置成false时,在Ascend上会使用异步Dump,在GPU上仍然使用同步Dump。 - - `trans_flag`:开启格式转换。将设备上的数据格式转换成NCHW格式。若为`True`,则数据会以Host侧的4D格式(NCHW)格式保存;若为`False`,则保留Device侧的数据格式。该配置参数在CPU上无效,因为CPU上没有format转换。默认值:true。 + - `trans_flag`:开启格式转换,将设备上的数据格式转换成NCHW格式。若为`True`,则数据会以Host侧的4D格式(NCHW)格式保存;若为`False`,则保留Device侧的数据格式。该配置参数在CPU上无效,因为CPU上没有format转换。默认值:true。 - `stat_calc_mode`:选择统计信息计算后端,可选"host"和"device"。选择"device"后可以使能device计算统计信息,当前只在Ascend生效,只支持`min/max/avg/l2norm`统计量。 - `sample_mode`:设置成0,表示不开启切片dump功能;设置成1时,在图编译等级为O0或O1的情况下开启切片dump功能。仅在op_debug_mode设置为0时生效,其它场景不会开启切片dump功能。 - `sample_num`:用于控制切片dump中切片的大小。默认值为100。 @@ -517,7 +519,8 @@ MindSpore通过异步Dump提供了Ascend平台上大型网络的调试能力。 "kernels": ["Default/Conv-op12"], "support_device": [0,1,2,3,4,5,6,7], "statistic_category": ["max", "min", "l2norm"], - "file_format": "npy" + "file_format": "npy", + "overflow_number": 0 } } ``` @@ -551,6 +554,7 @@ MindSpore通过异步Dump提供了Ascend平台上大型网络的调试能力。 该字段为可选,默认值为["max", "min", "l2norm"]。 - `file_format`: 
dump数据的文件类型,只支持`npy`和`bin`两种取值。设置成`npy`,则dump出的算子张量数据将为host侧格式的npy文件;设置成`bin`,则dump出的数据将为device侧格式的protobuf文件,需要借助转换工具进行处理,详细步骤请参考[异步Dump数据分析样例](#数据分析样例-1)。默认取值为`bin`。 + - `overflow_number`:指定溢出dump的数据个数。该字段仅在`op_debug_mode`设置为3开启溢出检测功能,且`file_format`设置为`npy`时需要配置,可控制溢出数据按时间序dump,到指定数值后溢出数据不再dump。默认值为0,表示dump全部溢出数据。 2. 设置数据Dump的环境变量。 -- Gitee From bdf9956196bbc01a2c5f84b531596436c12f9801 Mon Sep 17 00:00:00 2001 From: maning202007 Date: Mon, 19 Aug 2024 17:28:57 +0800 Subject: [PATCH 2/7] add dtype in dump file_name --- .../source_en/model_train/debug/dump.md | 21 ++++++++++--------- .../source_zh_cn/model_train/debug/dump.md | 21 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/docs/mindspore/source_en/model_train/debug/dump.md b/docs/mindspore/source_en/model_train/debug/dump.md index 483bd8177a..11bff2d1dd 100644 --- a/docs/mindspore/source_en/model_train/debug/dump.md +++ b/docs/mindspore/source_en/model_train/debug/dump.md @@ -280,9 +280,9 @@ After starting the training, the data objects saved by the synchronous Dump incl - {iteration_id}/ {op_type}.{op_name}.json statistic.csv - {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy + {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy - constants/ - Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.npy + Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.{dtype}.npy ... - graphs/ ms_output_trace_code_graph_{graph_id}.pb @@ -305,31 +305,32 @@ After starting the training, the data objects saved by the synchronous Dump incl - `input_output_index` : the index of input or output. For example, `output_0` means that the file is the data of the first output Tensor of the operator. - `slot`: the id of the slot. - `format`: the format of the data. +- `dtype`: the original data type. 
When it is `bfloat16` or `int4`, the saved data in the `.npy` file is converted to `float32` or `int8` respectively. - `data_id`: the id of constant data. For multi-graph networks, due to the control flow, some subgraphs may not be executed, but Dump only saves the executed nodes, so the {graph_id} in the `.pb` file name in the graphs directory does not necessarily exist in the {graph_id} directory under {net_name}. -Only when `saved_data` is "statistic" or "full", `statistic.csv` is generated. Only when `saved_data` is "tensor" or "full", `{op_type}. {op_name}. {task_id}. {stream_id}. {timestamp}. {input_output_index}. {slot}. {format}.npy` named complete tensor information is generated. +Only when `saved_data` is "statistic" or "full", `statistic.csv` is generated. Only when `saved_data` is "tensor" or "full", `{op_type}. {op_name}. {task_id}. {stream_id}. {timestamp}. {input_output_index}. {slot}. {format}.{dtype}.npy` named complete tensor information is generated. Only when `save_kernel_args` is `True`, `{op_type}.{op_name}.json` is generated and the params of the corresponding operators is saved. The data file generated by the synchronous Dump is a binary file with the suffix `.npy`, and the file naming format is: ```text -{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy +{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy ``` The constant data file generated by the synchronous Dump is in the same format as data file, whereas {op_type}, {task_id}, {stream_id}, {input_output_index}, {slot}, {format} are unchanged for all constant data. Note, non-Tensor type will not generate data file. This function is not supported in the Ascend scenario. 
```text -Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.npy +Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.{dtype}.npy ``` The {iteration_id} directory may also save files starting with `Parameter` (parameters such as weight and bias will be saved as files starting with `Parameter`), while `Parameter` files will not be saved on Ascend. User can use Numpy interface `numpy.load` to read the data. -The statistics file generated by the synchronous dump is named `statistic.csv`. This file stores key statistics for all tensors dumped under the same directory as itself (with the file names `{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`). Each row in `statistic.csv` summarizes a single tensor, each row contains the statistics: Op Type, Op Name, Task ID, Stream ID, Timestamp, IO, Slot, Data Size, Data Type, Shape, and statistics items configured by the user. Note that opening this file with Excel may cause data to be displayed incorrectly. Please use commands like `vi` or `cat`, or use Excel to import csv from text for viewing. +The statistics file generated by the synchronous dump is named `statistic.csv`. This file stores key statistics for all tensors dumped under the same directory as itself (with the file names `{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy`). Each row in `statistic.csv` summarizes a single tensor, each row contains the statistics: Op Type, Op Name, Task ID, Stream ID, Timestamp, IO, Slot, Data Size, Data Type, Shape, and statistics items configured by the user. Note that opening this file with Excel may cause data to be displayed incorrectly. Please use commands like `vi` or `cat`, or use Excel to import csv from text for viewing. 
The suffixes of the final execution graph files generated by synchronous Dump are `.pb` and `.ir` respectively, and the file naming format is: @@ -485,13 +486,13 @@ Through the operator name and input and output information, you can find the onl - `slot`: 0, this tensor only has one slot. Search for the corresponding file name in the data object file directory saved by Dump: -`Conv2d.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.npy`. +`Conv2d.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.float16.npy`. When restoring data, execute: ```python import numpy -numpy.load("Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.npy") +numpy.load("Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.float16.npy") ``` Generate the numpy.array data. @@ -642,7 +643,7 @@ The overflow file (file `Opdebug.Node_OpDebug.{task_id}.{stream_id}.{timestamp}` If set `file_format` to `npy`, the operator file will be saved as a npy format file, and the overflow file will be saved as a json format file. The file naming formats are: ```text -{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy +{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy Opdebug.Node_OpDebug.{task_id}.{stream_id}.{timestamp}.output.0.json ``` @@ -668,7 +669,7 @@ The original data file generated by dump can also be parsed by using the data pa If setting `file_format` to `npy`, the naming convention of data files generated by asynchronous dump is the same as those of synchronous dump. Please refer to [Introduction to Synchronous Dump Data File](#introduction-to-data-object-directory-and-data-file). 
The overflow file generated by overflow detection is in the `json` format, and the content analysis of the overflow file can refer to the [Analyzing the Data File of an Overflow/Underflow Operator](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/infacldevg/aclcppdevg/aclcppdevg_000160.html) . -The `saved_data` option only takes effect when `file_format` is "npy". If `saved_data` is "statistic" or "full", tensor statistics will be dumped in `statistic.csv`. When `saved_data` is "tensor" or "full", full tensor data will be dumped in `{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`. The format of the statistic file will be the same as that of synchonous dump. Please refer to [Introduction to Synchronous Dump Data File](#introduction-to-data-object-directory-and-data-file). +The `saved_data` option only takes effect when `file_format` is "npy". If `saved_data` is "statistic" or "full", tensor statistics will be dumped in `statistic.csv`. When `saved_data` is "tensor" or "full", full tensor data will be dumped in `{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy`. The format of the statistic file will be the same as that of synchonous dump. Please refer to [Introduction to Synchronous Dump Data File](#introduction-to-data-object-directory-and-data-file). The constant dump file, final execution graph file and execution order file naming rules generated by asynchronous Dump are the same as that of synchronous Dump. You can refer to [Introduction to Synchronous Dump Data File](#introduction-to-data-object-directory-and-data-file). 
diff --git a/docs/mindspore/source_zh_cn/model_train/debug/dump.md b/docs/mindspore/source_zh_cn/model_train/debug/dump.md index 2a1ab9ac1b..e6af0d641e 100644 --- a/docs/mindspore/source_zh_cn/model_train/debug/dump.md +++ b/docs/mindspore/source_zh_cn/model_train/debug/dump.md @@ -280,9 +280,9 @@ Ascend后端异步Dump支持情况如下表(GPU/CPU后端不支持)。 - {iteration_id}/ {op_type}.{op_name}.json statistic.csv - {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy + {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy - constants/ - Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.npy + Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.{dtype}.npy ... - graphs/ ms_output_trace_code_graph_{graph_id}.pb @@ -305,31 +305,32 @@ Ascend后端异步Dump支持情况如下表(GPU/CPU后端不支持)。 - `input_output_index`:输入或输出标号,例如`output.0`表示该文件是该算子的第1个输出Tensor的数据。 - `slot`:slot标号。 - `format`: 数据格式。 +- `dtype`: 原始的数据类型。如果是`bfloat16`或`int4`类型,保存在`.npy`文件中的数据会分别被转换成`float32`或`int8`。 - `data_id`: 常量数据标号。 对于多图网络,由于存在控制流,某些子图可能不会被执行,Dump只保存执行过的节点,所以graphs目录下`.pb`文件名中的{graph_id}并不一定在{net_name}下存在对应的{graph_id}目录。 -只当`saved_data`为"statistic"或者"full"时,才会生成`statistic.csv`,当`saved_data`为"tensor"或者"full"时,才会生成`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`命名的完整张量信息。 +只当`saved_data`为"statistic"或者"full"时,才会生成`statistic.csv`,当`saved_data`为"tensor"或者"full"时,才会生成`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy`命名的完整张量信息。 只当`save_kernel_args`为`True`时,才会生成`{op_type}.{op_name}.json`,保存算子的初始化信息。 同步Dump生成的数据文件是后缀名为`.npy`的文件,文件命名格式为: ```text -{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy +{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy ``` 
同步Dump生成的常量数据文件与其他数据文件格式相同,而所有常量数据的{op_type},{task_id},{stream_id},{input_output_index},{slot},{format}不变。注意,非Tensor类型数据不会被生成数据文件。该功能不支持Ascend场景。 ```text -Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.npy +Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.{dtype}.npy ``` {iteration_id}目录下也可能会保存Parameter开头的文件(weight, bias等参数会保存成Parameter开头的文件),Ascend上不会保存Parameter文件。 可以用Numpy的`numpy.load`接口读取数据。 -同步Dump生成的统计数据文件名为`statistic.csv`,此文件存有相同目录下所有落盘张量(文件名为`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`)的统计信息。每个张量一行,每行有张量的 Op Type,Op Name,Task ID,Stream ID,Timestamp,IO,Slot,Data Size,Data Type,Shape以及用户配置的统计信息项。注意,如果用Excel来打开此文件,数据可能无法正确显示。请用`vi`、`cat`等命令查看,或者使用Excel自文本导入csv查看。 +同步Dump生成的统计数据文件名为`statistic.csv`,此文件存有相同目录下所有落盘张量(文件名为`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy`)的统计信息。每个张量一行,每行有张量的 Op Type,Op Name,Task ID,Stream ID,Timestamp,IO,Slot,Data Size,Data Type,Shape以及用户配置的统计信息项。注意,如果用Excel来打开此文件,数据可能无法正确显示。请用`vi`、`cat`等命令查看,或者使用Excel自文本导入csv查看。 同步Dump生成的最终执行图文件后缀名分别为`.pb`和`.ir`,文件命名格式为: @@ -485,13 +486,13 @@ x, w], pri_format: NC1HWC0, pad: (0, 0, 0, 0), visited: true, pad_mod: same, - `slot`:0,该算子的输出只有一个slot。 在Dump保存的数据对象文件目录下搜索到相应的文件名: -`Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.npy`。 +`Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.float16.npy`。 还原数据的时候,通过执行: ```python import numpy -numpy.load("Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.npy") +numpy.load("Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.float16.npy") ``` 生成numpy.array数据。 @@ -642,7 +643,7 @@ MindSpore通过异步Dump提供了Ascend平台上大型网络的调试能力。 若配置文件中`file_format`值设置为`npy`,算子文件会保存成npy格式的文件,溢出文件会被保存成json格式的文件。文件命名格式分别为: ```text -{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy 
+{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy Opdebug.Node_OpDebug.{task_id}.{stream_id}.{timestamp}.output.0.json ``` @@ -668,7 +669,7 @@ Dump生成的原始数据文件也可以使用MindSpore Insight的数据解析 若配置`file_format`值为`npy`,则启用异步dump生成的数据文件命名规则与同步Dump相同,可以参考[同步Dump数据文件介绍](#数据对象目录和数据文件介绍),溢出检测生成的溢出文件是`json`格式,溢出文件内容解析可参考[解析算子溢出数据文件](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devguide/appdevg/aclpythondevg/aclpythondevg_0078.html#ZH-CN_TOPIC_0000001781325073__section6864050111619) 。 -选项`saved_data`只有在`file_format`为"npy"的时候生效。如`saved_data`是"statistic"或者"full"。张量统计数据会落盘到`statistic.csv`。如`saved_data`是"tensor"或者"full"完整张量数据会落盘到`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`。`statistic.csv`的格式与同步Dump相同,可以参考[同步Dump数据文件介绍](#数据对象目录和数据文件介绍)。 +选项`saved_data`只有在`file_format`为"npy"的时候生效。如`saved_data`是"statistic"或者"full"。张量统计数据会落盘到`statistic.csv`。如`saved_data`是"tensor"或者"full"完整张量数据会落盘到`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy`。`statistic.csv`的格式与同步Dump相同,可以参考[同步Dump数据文件介绍](#数据对象目录和数据文件介绍)。 ### 数据分析样例 -- Gitee From c458cd4c8e82ea0415ac909e647639943593f0a7 Mon Sep 17 00:00:00 2001 From: liuzihan000 Date: Wed, 14 Aug 2024 16:09:33 +0800 Subject: [PATCH 3/7] dump doc update --- .../source_en/model_train/debug/dump.md | 683 +++++++++++------ .../source_zh_cn/model_train/debug/dump.md | 693 ++++++++++++------ 2 files changed, 943 insertions(+), 433 deletions(-) diff --git a/docs/mindspore/source_en/model_train/debug/dump.md b/docs/mindspore/source_en/model_train/debug/dump.md index 11bff2d1dd..9d7906ef77 100644 --- a/docs/mindspore/source_en/model_train/debug/dump.md +++ b/docs/mindspore/source_en/model_train/debug/dump.md @@ -4,166 +4,110 @@ The input and output of the operator can be saved for debugging through the data dump when the training result deviates from the expectation. 
-- For dynamic graph mode, the forward process can utilize Python's native execution capabilities, allowing users to view and record the corresponding inputs and outputs during the execution of the network script. The JIT and backward processes, which are part of graph compilation, can use synchronous dump functionality to save the input and output data of operators to disk files. +- For dynamic graph mode, the forward process can utilize Python's native execution capabilities, allowing users to view and record the corresponding inputs and outputs during the execution of the network script. The JIT and backward processes, which are part of graph compilation, can use Ascend O0/O1 functionality to save the input and output data of operators to disk files. - For the static graph mode, MindSpore provides the Dump function to save the graph and the input and output data of the operator during model training to a disk file. -MindSpore provides two Dump modes: +In different modes, the Dump features supported by MindSpore are not entirely the same, and the required configuration files and the generated data formats vary accordingly. Therefore, you need to select the corresponding Dump configuration based on the running mode: -- Synchronous Dump: After the operator is dispatched, the Host side performs stream synchronization, initiates data copying from the Device side, and saves it to a file. -- Asynchronous Dump: Specifically developed for Ascend. After the operator execution is completed, the Device side actively initiates data dumping to disk. 
+- [Dump in Ascend O0/O1 Mode](#dump-in-ascend-o0o1-mode) +- [Dump in Ascend O2 Mode](#dump-in-ascend-o2-mode) +- [Dump in CPU/GPU mode](#dump-in-cpugpu-mode) -> Different modes require different configuration files, and the generated data formats also differ: +> - The differences between Ascend O0, O1, and O2 modes can be found in [the parameter jit_level of the set_context method](https://www.mindspore.cn/docs/en/r2.3.1/api_python/mindspore/mindspore.set_context.html). > -> - For GPU/CPU backends and Ascend backend with compilation levels O0/O1, it is recommended to use [synchronous dump](#synchronous-dump). For details, refer to [synchronous dump step](https://www.mindspore.cn/docs/en/master/model_train/debug/dump.html#dump-step). For Ascend backend with compilation level O2, it is recommended to use [asynchronous dump](#asynchronous-dump). For details, refer to [asynchronous dump step](https://www.mindspore.cn/docs/en/master/model_train/debug/dump.html#dump-step-1). -> - Currently, Dump does not support heterogeneous training. If Dump is enabled in a heterogeneous training scenario, the generated Dump data object directory may not match the expected directory structure. +> - Dumping constant data is only supported in CPU/GPU mode, while not supported in Ascend O0/O1/O2 mode. +> +> - In Ascend O2 mode, Dump supports both .npy and .bin file formats for data, while other modes only support the .npy file format for Dump data. +> +> - Currently, Dump does not support heterogeneous training, meaning it does not support CPU/Ascend mixed training or GPU/Ascend mixed training. -The support for Synchronous Dump on Ascend backend is shown in the table below (GPU/CPU backend refers to the `O0/O1`) +MindSpore supports different Dump functionalities under various modes, as shown in the following table: - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Synchronous Dump FeatureO0/O1O2Ascend O0/Ascend O1Ascend O2CPU/GPU
Full Dump Full network data dump SupportedNot Supported
Partial Data DumpStatistics DumpSupports both host and device modes1Not Supported
Data Sampling DumpSupported2Not Supported
Overflow DumpDump overflow operatorsSupported2Not Supported
Conditional DumpSpecify Operator NameSupportedNot Supported
Specify IterationSupportedNot Supported
Specify DeviceSupportedNot Supported
Specify File FormatNot ApplicableNot Supported
set_dumpSupported2Not Supported
Auxiliary Information DumpGraph IR DumpSupportedNot Supported
Execution Sequence DumpSupportedNot Supported
- -> 1. In terms of statistics, the computing speed of the device is faster than that of the host(currently only supported on Ascend backend), but the host has more statistical indicators than the device. Refer to the `statistic_category` option for details. -> 2. Only supported on the Ascend backend. - -The support for Asynchronous Dump on Ascend backend is shown in the table below (not supported on GPU/CPU backend). - - - - - - - - - - - - - + + + + - + + - + + + + + - + + + - + + - +
Asynchronous DumpFeatureO0/O1O2
Full DumpFull network data dump Supported, but without full_name information Supported
Partial Data Dump Statistics DumpHost mode onlyHost mode onlySupports both host and device modes1Supports only host modeNot Supported On CPU, GPU Supports only host mode
Data Sampling DumpSupported Not Supported Not Supported
Overflow Dump Dump overflow operatorsNot Supported SupportedSupportedNot Supported
Conditional Dump Specify Operator NameNot SupportedSupportedSupported Supported
Specify Iteration Supported SupportedSupported
Specify Device Supported SupportedSupported
Specify File FormatNot Applicable SupportedSupportedNot Applicable
set_dumpSupported Not Supported Not Supported
Auxiliary Information Dump Graph IR DumpSupported Not SupportedNot SupportedSupported
Execution Sequence DumpSupported Not SupportedNot SupportedSupported
-## Synchronous Dump +> 1. In terms of statistics, the computing speed of the device is faster than that of the host(currently only supported on Ascend backend), but the host has more statistical indicators than the device. Refer to the `statistic_category` option for details. +> 2. Only supported on the Ascend backend. + +## Dump in Ascend O0/O1 Mode ### Dump Step @@ -192,43 +136,49 @@ The support for Asynchronous Dump on Ascend backend is shown in the table below } ``` - - `op_debug_mode`: This attribute is used for operator overflow or operator exception debugging. 0: save all operators or specified operators; 3: only save overflow operators; 4: only save input of the exception operator. Set it to 0 when the data is dumped. If it is not set to 0, only the data of the overflow operator or exception operator will be dumped. Default: 0. - - `dump_mode`: 0: all operator data in the network dumped out; 1: the operator data specified in Dump `"kernels"`; 2: dump target and its contents using [mindspore.set_dump](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.set_dump.html). Specified data dump is supported only when "dump_mode' is set to `0`. - - `path`: The absolute path to Dump saved data. - - `net_name`: The customized net name: "ResNet50". - - `iteration`: Specify the iterations of data required to be dumped, type is string. Use "|" to separate the step data of different intervals to be saved. For example, "0 | 5-8 | 100-120" represents dump the data of the 1st, 6th to 9th, and 101st to 121st steps. If iteration set to "all", data of every iteration will be dumped. Specified iteration dump is supported only when "op_debug_mode" is set to `0` or `3`, not supported when when "op_debug_mode" is set to `4`. - - `saved_data`: Specify what data is to be dumped, type is string. Use "tensor" to indicate complete tensor data Dumped, use "statistic" to dump tensor statistics, use "full" to dump both tensor data and statistics. 
Synchronous statistics dump is only supported on GPU and Ascend. Using "statistic" or "full" on CPU will result in exception. Default setting is "tensor". Statistic dump is only supported when "op_debug_mode" is set to `0`. - - `input_output`: 0: dump input and output of kernel, 1:dump input of kernel, 2:dump output of kernel. Only input of kernel can be saved when "op_debug_mode" is set to `4`. - - `kernels`: This item can be configured in three formats: - 1. List of operator names. Turn on the IR save switch `set_context(save_graphs=2)` and execute the network to obtain the operator name from the generated `trace_code_graph_{graph_id}`IR file. For details, please refer to [Saving IR](https://www.mindspore.cn/docs/en/master/model_train/debug/error_analysis/mindir.html#saving-ir). - Note that whether setting `set_context(save_graphs=2)` may cause the different IDs of the same operator, so when dump specified operators, keep this setting unchanged after obtaining the operator name. Or you can obtain the operator names from the file `ms_output_trace_code_graph_{graph_id}.ir` saved by Dump. Refer to [Synchronous Dump Data Object Directory](https://www.mindspore.cn/docs/en/master/model_train/debug/dump.html#introduction-to-data-object-directory-and-data-file). - 2. You can also specify an operator type. When there is no operator scope information or operator id information in the string, the background considers it as an operator type, such as "conv". The matching rule of operator type is: when the operator name contains an operator type string, the matching is considered successful (case insensitive). For example, "conv" can match operators "Conv2D-op1234" and "Conv3D-op1221". - 3. Regular expressions are supported. When the string conforms to the format of "name-regex(xxx)", it would be considered a regular expression. For example, "name-regex(Default/.+)" can match all operators with names starting with "Default/". 
- - `support_device`: Supported devices, default setting is `[0,1,2,3,4,5,6,7]`. You can specify specific device ids to dump specific device data. This configuration parameter is invalid on the CPU, because there is no concept of device on the CPU, but it is still need to reserve this parameter in the json file. - - `statistic_category`: This attribute is used by users to configure the category of statistical information to be saved, and only takes effect when saving statistical information is enabled(i.e.`saved_data` is set to `statistic` or `full`). The type is a string list, where the optional values of the strings are as follows: - - - "max": represents the maximum value of the elements in tensor, supporting both device and host statistics; - - "min": represents the minimum value of the elements in tensor, supporting both device and host statistics; - - "avg": represents the average value of elements in tensor, supporting device and host statistics; - - "count": represents the number of the elements in tensor; - - "negative zero count": represents the number of the elements which is less then zero in tensor; - - "positive zero count": represents the number of the elements which is greater then zero in tensor; - - "nan count": represents the number of `Nan` elements in the tensor; - - "negative inf count": represents the number of `-Inf` elements in the tensor; - - "positive inf count": represents the number of `+Inf` elements in the tensor; - - "zero count": represents the number of zero elements in the tensor; - - "md5": represents the MD5 value of the tensor; - - "l2norm": represents L2Norm value of the tensor, supporting both device and host statistics. - - Except for those marked as supporting device statistics, other statistics can be collected only on the host. - This field is optional, with default values of ["max", "min", "l2norm"]. - - - `overflow_number`:Specify the number of data to overflow dump. 
This field is required only when `op_debug_mode` is set to 3 and only the overflow operator is saved. It can control the overflow data to be dumped in chronological order until the specified value is reached, and the overflow data will no longer be dumped. The default value is 0, which means dumping all overflow data. - - `enable`: When set to true, enable Synchronous Dump. When set to false, asynchronous dump will be used on Ascend and synchronous dump will still be used on GPU. - - `trans_flag`: Enable trans flag. Transform the device data format into NCHW. If it is `True`, the data will be saved in the 4D format (NCHW) format on the Host side; if it is `False`, the data format on the Device side will be retained. Default: `True`. - - `stat_calc_mode`: Select the backend for statistical calculations. Options are "host" and "device". Choosing "device" enables device computation of statistics, currently only effective on Ascend, and supports only min/max/avg/l2norm statistics. - - `sample_mode`: Setting it to 0 means the sample dump function is not enabled. Enable the sample dump function in graph compilation with optimization level O0 or O1. This field is effective only when "op_debug_mode" is set to `0`, sample dump cannot be enabled in other scene. - - `sample_num`: Used to control the size of sample in sample dump. The default value is 100. + - `common_dump_settings`: + + - `op_debug_mode`: This attribute is used for operator overflow or operator exception debugging. 0: save all operators or specified operators; 3: only save overflow operators; 4: only save input of the exception operator. Set it to 0 when the data is dumped. If it is not set to 0, only the data of the overflow operator or exception operator will be dumped. Default: 0. 
+ - `dump_mode`: 0: all operator data in the network dumped out; 1: the operator data specified in Dump `"kernels"`; 2: dump target and its contents using [mindspore.set_dump](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.set_dump.html). Specified data dump is supported only when "dump_mode" is set to `0`.
+ - `path`: The absolute path to Dump saved data.
+ - `net_name`: The customized net name: "ResNet50".
+ - `iteration`: Specify the iterations of data required to be dumped, type is string. Use "|" to separate the step data of different intervals to be saved. For example, "0 | 5-8 | 100-120" represents dump the data of the 1st, 6th to 9th, and 101st to 121st steps. If iteration is set to "all", data of every iteration will be dumped. Specified iteration dump is supported only when "op_debug_mode" is set to `0` or `3`, not supported when "op_debug_mode" is set to `4`.
+ - `saved_data`: Specify what data is to be dumped, type is string. Use "tensor" to dump complete tensor data, use "statistic" to dump tensor statistics, use "full" to dump both tensor data and statistics. Default setting is "tensor". Statistic dump is only supported when "op_debug_mode" is set to `0`.
+ - `input_output`: 0: dump input and output of kernel, 1: dump input of kernel, 2: dump output of kernel. Only input of kernel can be saved when "op_debug_mode" is set to `4`.
+ - `kernels`: This item can be configured in three formats:
+ 1. List of operator names. Turn on the IR save switch `set_context(save_graphs=2)` and execute the network to obtain the operator name from the generated `trace_code_graph_{graph_id}` IR file. For details, please refer to [Saving IR](https://www.mindspore.cn/docs/en/master/model_train/debug/error_analysis/mindir.html#saving-ir).
+ Note that setting `set_context(save_graphs=2)` may change the IDs of the same operator, so when dumping specified operators, keep this setting unchanged after obtaining the operator name. 
Or you can obtain the operator names from the file `ms_output_trace_code_graph_{graph_id}.ir` saved by Dump. Refer to [Ascend O0/O1 Dump Data Object Directory](#introduction-to-data-object-directory-and-data-file). + 2. You can also specify an operator type. When there is no operator scope information or operator id information in the string, the background considers it as an operator type, such as "conv". The matching rule of operator type is: when the operator name contains an operator type string, the matching is considered successful (case insensitive). For example, "conv" can match operators "Conv2D-op1234" and "Conv3D-op1221". + 3. Regular expressions are supported. When the string conforms to the format of "name-regex(xxx)", it would be considered a regular expression. For example, "name-regex(Default/.+)" can match all operators with names starting with "Default/". + - `support_device`: Supported devices, default setting is `[0,1,2,3,4,5,6,7]`. You can specify specific device ids to dump specific device data. This configuration parameter is invalid on the CPU, because there is no concept of device on the CPU, but it is still need to reserve this parameter in the json file. + - `statistic_category`: This attribute is used by users to configure the category of statistical information to be saved, and only takes effect when saving statistical information is enabled(i.e.`saved_data` is set to `statistic` or `full`). 
The type is a string list, where the optional values of the strings are as follows:
+
+ - "max": represents the maximum value of the elements in tensor, supporting both device and host statistics;
+ - "min": represents the minimum value of the elements in tensor, supporting both device and host statistics;
+ - "avg": represents the average value of elements in tensor, supporting device and host statistics;
+ - "count": represents the number of the elements in tensor;
+ - "negative zero count": represents the number of the elements which is less than zero in tensor;
+ - "positive zero count": represents the number of the elements which is greater than zero in tensor;
+ - "nan count": represents the number of `Nan` elements in the tensor;
+ - "negative inf count": represents the number of `-Inf` elements in the tensor;
+ - "positive inf count": represents the number of `+Inf` elements in the tensor;
+ - "zero count": represents the number of zero elements in the tensor;
+ - "md5": represents the MD5 value of the tensor;
+ - "l2norm": represents L2Norm value of the tensor, supporting both device and host statistics.
+
+ Except for those marked as supporting device statistics, other statistics can be collected only on the host.
+ This field is optional, with default values of ["max", "min", "l2norm"].
+
+ - `overflow_number`: Specify the number of data to overflow dump. This field is required only when `op_debug_mode` is set to 3 and only the overflow operator is saved. It can control the overflow data to be dumped in chronological order until the specified value is reached, and the overflow data will no longer be dumped. The default value is 0, which means dumping all overflow data.
+
+ - `e2e_dump_settings`:
+
+ - `enable`: When set to true, enable Synchronous Dump. When set to false or not set, Asynchronous Dump will be used on Ascend. The main difference between the two is that Asynchronous Dump has less impact on the original code execution order. 
+ - `trans_flag`: Enable trans flag. Transform the device data format into NCHW. If it is `True`, the data will be saved in the 4D format (NCHW) format on the Host side; if it is `False`, the data format on the Device side will be retained. Default: `True`. + - `stat_calc_mode`: Select the backend for statistical calculations. Options are "host" and "device". Choosing "device" enables device computation of statistics, currently only effective on Ascend, and supports only min/max/avg/l2norm statistics. + - `sample_mode`(Optional): Setting it to 0 means the sample dump function is not enabled. Enable the sample dump function in graph compilation with optimization level O0 or O1. This field is effective only when "op_debug_mode" is set to `0`, sample dump cannot be enabled in other scene. + - `sample_num`(Optional): Used to control the size of sample in sample dump. The default value is 100. + - `save_kernel_args`(Optional): When set to true, the initialization information of kernels will be saved. 2. Set Dump environment variable. @@ -260,16 +210,15 @@ The support for Asynchronous Dump on Ascend backend is shown in the table below 3. Execute the training script to dump data. After the training is started, if the `MINDSPORE_DUMP_CONFIG` environment variable is correctly configured, the content of the configuration file will be read and the operator data will be saved according to the data storage path specified in the Dump configuration. - In synchronous mode, if you want to dump data in GPU environment, you must use the non-data sink mode (set the `dataset_sink_mode` parameter in `model.train` or `DatasetHelper` to `False`) to ensure that you can get the dump data of each step. If `model.train` or `DatasetHelper` is not called in the script, the default is non-data sinking mode. Using the Dump function will automatically generate the IR file of the final execution graph. 
You can set `set_context(reserve_class_name_in_scope=False)` in your training script to avoid dump failure because of file name is too long. -4. Read and parse synchronous dump data through `numpy.load`, refer to [Introduction to Synchronous Dump Data File](#introduction-to-data-object-directory-and-data-file). +4. Read and parse dump data through `numpy.load`, refer to [Introduction to Ascend O0/O1 Dump Data File](#introduction-to-data-object-directory-and-data-file). ### Introduction to Data Object Directory and Data File -After starting the training, the data objects saved by the synchronous Dump include the final execution graph (`ms_output_trace_code_graph_{graph_id}.ir` file) and the input and output data of the operators in the graph. The data directory structure is as follows: +After starting the training, the data objects saved under the Ascend O0/O1 Dump mode include the final execution graph (`ms_output_trace_code_graph_{graph_id}.ir` file) and the input and output data of the operators in the graph. The data directory structure is as follows: ```text {path}/ @@ -313,26 +262,28 @@ For multi-graph networks, due to the control flow, some subgraphs may not be exe Only when `saved_data` is "statistic" or "full", `statistic.csv` is generated. Only when `saved_data` is "tensor" or "full", `{op_type}. {op_name}. {task_id}. {stream_id}. {timestamp}. {input_output_index}. {slot}. {format}.{dtype}.npy` named complete tensor information is generated. Only when `save_kernel_args` is `True`, `{op_type}.{op_name}.json` is generated and the params of the corresponding operators is saved. +When `save_kernel_args` is set to `True`, a JSON file named `{op_type}.{op_name}.json` will be generated, which saves the initialization information of the operator. The internal format of this JSON file contains the corresponding values of each initialization parameter of the operator. 
For example, for the `Matmul` operator, the JSON information would look like this: -The data file generated by the synchronous Dump is a binary file with the suffix `.npy`, and the file naming format is: - -```text -{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy +```json +{ + "transpose_a": "False", + "transpose_b": "False" +} ``` -The constant data file generated by the synchronous Dump is in the same format as data file, whereas {op_type}, {task_id}, {stream_id}, {input_output_index}, {slot}, {format} are unchanged for all constant data. Note, non-Tensor type will not generate data file. This function is not supported in the Ascend scenario. +This JSON indicates that both initialization parameters `transpose_a` and `transpose_b` of the `Matmul` operator have the value `False`. + +The data file generated by the Ascend O0/O1 Dump is a binary file with the suffix `.npy`, and the file naming format is: ```text -Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.{dtype}.npy +{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy ``` -The {iteration_id} directory may also save files starting with `Parameter` (parameters such as weight and bias will be saved as files starting with `Parameter`), while `Parameter` files will not be saved on Ascend. - User can use Numpy interface `numpy.load` to read the data. -The statistics file generated by the synchronous dump is named `statistic.csv`. This file stores key statistics for all tensors dumped under the same directory as itself (with the file names `{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy`). Each row in `statistic.csv` summarizes a single tensor, each row contains the statistics: Op Type, Op Name, Task ID, Stream ID, Timestamp, IO, Slot, Data Size, Data Type, Shape, and statistics items configured by the user. 
Note that opening this file with Excel may cause data to be displayed incorrectly. Please use commands like `vi` or `cat`, or use Excel to import csv from text for viewing. +The statistics file generated by the Ascend O0/O1 dump is named `statistic.csv`. This file stores key statistics for all tensors dumped under the same directory as itself (with the file names `{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`). Each row in `statistic.csv` summarizes a single tensor, each row contains the statistics: Op Type, Op Name, Task ID, Stream ID, Timestamp, IO, Slot, Data Size, Data Type, Shape, and statistics items configured by the user. Note that opening this file with Excel may cause data to be displayed incorrectly. Please use commands like `vi` or `cat`, or use Excel to import csv from text for viewing. -The suffixes of the final execution graph files generated by synchronous Dump are `.pb` and `.ir` respectively, and the file naming format is: +The suffixes of the final execution graph files generated by Ascend O0/O1 Dump are `.pb` and `.ir` respectively, and the file naming format is: ```text ms_output_trace_code_graph_{graph_id}.pb @@ -341,7 +292,7 @@ ms_output_trace_code_graph_{graph_id}.ir The files with the suffix `.ir` can be opened and viewed by the `vi` command. -The suffix of the node execution sequence file generated by the synchronous Dump is `.csv`, and the file naming format is: +The suffix of the node execution sequence file generated by the Ascend O0/O1 Dump is `.csv`, and the file naming format is: ```text ms_execution_order_graph_{graph_id}.csv @@ -359,7 +310,7 @@ This file stores the list of iterations in which the graph was executed. 
After t ### Data Analysis Sample -In order to better demonstrate the process of using dump to save and analyze data, we provide a set of [complete sample script](https://gitee.com/mindspore/docs/tree/master/docs/sample_code/dump) , you only need to execute `bash dump_sync_dump.sh` for synchronous dump. +In order to better demonstrate the process of using dump to save and analyze data, we provide a set of [complete sample script](https://gitee.com/mindspore/docs/tree/master/docs/sample_code/dump) , you only need to execute `bash dump_sync_dump.sh` for Ascend O0/O1 dump. After the graph corresponding to the script is saved to the disk through the Dump function, the final execution graph file `ms_output_trace_code_graph_{graph_id}.ir` will be generated. This file saves the stack information of each operator in the corresponding graph, and records the generation script corresponding to the operator. @@ -497,9 +448,7 @@ numpy.load("Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.float Generate the numpy.array data. -## Asynchronous Dump - -MindSpore provides debugging capabilities for large networks through asynchronous dumps on Ascend. +## Dump in Ascend O2 Mode ### Dump Step @@ -526,36 +475,38 @@ MindSpore provides debugging capabilities for large networks through asynchronou } ``` - - `op_debug_mode`: This attribute is used for operator overflow debugging. 0: disable overflow check function; 3: enable overflow check function; 4: enable the lightweight exception dump function. Set it to 0 when Dump data is processed. If it is not set to 0, only the data of the overflow operator or exception operator will be dumped. - - `dump_mode`: 0: all operator data in the network dumped out; 1: dump kernels data in kernels list. When overflow detection is enabled, the setting of this field becomes invalid, and Dump only saves the data of the overflow node. Specified data dump is supported only when "dump_mode' is set to `0`. 
- - `path`: The absolute path to save Dump data. When the graph compilation level is O0, MindSpore will create a new subdirectory for each step in the path directory. - - `net_name`: The customized net name: "ResNet50". - - `iteration`: Specify the iterations to dump, type is string. Use "|" to separate the step data of different intervals to be saved. For example, "0 | 5-8 | 100-120" represents dump the data of the 1st, 6th to 9th, and 101st to 121st steps. If iteration set to "all", data of every iteration will be dumped. Specified iteration dump is supported only when "op_debug_mode" is set to `0`, not supported when when "op_debug_mode" is set to `3` or `4`. - - `saved_data`: Specify what data is to be dumped, type is string. Use "tensor" to dump tensor data, use "statistic" to dump tensor statistics, use "full" to dump both tensor data and statistics. Default setting is "tensor". Asynchronous statistics dump is only supported when `file_format` is set to `npy`, using "statistic" or "full" when `file_format` is set to `bin` will result in exception. Statistic dump is only supported when "op_debug_mode" is set to `0`. - - `input_output`: When set to 0, it means to Dump the operator's input and output; when set to 1, it means to Dump the operator's input; setting it to 2 means to Dump the output of the operator. - - `kernels`: This item can be configured in two formats: - 1. List of operator names. Specifying operator needs to first set the environment variable for saving the graph file to save the graph, and then obtain the operator name from the saved graph file. 
Please refer to the documentation on Ascend Developer Zone [DUMP_GE_GRAPH](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000050.html) , [DUMP_GRAPH_LEVEL](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000051.html) and [DUMP_GRAPH_PATH](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000052.html) for details about the environment variable for saving the graph file. - 2. Regular expressions of operator names. When the string conforms to the format of "name-regex(xxx)", it would be considered a regular expression. For example, "name-regex(Default/.+)" can match all operators with names starting with "Default/". - - `support_device`: Supported devices, default setting is `[0,1,2,3,4,5,6,7]`. You can specify specific device ids to dump specific device data. - - `statistic_category`: This attribute is used by users to configure the category of statistical information to be saved, and only takes effect when saving statistical information is enabled(i.e.`saved_data` is set to `statistic` or `full`). 
The type is a string list, where the optional values of the strings are as follows: - - - "max": represents the maximum value of the elements in tensor; - - "min": represents the minimum value of the elements in tensor; - - "avg": represents the average value of elements in tensor; - - "count": represents the number of the elements in tensor; - - "negative zero count": represents the number of the elements which is less then zero in tensor; - - "positive zero count": represents the number of the elements which is greater then zero in tensor; - - "nan count": represents the number of `Nan` elements in the tensor; - - "negative inf count": represents the number of `-Inf` elements in the tensor; - - "positive inf count": represents the number of `+Inf` elements in the tensor; - - "zero count": represents the number of zero elements in the tensor; - - "md5": represents the MD5 value of the tensor; - - "l2norm": represents L2Norm value of the tensor. - - This field is optional, with default values of ["max", "min", "l2norm"]. - - - `file_format`: Dump file type. It can be either `npy` and `bin`. `npy`: data will be dumped in npy files as host format. `bin`: data will be dumped in protobuf file as device format and need to be transformed to parse using the provided data analysis tool. Please refer to [Asynchronous Dump Data Analysis Sample](#data-analysis-sample-1) for details. The default value is `bin`. - - `overflow_number`:Specify the number of data to overflow dump. This field is required only when `op_debug_mode` is set to 3 and `file_format` is set to `npy`. It can control the overflow data to be dumped in chronological order until the specified value is reached, and the overflow data will no longer be dumped. The default value is 0, which means dumping all overflow data. + - `common_dump_settings`: + + - `op_debug_mode`: This attribute is used for operator overflow debugging. 
0: disable overflow check function; 3: enable overflow check function; 4: enable the lightweight exception dump function. Set it to 0 when Dump data is processed. If it is not set to 0, only the data of the overflow operator or exception operator will be dumped.
+ - `dump_mode`: 0: all operator data in the network dumped out; 1: dump kernels data in kernels list. When overflow detection is enabled, the setting of this field becomes invalid, and Dump only saves the data of the overflow node. Specified data dump is supported only when "dump_mode" is set to `0`.
+ - `path`: The absolute path to save Dump data.
+ - `net_name`: The customized net name: "ResNet50".
+ - `iteration`: Specify the iterations to dump, type is string. Use "|" to separate the step data of different intervals to be saved. For example, "0 | 5-8 | 100-120" represents dump the data of the 1st, 6th to 9th, and 101st to 121st steps. If iteration is set to "all", data of every iteration will be dumped. Specified iteration dump is supported only when "op_debug_mode" is set to `0`, not supported when "op_debug_mode" is set to `3` or `4`.
+ - `saved_data`: Specify what data is to be dumped, type is string. Use "tensor" to dump tensor data, use "statistic" to dump tensor statistics, use "full" to dump both tensor data and statistics. Default setting is "tensor". In Ascend O2 mode, statistics dump is only supported when `file_format` is set to `npy`; using "statistic" or "full" when `file_format` is set to `bin` will result in an exception. Statistic dump is only supported when "op_debug_mode" is set to `0`.
+ - `input_output`: When set to 0, it means to Dump the operator's input and output; when set to 1, it means to Dump the operator's input; setting it to 2 means to Dump the output of the operator.
+ - `kernels`: This item can be configured in two formats:
+ 1. List of operator names. 
Specifying operator needs to first set the environment variable for saving the graph file to save the graph, and then obtain the operator name from the saved graph file. Please refer to the documentation on Ascend Developer Zone [DUMP_GE_GRAPH](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000050.html) , [DUMP_GRAPH_LEVEL](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000051.html) and [DUMP_GRAPH_PATH](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000052.html) for details about the environment variable for saving the graph file. + 2. Regular expressions of operator names. When the string conforms to the format of "name-regex(xxx)", it would be considered a regular expression. For example, "name-regex(Default/.+)" can match all operators with names starting with "Default/". + - `support_device`: Supported devices, default setting is `[0,1,2,3,4,5,6,7]`. You can specify specific device ids to dump specific device data. + - `statistic_category`: This attribute is used by users to configure the category of statistical information to be saved, and only takes effect when saving statistical information is enabled(i.e.`saved_data` is set to `statistic` or `full`). 
The type is a string list, where the optional values of the strings are as follows:
+
+ - "max": represents the maximum value of the elements in tensor;
+ - "min": represents the minimum value of the elements in tensor;
+ - "avg": represents the average value of elements in tensor;
+ - "count": represents the number of the elements in tensor;
+ - "negative zero count": represents the number of the elements which is less than zero in tensor;
+ - "positive zero count": represents the number of the elements which is greater than zero in tensor;
+ - "nan count": represents the number of `Nan` elements in the tensor;
+ - "negative inf count": represents the number of `-Inf` elements in the tensor;
+ - "positive inf count": represents the number of `+Inf` elements in the tensor;
+ - "zero count": represents the number of zero elements in the tensor;
+ - "md5": represents the MD5 value of the tensor;
+ - "l2norm": represents L2Norm value of the tensor.
+
+ This field is optional, with default values of ["max", "min", "l2norm"].
+
+ - `file_format`: Dump file type. It can be either `npy` or `bin`. `npy`: data will be dumped in npy files as host format. `bin`: data will be dumped in protobuf file as device format and need to be transformed to parse using the provided data analysis tool. Please refer to [Ascend O2 Mode Dump Data Analysis Sample](#data-analysis-sample-1) for details. The default value is `bin`.
+ - `overflow_number`: Specify the number of data to overflow dump. This field is required only when `op_debug_mode` is set to 3 and `file_format` is set to `npy`. It can control the overflow data to be dumped in chronological order until the specified value is reached, and the overflow data will no longer be dumped. The default value is 0, which means dumping all overflow data.

 2. Set Dump environment variable. 
@@ -578,7 +529,7 @@ MindSpore provides debugging capabilities for large networks through asynchronou You can set `set_context(reserve_class_name_in_scope=False)` in your training script to avoid dump failure because of file name is too long. -4. Refer to [Asynchronous Dump Data Analysis Sample](#data-analysis-sample-1) to analyze the Dump data file. +4. Refer to [Ascend O2 Mode Dump Data Analysis Sample](#data-analysis-sample-1) to analyze the Dump data file. > - If you need to dump all or part of the operator, you can modify the `dump_mode` option in the json configuration file to 0 or 1. > - Due to the slow Dump speed, enabling Dump in large model scenarios can extend the communication interval between different cards, leading to communication operator timeouts. This issue can be resolved by adjusting the timeout duration for the communication operators. For the Ascend backend, you can set the HCCL_EXEC_TIMEOUT environment variable. For detailed instructions, please refer to the [Ascend CANN documentation](https://www.hiascend.com/document/detail/zh/canncommercial/80RC1/apiref/envvar/envref_07_0072.html). @@ -602,27 +553,6 @@ When the graph compilation level is not O0 or O1, the Dump directory structure i acl_dump_{device_id}.json ``` -When the graph compilation level is O0 or O1, the Dump directory structure is as follows. 
In this scenario, the dump files for aclop and aclnn operators will be saved in {device_id} directory, and the dump files for communication operators such as "ResuceSum" will be saved in {iteration_id} directory: - -```text -{path}/ - - {step_id}/ - - {time}/ - - {device_id}/ - - {model_name}/ - - {model_id}/ - - {iteration_id}/ - statistic.csv - {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp} //aclop ops - {op_name}.{op_type}.{task_id}.{stream_id}.{timestamp} //aclnn ops - mapping.csv - statistic.csv - {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp} //aclop ops - {op_name}.{op_type}.{task_id}.{stream_id}.{timestamp} //aclnn ops - mapping.csv - acl_dump_{device_id}.json -``` - - `path`: the absolute path set in the `data_dump.json` configuration file. - `device_id`: the id of the device. - `model_name`: the model name generated by MindSpore. @@ -636,7 +566,7 @@ When the graph compilation level is O0 or O1, the Dump directory structure is as - `timestamp`: the time stamp. - `step_id`: user side training step id. -The `acl_damp_{device_id}.json` file in the {path} directory is an intermediate file generated by asynchronous dump during interface calls, and generally does not need to be paid attention to. +The `acl_damp_{device_id}.json` file in the {path} directory is an intermediate file generated by Ascend O2 Mode dump during interface calls, and generally does not need to be paid attention to. The overflow file (file `Opdebug.Node_OpDebug.{task_id}.{stream_id}.{timestamp}`) is only saved when overflow dump is enabled and overflow is detected. @@ -651,9 +581,9 @@ If the length of the tensor file name defined according to the naming rules exce If set `file_format` to `npy`, it can be loaded by `numpy.load`. -If not configured `file_format` or set `file_format` to `bin`, after the training is started, the original data file generated by asynchronous Dump or overflow files generated by overflow detection are in protobuf format. 
They need to be parsed using the data analysis tool that comes with the HiSilicon Run package. For details, please refer to [How to view dump data files](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/developmenttools/devtool/atlasaccuracy_16_0078.html). +If not configured `file_format` or set `file_format` to `bin`, after the training is started, the original data file generated by Ascend O2 Mode Dump or overflow files generated by overflow detection are in protobuf format. They need to be parsed using the data analysis tool that comes with the HiSilicon Run package. For details, please refer to [How to view dump data files](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/developmenttools/devtool/atlasaccuracy_16_0078.html). -The data format on the Device side may be different from the definition in the calculation diagram on the Host side. The bin file data format of the asynchronous dump is the Device side format. If you want to convert to the Host side format, you can refer to [How to convert dump data file format](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/developmenttools/devtool/atlasaccuracy_16_0077.html). +The data format on the Device side may be different from the definition in the calculation diagram on the Host side. The bin file data format of the Ascend O2 Mode dump is the Device side format. If you want to convert to the Host side format, you can refer to [How to convert dump data file format](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/developmenttools/devtool/atlasaccuracy_16_0077.html). If the file is saved in `bin` format, the file naming format is: @@ -665,19 +595,19 @@ Take the Conv2D-op12 of AlexNet network as an example: `Conv2D.Default_network-W If ".", "/", "\", and spaces appear in `op_type` and `op_name`, they will be converted to underscores. 
-The original data file generated by dump can also be parsed by using the data parsing tool DumpParser of MindSpore Insight. Please refer to [DumpParser Introduction](https://gitee.com/mindspore/mindinsight/blob/master/mindinsight/parser/README.md#) for the usage of DumpParser. The data format parsed by MindSpore Insight is exactly the same as that of synchronous dump. +The original data file generated by dump can also be parsed by using the data parsing tool DumpParser of MindSpore Insight. Please refer to [DumpParser Introduction](https://gitee.com/mindspore/mindinsight/blob/master/mindinsight/parser/README.md#) for the usage of DumpParser. The data format parsed by MindSpore Insight is exactly the same as that of Ascend O0/O1 dump. -If setting `file_format` to `npy`, the naming convention of data files generated by asynchronous dump is the same as those of synchronous dump. Please refer to [Introduction to Synchronous Dump Data File](#introduction-to-data-object-directory-and-data-file). The overflow file generated by overflow detection is in the `json` format, and the content analysis of the overflow file can refer to the [Analyzing the Data File of an Overflow/Underflow Operator](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/infacldevg/aclcppdevg/aclcppdevg_000160.html) . +If setting `file_format` to `npy`, the naming convention of data files generated by Ascend O2 Mode dump is the same as those of Ascend O0/O1 dump. Please refer to [Introduction to Ascend O0/O1 Dump Data File](#introduction-to-data-object-directory-and-data-file). The overflow file generated by overflow detection is in the `json` format, and the content analysis of the overflow file can refer to the [Analyzing the Data File of an Overflow/Underflow Operator](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/infacldevg/aclcppdevg/aclcppdevg_000160.html) . -The `saved_data` option only takes effect when `file_format` is "npy". 
If `saved_data` is "statistic" or "full", tensor statistics will be dumped in `statistic.csv`. When `saved_data` is "tensor" or "full", full tensor data will be dumped in `{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy`. The format of the statistic file will be the same as that of synchonous dump. Please refer to [Introduction to Synchronous Dump Data File](#introduction-to-data-object-directory-and-data-file). +The `saved_data` option only takes effect when `file_format` is "npy". If `saved_data` is "statistic" or "full", tensor statistics will be dumped in `statistic.csv`. When `saved_data` is "tensor" or "full", full tensor data will be dumped in `{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`. The format of the statistic file will be the same as that of Ascend O0/O1 dump. Please refer to [Introduction to Ascend O0/O1 Dump Data File](#introduction-to-data-object-directory-and-data-file). -The constant dump file, final execution graph file and execution order file naming rules generated by asynchronous Dump are the same as that of synchronous Dump. You can refer to [Introduction to Synchronous Dump Data File](#introduction-to-data-object-directory-and-data-file). +The constant dump file, final execution graph file and execution order file naming rules generated by Ascend O2 Mode Dump are the same as that of Ascend O0/O1 Dump. You can refer to [Introduction to Ascend O0/O1 Dump Data File](#introduction-to-data-object-directory-and-data-file). ### Data Analysis Sample -Asynchronous dump does not automatically save `.ir` files. To view `.ir` files, you can use MindSpore IR save switch `set_comtext(save_graphs=2)` before executing the use case. After executing the use case, you can view the saved `tracecode_graph_ xxx}` file, which can be opened with `vi`. Please refer to the data analysis example of synchronous dump for the file viewing method. 
When the graph compilation level is O0 or O1, the operator files saved by asynchronous dump are different from the operator names in the graph file. Therefore, asynchronous dump is not recommended for this scenario, and synchronous dump is recommended. When the compilation level of the graph is O2, since the `.ir` file is not the final execution graph, it cannot be guaranteed that the operator names in the operator file correspond one-to-one with those in the `.ir` file. Please refer to the documentation on Ascend Developer Zone [DUMP_GE_GRAPH](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000050.html) , [DUMP_GRAPH_LEVEL](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000051.html) and [DUMP_GRAPH_PATH](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000052.html) to save the final execution graph. +Ascend O2 Mode dump does not automatically save `.ir` files. To view `.ir` files, you can use MindSpore IR save switch `set_context(save_graphs=2)` before executing the use case. After executing the use case, you can view the saved `trace_code_graph_{graph_id}` file, which can be opened with `vi`. Please refer to the data analysis example of Ascend O0/O1 dump for the file viewing method. Since the `.ir` file is not the final execution graph, it cannot be guaranteed that the operator names in the operator file correspond one-to-one with those in the `.ir` file. 
Please refer to the documentation on Ascend Developer Zone [DUMP_GE_GRAPH](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000050.html) , [DUMP_GRAPH_LEVEL](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000051.html) and [DUMP_GRAPH_PATH](https://www.hiascend.com/document/detail/en/canncommercial/601/inferapplicationdev/graphdevg/graphdevg_000052.html) to save the final execution graph. -Through the asynchronous Dump function, the data files generated by the operator asynchronous Dump can be obtained. If `file_format` in the Dump configure file is set to "npy", then the step 1, 2 in the follows steps can be skipped. If `file_format` is not set or set to "bin", the tensor files need to be converted to `.npy` format. +In Ascend O2 mode, dump data files will be generated in the corresponding directory described above. The parsing of these data files can be done through the following three steps: If `file_format` in the Dump configuration file is set to "npy", then steps 1 and 2 below can be skipped. If `file_format` is not set or set to "bin", the tensor files need to be converted to `.npy` format. 1. Parse the dumped file using `msaccucmp.py` provied in the run package, the path where the `msaccucmp.py` file is located may be different on different environments. You can find it through the `find` command: 
+ + ```json + { + "common_dump_settings": { + "op_debug_mode": 0, + "dump_mode": 0, + "path": "/absolute_path", + "net_name": "ResNet50", + "iteration": "0|5-8|100-120", + "saved_data": "tensor", + "input_output": 0, + "kernels": ["Default/Conv-op12"], + "support_device": [0,1,2,3,4,5,6,7], + "statistic_category": ["max", "min", "l2norm"] + }, + "e2e_dump_settings": { + "enable": true, + "trans_flag": true + } + } + ``` + + - `common_dump_settings`: + + - `op_debug_mode`: This attribute is used for operator overflow or operator exception debugging. 0 is the only supported mode in CPU/GPU Dump mode, which means saving all operators or specified operators; + - `dump_mode`: 0: all operator data in the network dumped out; 1: the operator data specified in Dump `"kernels"`; 2: dump target and its contents using [mindspore.set_dump](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.set_dump.html). Specified data dump is supported only when "dump_mode" is set to `0`. + - `path`: The absolute path to Dump saved data. + - `net_name`: The customized net name: "ResNet50". + - `iteration`: Specify the iterations of data required to be dumped, type is string. Use "|" to separate the step data of different intervals to be saved. For example, "0 | 5-8 | 100-120" represents dumping the data of the 1st, 6th to 9th, and 101st to 121st steps. If iteration is set to "all", data of every iteration will be dumped. Specified iteration dump is supported only when "op_debug_mode" is set to `0` or `3`, not supported when "op_debug_mode" is set to `4`. + - `saved_data`: Specify what data is to be dumped, type is string. Use "tensor" to dump complete tensor data, use "statistic" to dump tensor statistics, use "full" to dump both tensor data and statistics. Using "statistic" or "full" on CPU will result in exception. Default setting is "tensor". Statistic dump is only supported when "op_debug_mode" is set to `0`. 
+ - `input_output`: 0: dump input and output of kernel, 1: dump input of kernel, 2: dump output of kernel. Only input of kernel can be saved when "op_debug_mode" is set to `4`. + - `kernels`: This item can be configured in three formats: + 1. List of operator names. Turn on the IR save switch `set_context(save_graphs=2)` and execute the network to obtain the operator name from the generated `trace_code_graph_{graph_id}`IR file. For details, please refer to [Saving IR](https://www.mindspore.cn/docs/en/master/model_train/debug/error_analysis/mindir.html#saving-ir). + Note that whether setting `set_context(save_graphs=2)` may cause the different IDs of the same operator, so when dump specified operators, keep this setting unchanged after obtaining the operator name. Or you can obtain the operator names from the file `ms_output_trace_code_graph_{graph_id}.ir` saved by Dump. Refer to [Ascend O0/O1 Dump Data Object Directory](#introduction-to-data-object-directory-and-data-file). + 2. You can also specify an operator type. When there is no operator scope information or operator id information in the string, the background considers it as an operator type, such as "conv". The matching rule of operator type is: when the operator name contains an operator type string, the matching is considered successful (case insensitive). For example, "conv" can match operators "Conv2D-op1234" and "Conv3D-op1221". + 3. Regular expressions are supported. When the string conforms to the format of "name-regex(xxx)", it would be considered a regular expression. For example, "name-regex(Default/.+)" can match all operators with names starting with "Default/". + - `support_device`: Supported devices, default setting is `[0,1,2,3,4,5,6,7]`. You can specify specific device ids to dump specific device data. This configuration parameter is invalid on the CPU, because there is no concept of device on the CPU, but it is still need to reserve this parameter in the json file. 
+ - `statistic_category`: This attribute is used by users to configure the category of statistical information to be saved, and only takes effect when saving statistical information is enabled (i.e. `saved_data` is set to `statistic` or `full`). The type is a string list, where the optional values of the strings are as follows: + + - "max": represents the maximum value of the elements in tensor; + - "min": represents the minimum value of the elements in tensor; + - "avg": represents the average value of elements in tensor; + - "count": represents the number of the elements in tensor; + - "negative zero count": represents the number of the elements which is less than zero in tensor; + - "positive zero count": represents the number of the elements which is greater than zero in tensor; + - "nan count": represents the number of `Nan` elements in the tensor; + - "negative inf count": represents the number of `-Inf` elements in the tensor; + - "positive inf count": represents the number of `+Inf` elements in the tensor; + - "zero count": represents the number of zero elements in the tensor; + - "md5": represents the MD5 value of the tensor; + - "l2norm": represents L2Norm value of the tensor. + + In CPU/GPU Dump Mode, all statistics are calculated on the host. + This field is optional, with default values of ["max", "min", "l2norm"]. + + - `e2e_dump_settings`: + + - `enable`: In CPU/GPU Dump Mode, this field must be set to `true`. + - `trans_flag`: Enable trans flag. Transform the device data format into NCHW. If it is `True`, the data will be saved in the 4D (NCHW) format on the Host side; if it is `False`, the data format on the Device side will be retained. Default: `True`. + +2. Set Dump environment variable. + + Specify the json configuration file of Dump. + + ```bash + export MINDSPORE_DUMP_CONFIG=${xxx} + ``` + + "xxx" represents the absolute path to the configuration file. 
+ + ```bash + export MINDSPORE_DUMP_CONFIG=/path/to/data_dump.json + ``` + + If the `path` field is not set or set to an empty string in the Dump configuration file, you also need to configure the environment variable `MS_DIAGNOSTIC_DATA_PATH`. + + ```bash + export MS_DIAGNOSTIC_DATA_PATH=${yyy} + ``` + + Then "$MS_DIAGNOSTIC_DATA_PATH/debug_dump" is regarded as `path`. If the `path` field is set in Dump configuration file, the actual value of the field is still the same. + + Note: + + - Set the environment variables before executing the training script. Setting environment variables during training will not take effect. + - Dump environment variables need to be configured before calling `mindspore.communication.init`. + +3. Execute the training script to dump data. + + After the training is started, if the `MINDSPORE_DUMP_CONFIG` environment variable is correctly configured, the content of the configuration file will be read and the operator data will be saved according to the data storage path specified in the Dump configuration. + If you want to dump data in GPU environment, you must use the non-data sink mode (set the `dataset_sink_mode` parameter in `model.train` or `DatasetHelper` to `False`) to ensure that you can get the dump data of each step. + If `model.train` or `DatasetHelper` is not called in the script, the default is non-data sinking mode. Using the Dump function will automatically generate the IR file of the final execution graph. + + You can set `set_context(reserve_class_name_in_scope=False)` in your training script to avoid dump failure because of file name is too long. + +4. Read and parse dump data through `numpy.load`, refer to [Introduction to CPU/GPU Dump Data File](#introduction-to-data-object-directory-and-data-file-2). 
+ +### Introduction to Data Object Directory and Data File + +After starting the training, the data objects saved by the CPU/GPU Dump include the final execution graph (`ms_output_trace_code_graph_{graph_id}.ir` file) and the input and output data of the operators in the graph. The data directory structure is as follows: + +```text +{path}/ + - rank_{rank_id}/ + - .dump_metadata/ + - {net_name}/ + - {graph_id}/ + - {iteration_id}/ + {op_type}.{op_name}.json + statistic.csv + {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy + - constants/ + Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.npy + ... + - graphs/ + ms_output_trace_code_graph_{graph_id}.pb + ms_output_trace_code_graph_{graph_id}.ir + - execution_order/ + ms_execution_order_graph_{graph_id}.csv + ms_global_execution_order_graph_{graph_id}.csv +``` + +- `path`: the absolute path set in the `data_dump.json` configuration file. +- `rank_id`: the id of the logic device. +- `net_name`: the network name set in the `data_dump.json` configuration file. +- `graph_id`: the id of the training graph. +- `iteration_id`: the iteration of the training. +- `op_type`: the type of the operator. +- `op_name`: the name of the operator. +- `task_id`: the id of the task. +- `stream_id`: the id of the stream. +- `timestamp`: the time stamp. +- `input_output_index` : the index of input or output. For example, `output_0` means that the file is the data of the first output Tensor of the operator. +- `slot`: the id of the slot. +- `format`: the format of the data. +- `data_id`: the id of constant data. + +For multi-graph networks, due to the control flow, some subgraphs may not be executed, but Dump only saves the executed nodes, so the {graph_id} in the `.pb` file name in the graphs directory does not necessarily exist in the {graph_id} directory under {net_name}. + +Only when `saved_data` is "statistic" or "full", `statistic.csv` is generated. 
Only when `saved_data` is "tensor" or "full", `{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy` named complete tensor information is generated. + +Only when `save_kernel_args` is `True`, `{op_type}.{op_name}.json` is generated and the params of the corresponding operators are saved. + +The data file generated by the CPU/GPU Dump is a binary file with the suffix `.npy`, and the file naming format is: + +```text +{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy +``` + +The constant data file generated by the CPU/GPU Dump is in the same format as data file, whereas {op_type}, {task_id}, {stream_id}, {input_output_index}, {slot}, {format} are unchanged for all constant data. Note, non-Tensor type will not generate data file. + +```text +Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.npy +``` + +The {iteration_id} directory may also save files starting with `Parameter` (parameters such as weight and bias will be saved as files starting with `Parameter`), while `Parameter` files will not be saved on Ascend. + +User can use Numpy interface `numpy.load` to read the data. + +The statistics file generated by the CPU/GPU dump is named `statistic.csv`. This file stores key statistics for all tensors dumped under the same directory as itself (with the file names `{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`). Each row in `statistic.csv` summarizes a single tensor; each row contains the statistics: Op Type, Op Name, Task ID, Stream ID, Timestamp, IO, Slot, Data Size, Data Type, Shape, and statistics items configured by the user. Note that opening this file with Excel may cause data to be displayed incorrectly. Please use commands like `vi` or `cat`, or use Excel to import csv from text for viewing. 
+ +The suffixes of the final execution graph files generated by CPU/GPU Dump are `.pb` and `.ir` respectively, and the file naming format is: + +```text +ms_output_trace_code_graph_{graph_id}.pb +ms_output_trace_code_graph_{graph_id}.ir +``` + +The files with the suffix `.ir` can be opened and viewed by the `vi` command. + +The suffix of the node execution sequence file generated by the CPU/GPU Dump is `.csv`, and the file naming format is: + +```text +ms_execution_order_graph_{graph_id}.csv +``` + +The suffix of the graph execution history file is `.csv`. The file naming format is: + +```text +ms_global_execution_order_graph_{graph_id}.csv +``` + +This file stores the list of iterations in which the graph was executed. After the graph is compiled, it may be split into multiple sub-graphs. Since sub-graphs share the same graph execution history with root graph, only root graph will generate an execution history file. This function is not supported on Ascend. + +`.dump_metadata` records the original training information(the directory is not available for Ascend backend), and `data_dump.json` saves the dump configuration set by the user. + +### Data Analysis Sample + +In order to better demonstrate the process of using dump to save and analyze data, we provide a set of [complete sample script](https://gitee.com/mindspore/docs/tree/master/docs/sample_code/dump) , you only need to execute `bash dump_sync_dump.sh` for CPU/GPU dump. + +After the graph corresponding to the script is saved to the disk through the Dump function, the final execution graph file `ms_output_trace_code_graph_{graph_id}.ir` will be generated. This file saves the stack information of each operator in the corresponding graph, and records the generation script corresponding to the operator. + +Take [AlexNet script](https://gitee.com/mindspore/docs/blob/master/docs/sample_code/dump/train_alexnet.py) as an example: + +```python +... 
+def conv(in_channels, out_channels, kernel_size, stride=1, padding=0, pad_mode="valid"): + weight = weight_variable() + return nn.Conv2d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + weight_init=weight, has_bias=False, pad_mode=pad_mode) + + +def fc_with_initialize(input_channels, out_channels): + weight = weight_variable() + bias = weight_variable() + return nn.Dense(input_channels, out_channels, weight, bias) + + +def weight_variable(): + return TruncatedNormal(0.02) + + +class AlexNet(nn.Cell): + """ + Alexnet + """ + + def __init__(self, num_classes=10, channel=3): + super(AlexNet, self).__init__() + self.conv1 = conv(channel, 96, 11, stride=4) + self.conv2 = conv(96, 256, 5, pad_mode="same") + self.conv3 = conv(256, 384, 3, pad_mode="same") + self.conv4 = conv(384, 384, 3, pad_mode="same") + self.conv5 = conv(384, 256, 3, pad_mode="same") + self.relu = nn.ReLU() + self.max_pool2d = nn.MaxPool2d(kernel_size=3, stride=2) + self.flatten = nn.Flatten() + self.fc1 = fc_with_initialize(6 * 6 * 256, 4096) + self.fc2 = fc_with_initialize(4096, 4096) + self.fc3 = fc_with_initialize(4096, num_classes) + + def construct(self, x): + """ + The construct function. + + Args: + x(int): Input of the network. + + Returns: + Tensor, the output of the network. + """ + x = self.conv1(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.conv2(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.conv3(x) + x = self.relu(x) + x = self.conv4(x) + x = self.relu(x) + x = self.conv5(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.flatten(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.relu(x) + x = self.fc3(x) + return x +... 
+``` + +If the user wants to view the code at line 175 in the script: + +```python +x = self.conv3(x) +``` + +After executing the network training, you can find multiple operator information corresponding to the line of code from the final execution graph (`ms_output_trace_code_graph_{graph_id}.ir` file). The content of the file corresponding to Conv2D-op12 is as follows: + +```text + %20(equivoutput) = Conv2D(%17, %19) {instance name: conv2d} primitive_attrs: {IsFeatureMapInputList: (0), kernel_size: (3, 3), mode: 1, out_channel: 384, input_names: [ +x, w], pri_format: NC1HWC0, pad: (0, 0, 0, 0), visited: true, pad_mod: same, format: NCHW, pad_list: (1, 1, 1, 1), precision_flag: reduce, groups: 1, output_used_num: +(1), stream_id: 0, stride: (1, 1, 1, 1), group: 1, dilation: (1, 1, 1, 1), output_names: [output], IsFeatureMapOutput: true, ms_function_graph: true} + : (, ) -> () + : (, ) -> () + : full_name_with_scope: (Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op12) + ... + # In file ./tain_alexnet.py(175)/ x = self.conv3(x)/ + ... +``` + +The meanings of the lines in the file content shown above are as follows: + +- The input and output of the operator on the Host side (the first line) and the Device side (the second line, some operators may not exist). It can be seen from the execution graph that the operator has two inputs (left side of the arrow) and one output (right side of the arrow). + + ```text + : (, ) -> () + : (, ) -> () + ``` + +- Operator name. It can be seen from the execution graph that the full name of the operator in the final execution graph is `Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op12`. + + ```text + : (Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op12) + ``` + +- The training script code corresponding to the operator. By searching the training script code to be queried, multiple matching operators can be found. 
+ + ```text + # In file {Absolute path of model_zoo}/official/cv/alexnet/src/alexnet.py(175)/ x = self.conv3(x)/ + ``` + +Through the operator name and input and output information, you can find the only corresponding Tensor data file. For example, if you want to view the dump file corresponding to the first output data of the Conv2D-op12 operator, you can obtain the following information: + +- `operator_name`: `Conv2D-op12`. + +- `input_output_index`: `output.0` indicates that the file is the data of the first output Tensor of the operator. + +- `slot`: 0, this tensor only has one slot. + +Search for the corresponding file name in the data object file directory saved by Dump: +`Conv2d.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.npy`. + +When restoring data, execute: + +```python +import numpy +numpy.load("Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.npy") +``` + +Generate the numpy.array data. + ## Other Description ### Other Dump Function In some special scenarios, the GE dump mode can be applied under development guidance. -To enable GE dump, set the environment variable MINDSPORE_DUMP_CONFIG and ENABLE_MS_GE_DUMP to 1. This mode applies only to the scenario where the compilation level of the graph is O2. The format of the configuration file is the same as that of the asynchronous dump configuration file. The op_debug_mode field cannot be set to 4. Other parameters are the same as those of the asynchronous dump configuration file. +To enable GE dump, set the environment variable MINDSPORE_DUMP_CONFIG and ENABLE_MS_GE_DUMP to 1. This mode applies only to the scenario where the compilation level of the graph is O2. The format of the configuration file is the same as that of the Ascend O2 Dump configuration file. The op_debug_mode field cannot be set to 4. Other parameters are the same as those of the Ascend O2 Dump configuration file. 
```bash export ENABLE_MS_GE_DUMP=1 @@ -759,7 +1020,7 @@ When GE dump is enabled, and the graph compilation level is O2, the Dump directo mapping.csv ``` -Among them, the meanings of `path`, `time`, `device_id`, `model_name`, `model_id`, `iteration_id`, `op_type`, `op_name`, `task_id`, `stream_id`, and `timestamp` are the same as those of asynchronous dump. +Among them, the meanings of `path`, `time`, `device_id`, `model_name`, `model_id`, `iteration_id`, `op_type`, `op_name`, `task_id`, `stream_id`, and `timestamp` are the same as those of Ascend O2 Dump. This method will be abandoned in the future and is not recommended for use. @@ -769,4 +1030,4 @@ This method will be abandoned in the future and is not recommended for use. - Dump only supports saving data with type of bool, int, int8, in16, int32, int64, uint, uint8, uint16, uint32, uint64, float, float16, float32, float64, bfloat16, double, complex64 and complex128. - Complex64 and complex128 only support saving as npy files, not as statistics information. - The Print operator has an input parameter with type of string, which is not a data type supported by Dump. Therefore, when the Print operator is included in the script, there will be an error log, which will not affect the saving data of other types. -- When asynchronous dump is enabled, lite exception dump is not supported by using set_context(ascend_config={"exception_dump": "2"), while full exception dump is supported by using set_context(ascend_config={"exception_dump": "1"). +- When Ascend O2 dump is enabled, lite exception dump is not supported by using set_context(ascend_config={"exception_dump": "2"}), while full exception dump is supported by using set_context(ascend_config={"exception_dump": "1"}). 
diff --git a/docs/mindspore/source_zh_cn/model_train/debug/dump.md b/docs/mindspore/source_zh_cn/model_train/debug/dump.md index e6af0d641e..53f568a202 100644 --- a/docs/mindspore/source_zh_cn/model_train/debug/dump.md +++ b/docs/mindspore/source_zh_cn/model_train/debug/dump.md @@ -6,164 +6,107 @@ - 对于静态图模式,MindSpore提供了Dump功能,用来将模型训练中的图以及算子的输入输出数据保存到磁盘文件。 -- 对于动态图模式,前向过程可以使用Python原生执行能力,用户可以在网络脚本运行过程中查看记录相应的输入输出。jit以及反向过程属于图编译的部分可以使用同步Dump功能,将算子的输入输出数据保存到磁盘文件。 +- 对于动态图模式,前向过程可以使用Python原生执行能力,用户可以在网络脚本运行过程中查看记录相应的输入输出。jit以及反向过程属于图编译的部分可以使用Ascend O0/O1功能,将算子的输入输出数据保存到磁盘文件。 -MindSpore提供了两种Dump模式: +MindSpore在不同模式下支持的Dump功能不完全相同,需要的配置文件和以及生成的数据格式也不同,因此需要根据运行的模式选择对应的Dump配置: -- 同步Dump:在算子下发后,Host侧执行流同步,发起对Device侧数据的拷贝,将其保存到文件中。 -- 异步Dump:专为Ascend开发,在算子执行完成后,Device侧主动发起数据落盘。 +- [Ascend下O0/O1模式Dump](#ascend下o0o1模式dump) +- [Ascend下O2模式Dump](#ascend下o2模式dump) +- [CPU/GPU模式Dump](#cpugpu-dump) -> 不同模式需要不同的配置文件,生成的数据格式也不同: +> - Ascend下O0/O1/O2模式的区别请见[set_context的参数jit_level](https://www.mindspore.cn/docs/zh-CN/r2.3.1/api_python/mindspore/mindspore.set_context.html)。 > -> - GPU/CPU后端和编译等级为O0/O1下的Ascend后端,推荐使用[同步Dump](#同步dump),具体参考[同步dump操作步骤](https://www.mindspore.cn/docs/zh-CN/master/model_train/debug/dump.html#%E6%93%8D%E4%BD%9C%E6%AD%A5%E9%AA%A4);编译等级为O2的Ascend后端推荐使用[异步Dump](#异步dump),具体参考[异步dump操作步骤](https://www.mindspore.cn/docs/zh-CN/master/model_train/debug/dump.html#%E6%93%8D%E4%BD%9C%E6%AD%A5%E9%AA%A4-1)。 -> - Dump暂不支持异构训练,如果在异构训练场景启用Dump,生成的Dump数据对象目录可能不符合预期的目录结构。 +> - CPU/GPU模式支持dump常量数据,Ascend O0/O1/O2模式不支持Dump常量数据。 +> +> - Ascend O2模式支持dump数据格式`.npy`和`.bin`文件,其他模式只支持dump数据格式`.npy`文件。 +> +> - Dump暂不支持异构训练,即不支持CPU/Ascend混合训练或GPU/Ascend混合训练。 -Ascend后端同步Dump支持情况如下表(GPU/CPU后端参考 `O0/O1` 列)。 +MindSpore在不同模式下支持的Dump功能如下表所示: - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
同步Dump 功能O0/O1O2Ascend O0/Ascend O1Ascend O2CPU/GPU
全量dump 整网数据dump 支持不支持
部分数据dump统计信息dump支持host和device模式1不支持
数据采样dump支持2不支持
溢出dumpdump溢出算子支持2不支持
指定条件dump指定算子名称支持不支持
指定迭代支持不支持
指定device支持不支持
指定file_format不涉及不支持
set_dump支持2不支持
辅助信息dump图ir dump支持不支持
执行序dump支持不支持
- -> 1. 在统计信息方面,device计算速度较host快(目前仅支持Ascend后端),但host统计指标比device多,详见`statistic_category`选项。 -> 2. 仅支持Ascend后端。 - -Ascend后端异步Dump支持情况如下表(GPU/CPU后端不支持)。 - - - - - - - - - - - + - + + - + + - + + + + + - + + + - + + - +
异步Dump功能O0/O1O2
全量dump整网数据dump 支持,但无full_name信息 支持
部分数据dump 统计信息dump支持host和device模式1 仅支持host模式仅支持host模式CPU不支持, GPU仅支持host模式
数据采样dump支持 不支持 不支持
溢出dump dump溢出算子不支持 支持支持不支持
指定条件dump 指定算子名称不支持支持支持 支持
指定迭代 支持 支持支持
指定device 支持 支持支持
指定file_format不涉及 支持支持不涉及
set_dump支持 不支持 不支持
辅助信息dump 图ir dump支持 不支持不支持支持
执行序dump支持 不支持不支持支持
-## 同步Dump +> 在统计信息方面,device计算速度较host快(目前仅支持Ascend后端),但host统计指标比device多,详见`statistic_category`选项。 + +## Ascend下O0/O1模式Dump ### 操作步骤 @@ -192,43 +135,49 @@ Ascend后端异步Dump支持情况如下表(GPU/CPU后端不支持)。 } ``` - - `op_debug_mode`:该属性用于算子溢出或算子异常调试,设置成0,表示保存所有算子或指定算子;设置成3,表示只保存溢出算子;设置成4,表示只保存异常算子的输入。在Dump数据的时候请设置成0,若设置成其他值,则只会Dump溢出算子或异常算子的数据。默认值:0。 - - `dump_mode`:设置成0,表示Dump出该网络中的所有算子数据;设置成1,表示Dump`"kernels"`里面指定的算子数据或算子类型数据;设置成2,表示使用[mindspore.set_dump](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.set_dump.html) Dump指定对象。仅在op_debug_mode设置为0时支持指定算子dump。 - - `path`:Dump保存数据的绝对路径。 - - `net_name`:自定义的网络名称,例如:"ResNet50"。 - - `iteration`:指定需要Dump数据的迭代。类型为str,用“|”分离要保存的不同区间的step的数据。如"0|5-8|100-120"表示Dump第1个,第6个到第9个, 第101个到第121个step的数据。指定“all”,表示Dump所有迭代的数据。仅在op_debug_mode设置为0或3时支持保存指定迭代,op_debug_mode设置为4时不支持指定迭代。 - - `saved_data`: 指定Dump的数据。类型为str,取值成"tensor",表示Dump出完整张量数据;取值成"statistic",表示只Dump张量的统计信息;取值"full"代表两种都要。同步Dump统计信息现只支持GPU场景和Ascend场景,CPU场景若选"statistic"或"full"便会错误退出。默认取值为"tensor"。保存统计信息仅支持op_debug_mode设置为0的场景。 - - `input_output`:设置成0,表示Dump出算子的输入和算子的输出;设置成1,表示Dump出算子的输入;设置成2,表示Dump出算子的输出。在op_debug_mode设置为4时,只能保存算子输入。 - - `kernels`:该项可以配置三种格式: - 1. 算子的名称列表。开启IR保存开关`set_context(save_graphs=2)`并执行用例,从生成的IR文件`trace_code_graph_{graph_id}`中获取算子名称。详细说明可以参照教程:[如何保存IR](https://www.mindspore.cn/docs/zh-CN/master/model_train/debug/error_analysis/mindir.html#如何保存ir)。 - 需要注意的是,是否设置`set_context(save_graphs=2)`可能会导致同一个算子的id不同,所以在Dump指定算子时要在获取算子名称之后保持这一项设置不变。或者也可以在Dump保存的`ms_output_trace_code_graph_{graph_id}.ir`文件中获取算子名称,参考[同步Dump数据对象目录](https://www.mindspore.cn/docs/zh-CN/master/model_train/debug/dump.html#%E6%95%B0%E6%8D%AE%E5%AF%B9%E8%B1%A1%E7%9B%AE%E5%BD%95%E5%92%8C%E6%95%B0%E6%8D%AE%E6%96%87%E4%BB%B6%E4%BB%8B%E7%BB%8D)。 - 2. 还可以指定算子类型。当字符串中不带算子scope信息和算子id信息时,后台则认为其为算子类型,例如:"conv"。算子类型的匹配规则为:当发现算子名中包含算子类型字符串时,则认为匹配成功(不区分大小写),例如:"conv" 可以匹配算子 "Conv2D-op1234"、"Conv3D-op1221"。 - 3. 
算子名称的正则表达式。当字符串符合"name-regex(xxx)"格式时,后台则会将其作为正则表达式。例如,"name-regex(Default/.+)"可匹配算子名称以"Default/"开头的所有算子。 - - `support_device`:支持的设备,默认设置成0到7即可;在分布式训练场景下,需要dump个别设备上的数据,可以只在`support_device`中指定需要Dump的设备Id。该配置参数在CPU上无效,因为CPU下没有device这个概念,但是在json格式的配置文件中仍需保留该字段。 - - `statistic_category`: 该属性用于用户配置要保存的统计信息类别,仅在开启了保存统计信息(即`saved_data`设置为"statistic"或"full")时生效。类型为字符串列表,其中的字符串可选值如下: - - - "max": 表示Tensor中元素的最大值,支持在device统计和在host统计; - - "min": 表示Tensor中元素的最小值,支持在device统计和在host统计; - - "avg": 表示Tensor中元素的平均值,支持在device统计和在host统计; - - "count": 表示Tensor中元素的个数; - - "negative zero count": 表示Tensor中小于0的元素个数; - - "positive zero count": 表示Tensor中大于0的元素个数; - - "nan count": 表示Tensor中元素的`Nan`的个数; - - "negative inf count": 表示Tensor中`-Inf`元素的个数; - - "positive inf count": 表示Tensor中`+Inf`元素的个数; - - "zero count": 表示Tensor中元素`0`的个数; - - "md5": 表示Tensor的MD5值; - - "l2norm": 表示Tensor的L2Norm值,支持在device统计和在host统计。 - - 以上除了标记了支持device统计的,其它都仅支持在host统计。 - 该字段为可选,默认值为["max", "min", "l2norm"]。 - - - `overflow_number`:指定溢出dump的数据个数。该字段仅在`op_debug_mode`设置为3,只保存溢出算子时需要配置,可控制溢出数据按时间序dump,到指定数值后溢出数据不再dump。默认值为0,表示dump全部溢出数据。 - - `enable`:设置成true,表示开启同步Dump;设置成false时,在Ascend上会使用异步Dump,在GPU上仍然使用同步Dump。 - - `trans_flag`:开启格式转换,将设备上的数据格式转换成NCHW格式。若为`True`,则数据会以Host侧的4D格式(NCHW)格式保存;若为`False`,则保留Device侧的数据格式。该配置参数在CPU上无效,因为CPU上没有format转换。默认值:true。 - - `stat_calc_mode`:选择统计信息计算后端,可选"host"和"device"。选择"device"后可以使能device计算统计信息,当前只在Ascend生效,只支持`min/max/avg/l2norm`统计量。 - - `sample_mode`:设置成0,表示不开启切片dump功能;设置成1时,在图编译等级为O0或O1的情况下开启切片dump功能。仅在op_debug_mode设置为0时生效,其它场景不会开启切片dump功能。 - - `sample_num`:用于控制切片dump中切片的大小。默认值为100。 + - `common_dump_settings`: + + - `op_debug_mode`:该属性用于算子溢出或算子异常调试,设置成0,表示保存所有算子或指定算子;设置成3,表示只保存溢出算子;设置成4,表示只保存异常算子的输入。在Dump数据的时候请设置成0,若设置成其他值,则只会Dump溢出算子或异常算子的数据。默认值:0。 + - `dump_mode`:设置成0,表示Dump出该网络中的所有算子数据;设置成1,表示Dump`"kernels"`里面指定的算子数据或算子类型数据;设置成2,表示使用[mindspore.set_dump](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.set_dump.html) 
Dump指定对象。仅在op_debug_mode设置为0时支持指定算子dump。 + - `path`:Dump保存数据的绝对路径。 + - `net_name`:自定义的网络名称,例如:"ResNet50"。 + - `iteration`:指定需要Dump数据的迭代。类型为str,用“|”分离要保存的不同区间的step的数据。如"0|5-8|100-120"表示Dump第1个,第6个到第9个, 第101个到第121个step的数据。指定“all”,表示Dump所有迭代的数据。仅在op_debug_mode设置为0或3时支持保存指定迭代,op_debug_mode设置为4时不支持指定迭代。 + - `saved_data`: 指定Dump的数据。类型为str,取值成"tensor",表示Dump出完整张量数据;取值成"statistic",表示只Dump张量的统计信息;取值"full"代表两种都要。默认取值为"tensor"。保存统计信息仅在op_debug_mode设置为0时生效。 + - `input_output`:设置成0,表示Dump出算子的输入和算子的输出;设置成1,表示Dump出算子的输入;设置成2,表示Dump出算子的输出。在op_debug_mode设置为4时,只能保存算子输入。 + - `kernels`:该项可以配置三种格式: + 1. 算子的名称列表。开启IR保存开关`set_context(save_graphs=2)`并执行用例,从生成的IR文件`trace_code_graph_{graph_id}`中获取算子名称。详细说明可以参照教程:[如何保存IR](https://www.mindspore.cn/docs/zh-CN/master/model_train/debug/error_analysis/mindir.html#如何保存ir)。 + 需要注意的是,是否设置`set_context(save_graphs=2)`可能会导致同一个算子的id不同,所以在Dump指定算子时要在获取算子名称之后保持这一项设置不变。或者也可以在Dump保存的`ms_output_trace_code_graph_{graph_id}.ir`文件中获取算子名称,参考[Ascend O0/O1模式下Dump数据对象目录](#数据对象目录和数据文件介绍)。 + 2. 还可以指定算子类型。当字符串中不带算子scope信息和算子id信息时,后台则认为其为算子类型,例如:"conv"。算子类型的匹配规则为:当发现算子名中包含算子类型字符串时,则认为匹配成功(不区分大小写),例如:"conv" 可以匹配算子 "Conv2D-op1234"、"Conv3D-op1221"。 + 3. 
算子名称的正则表达式。当字符串符合"name-regex(xxx)"格式时,后台则会将其作为正则表达式。例如,"name-regex(Default/.+)"可匹配算子名称以"Default/"开头的所有算子。 + - `support_device`:支持的设备,默认设置成0到7即可;在分布式训练场景下,需要dump个别设备上的数据,可以只在`support_device`中指定需要Dump的设备Id。该配置参数在CPU上无效,因为CPU下没有device这个概念,但是在json格式的配置文件中仍需保留该字段。 + - `statistic_category`: 该属性用于用户配置要保存的统计信息类别,仅在开启了保存统计信息(即`saved_data`设置为"statistic"或"full")时生效。类型为字符串列表,其中的字符串可选值如下: + + - "max": 表示Tensor中元素的最大值,支持在device统计和在host统计; + - "min": 表示Tensor中元素的最小值,支持在device统计和在host统计; + - "avg": 表示Tensor中元素的平均值,支持在device统计和在host统计; + - "count": 表示Tensor中元素的个数; + - "negative zero count": 表示Tensor中小于0的元素个数; + - "positive zero count": 表示Tensor中大于0的元素个数; + - "nan count": 表示Tensor中元素的`Nan`的个数; + - "negative inf count": 表示Tensor中`-Inf`元素的个数; + - "positive inf count": 表示Tensor中`+Inf`元素的个数; + - "zero count": 表示Tensor中元素`0`的个数; + - "md5": 表示Tensor的MD5值; + - "l2norm": 表示Tensor的L2Norm值,支持在device统计和在host统计。 + + 以上除了标记了支持device统计的,其它都仅支持在host统计。 + 该字段为可选,默认值为["max", "min", "l2norm"]。 + + - `overflow_number`:指定溢出dump的数据个数。该字段仅在`op_debug_mode`设置为3,只保存溢出算子时需要配置,可控制溢出数据按时间序dump,到指定数值后溢出数据不再dump。默认值为0,表示dump全部溢出数据。 + + - `e2e_dump_settings`: + + - `enable`:设置成true,表示开启同步Dump;设置成false时,采用异步Dump。不设置该字段时默认值为false,开启异步Dump。两者的区别是异步Dump对原本代码执行过程的影响更小。 + - `trans_flag`:开启格式转换,将设备上的数据格式转换成NCHW格式。若为`True`,则数据会以Host侧的4D格式(NCHW)格式保存;若为`False`,则保留Device侧的数据格式。该配置参数在CPU上无效,因为CPU上没有format转换。默认值:true。 + - `stat_calc_mode`:选择统计信息计算后端,可选"host"和"device"。选择"device"后可以使能device计算统计信息,当前只在Ascend生效,只支持`min/max/avg/l2norm`统计量。 + - `sample_mode`(可选):设置成0,表示不开启切片dump功能;设置成1时,在图编译等级为O0或O1的情况下开启切片dump功能。仅在op_debug_mode设置为0时生效,其它场景不会开启切片dump功能。 + - `sample_num`(可选):用于控制切片dump中切片的大小。默认值为100。 + - `save_kernel_args`(可选): 设置成true时,会保存算子的初始化信息。 2. 设置Dump环境变量。 @@ -260,16 +209,15 @@ Ascend后端异步Dump支持情况如下表(GPU/CPU后端不支持)。 3. 
启动网络训练脚本。 训练启动后,若正确配置了`MINDSPORE_DUMP_CONFIG`环境变量,则会读取配置文件的内容,并按照Dump配置中指定的数据保存路径保存算子数据。 - 同步模式下,GPU环境如果要Dump数据,必须采用非数据下沉模式(设置`model.train`或`DatasetHelper`中的`dataset_sink_mode`参数为`False`),以保证可以获取每个step的Dump数据。 若脚本中都不调用`model.train`或`DatasetHelper`,则默认为非数据下沉模式。使用Dump功能将自动生成最终执行图的IR文件。 可以在训练脚本中设置`set_context(reserve_class_name_in_scope=False)`,避免Dump文件名称过长导致Dump数据文件生成失败。 -4. 通过`numpy.load`读取和解析同步Dump数据,参考[同步Dump数据文件介绍](#数据对象目录和数据文件介绍)。 +4. 通过`numpy.load`读取和解析Dump数据,参考[Ascend O0/O1模式下Dump数据文件介绍](#数据对象目录和数据文件介绍)。 ### 数据对象目录和数据文件介绍 -启动训练后,同步Dump保存的数据对象包括最终执行图(`ms_output_trace_code_graph_{graph_id}.ir`文件)以及图中算子的输入和输出数据,数据目录结构如下所示: +启动训练后,Ascend O0/O1模式下Dump保存的数据对象包括最终执行图(`ms_output_trace_code_graph_{graph_id}.ir`文件)以及图中算子的输入和输出数据,数据目录结构如下所示: ```text {path}/ @@ -312,27 +260,28 @@ Ascend后端异步Dump支持情况如下表(GPU/CPU后端不支持)。 只当`saved_data`为"statistic"或者"full"时,才会生成`statistic.csv`,当`saved_data`为"tensor"或者"full"时,才会生成`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy`命名的完整张量信息。 -只当`save_kernel_args`为`True`时,才会生成`{op_type}.{op_name}.json`,保存算子的初始化信息。 - -同步Dump生成的数据文件是后缀名为`.npy`的文件,文件命名格式为: +只当`save_kernel_args`为`True`时,才会生成`{op_type}.{op_name}.json`,保存算子的初始化信息。该json文件内部格式为算子各初始化参数的对应值,以`Matmul`算子为例, json信息如下: -```text -{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy +```json +{ + "transpose_a": "False", + "transpose_b": "False" +} ``` -同步Dump生成的常量数据文件与其他数据文件格式相同,而所有常量数据的{op_type},{task_id},{stream_id},{input_output_index},{slot},{format}不变。注意,非Tensor类型数据不会被生成数据文件。该功能不支持Ascend场景。 +代表`Matmul`算子的两个初始化参数`transpose_a`和`transpose_b`的值均为`false`。 + +Ascend O0/O1模式下Dump生成的数据文件是后缀名为`.npy`的文件,文件命名格式为: ```text -Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.{dtype}.npy +{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy ``` -{iteration_id}目录下也可能会保存Parameter开头的文件(weight, bias等参数会保存成Parameter开头的文件),Ascend上不会保存Parameter文件。 - 
可以用Numpy的`numpy.load`接口读取数据。 -同步Dump生成的统计数据文件名为`statistic.csv`,此文件存有相同目录下所有落盘张量(文件名为`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy`)的统计信息。每个张量一行,每行有张量的 Op Type,Op Name,Task ID,Stream ID,Timestamp,IO,Slot,Data Size,Data Type,Shape以及用户配置的统计信息项。注意,如果用Excel来打开此文件,数据可能无法正确显示。请用`vi`、`cat`等命令查看,或者使用Excel自文本导入csv查看。 +Ascend O0/O1模式下生成的统计数据文件名为`statistic.csv`,此文件存有相同目录下所有落盘张量(文件名为`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`)的统计信息。每个张量一行,每行有张量的 Op Type,Op Name,Task ID,Stream ID,Timestamp,IO,Slot,Data Size,Data Type,Shape以及用户配置的统计信息项。注意,如果用Excel来打开此文件,数据可能无法正确显示。请用`vi`、`cat`等命令查看,或者使用Excel自文本导入csv查看。 -同步Dump生成的最终执行图文件后缀名分别为`.pb`和`.ir`,文件命名格式为: +Ascend O0/O1模式下生成的最终执行图文件后缀名分别为`.pb`和`.ir`,文件命名格式为: ```text ms_output_trace_code_graph_{graph_id}.pb @@ -341,25 +290,15 @@ ms_output_trace_code_graph_{graph_id}.ir 其中以`.ir`为后缀的文件可以通过`vi`命令打开查看。 -同步Dump生成的节点执行序文件后缀名为`.csv`,文件命名格式为: +Ascend O0/O1模式下Dump生成的节点执行序文件后缀名为`.csv`,文件命名格式为: ```text ms_execution_order_graph_{graph_id}.csv ``` -图执行历史文件的后缀为`.csv`,文件名格式为: - -```text -ms_global_execution_order_graph_{graph_id}.csv -``` - -此文件记录该图在训练过程中的执行轮次历史。图编译过程中,一张根图可能产生多张子图,但子图与根图具有相同的执行轮次历史。故与图执行序文件不同,此处仅保存根图的图执行历史文件。该功能不支持Ascend。 - -`.dump_metadata`记录了训练的原信息(Ascend后端无此目录),其中`data_dump.json`保存了用户设置的dump配置。 - ### 数据分析样例 -为了更好地展示使用Dump来保存数据并分析数据的流程,我们提供了一套[完整样例脚本](https://gitee.com/mindspore/docs/tree/master/docs/sample_code/dump) ,同步Dump只需要执行 `bash run_sync_dump.sh`。 +为了更好地展示使用Dump来保存数据并分析数据的流程,我们提供了一套[完整样例脚本](https://gitee.com/mindspore/docs/tree/master/docs/sample_code/dump) ,只需要执行 `bash run_sync_dump.sh`。 在通过Dump功能将脚本对应的图保存到磁盘上后,会产生最终执行图文件`ms_output_trace_code_graph_{graph_id}.ir`。该文件中保存了对应的图中每个算子的堆栈信息,记录了算子对应的生成脚本。 @@ -497,9 +436,7 @@ numpy.load("Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.float 生成numpy.array数据。 -## 异步Dump - -MindSpore通过异步Dump提供了Ascend平台上大型网络的调试能力。 +## Ascend下O2模式Dump ### 操作步骤 @@ -526,36 +463,38 @@ 
MindSpore通过异步Dump提供了Ascend平台上大型网络的调试能力。 } ``` - - `op_debug_mode`:该属性用于算子溢出调试,设置成0,表示不开启溢出;设置成3,表示开启溢出检测功能;设置成4,表示开启轻量异常Dump功能。在Dump数据的时候请设置成0,若设置成其他值,则只会Dump溢出算子或异常算子的数据。 - - `dump_mode`:设置成0,表示Dump出该网络中的所有算子数据;设置成1,表示Dump`"kernels"`里面指定的算子数据或算子类型数据。仅在op_debug_mode设置为0时支持指定算子dump。op_debug_mode设置为非0值时,此字段的设置失效,Dump只会保存溢出算子的数据或者异常算子的数据。 - - `path`:Dump保存数据的绝对路径。在图编译等级为O0时,MindSpore会在path目录下新建每个step的子目录。 - - `net_name`:自定义的网络名称,例如:"ResNet50"。 - - `iteration`:指定需要Dump的迭代。类型为str,用“|”分离要保存的不同区间的step的数据。如"0|5-8|100-120"表示Dump第1个,第6个到第9个, 第101个到第121个step的数据。指定“all”,表示Dump所有迭代的数据。仅在op_debug_mode设置为0时支持保存指定迭代,op_debug_mode设置为3或4时不支持指定迭代。 - - `saved_data`: 指定Dump的数据。类型为str,取值成"tensor",表示Dump出完整张量数据;取值成"statistic",表示只Dump张量的统计信息;取值"full"代表两种都要。异步Dump统计信息只有在`file_format`设置为`npy`时可以成功,若在`file_format`设置为`bin`时选"statistic"或"full"便会错误退出。保存统计信息仅支持op_debug_mode设置为0的场景。默认取值为"tensor"。 - - `input_output`:设置成0,表示Dump出算子的输入和算子的输出;设置成1,表示Dump出算子的输入;设置成2,表示Dump出算子的输出。 - - `kernels`:该项可以配置两种格式: - 1. 算子的名称列表。指定算子需要先设置保存图文件的环境变量来保存图,再从保存的图文件中获取算子名称。保存图文件的环境变量请参考昇腾社区文档[DUMP_GE_GRAPH](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0011.html) 、[DUMP_GRAPH_LEVEL](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0012.html) 和[DUMP_GRAPH_PATH](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0013.html) 。 - 2. 
算子名称的正则表达式。当字符串符合"name-regex(xxx)"格式时,后台则会将其作为正则表达式。例如,"name-regex(Default/.+)"可匹配算子名称以"Default/"开头的所有算子。 - - `support_device`:支持的设备,默认设置成0到7即可;在分布式训练场景下,需要dump个别设备上的数据,可以只在`support_device`中指定需要Dump的设备Id。 - - `statistic_category`: 该属性用于用户配置要保存的统计信息类别,仅在开启了保存统计信息(即`saved_data`设置为"statistic"或"full")时生效。类型为字符串列表,其中的字符串可选值如下: - - - "max": 表示Tensor中元素的最大值; - - "min": 表示Tensor中元素的最小值; - - "avg": 表示Tensor中元素的平均值; - - "count": 表示Tensor中元素的个数; - - "negative zero count": 表示Tensor中小于0的元素个数; - - "positive zero count": 表示Tensor中大于0的元素个数; - - "nan count": 表示Tensor中元素的`Nan`的个数; - - "negative inf count": 表示Tensor中`-Inf`元素的个数; - - "positive inf count": 表示Tensor中`+Inf`元素的个数; - - "zero count": 表示Tensor中元素`0`的个数; - - "md5": 表示Tensor的MD5值; - - "l2norm": 表示Tensor的L2Norm值。 - - 该字段为可选,默认值为["max", "min", "l2norm"]。 - - - `file_format`: dump数据的文件类型,只支持`npy`和`bin`两种取值。设置成`npy`,则dump出的算子张量数据将为host侧格式的npy文件;设置成`bin`,则dump出的数据将为device侧格式的protobuf文件,需要借助转换工具进行处理,详细步骤请参考[异步Dump数据分析样例](#数据分析样例-1)。默认取值为`bin`。 - - `overflow_number`:指定溢出dump的数据个数。该字段仅在`op_debug_mode`设置为3开启溢出检测功能,且`file_format`设置为`npy`时需要配置,可控制溢出数据按时间序dump,到指定数值后溢出数据不再dump。默认值为0,表示dump全部溢出数据。 + - `common_dump_settings`: + + - `op_debug_mode`:该属性用于算子溢出调试,设置成0,表示不开启溢出;设置成3,表示开启溢出检测功能;设置成4,表示开启轻量异常Dump功能。在Dump数据的时候请设置成0,若设置成其他值,则只会Dump溢出算子或异常算子的数据。 + - `dump_mode`:设置成0,表示Dump出该网络中的所有算子数据;设置成1,表示Dump`"kernels"`里面指定的算子数据或算子类型数据。仅在op_debug_mode设置为0时支持指定算子dump。op_debug_mode设置为非0值时,此字段的设置失效,Dump只会保存溢出算子的数据或者异常算子的数据。 + - `path`:Dump保存数据的绝对路径。 + - `net_name`:自定义的网络名称,例如:"ResNet50"。 + - `iteration`:指定需要Dump的迭代。类型为str,用“|”分离要保存的不同区间的step的数据。如"0|5-8|100-120"表示Dump第1个,第6个到第9个, 第101个到第121个step的数据。指定“all”,表示Dump所有迭代的数据。仅在op_debug_mode设置为0时支持保存指定迭代,op_debug_mode设置为3或4时不支持指定迭代。 + - `saved_data`: 指定Dump的数据。类型为str,取值成"tensor",表示Dump出完整张量数据;取值成"statistic",表示只Dump张量的统计信息;取值"full"代表两种都要。Ascend O2模式下Dump统计信息只有在`file_format`设置为`npy`时可以成功,若在`file_format`设置为`bin`时选"statistic"或"full"便会错误退出。保存统计信息仅支持op_debug_mode设置为0的场景。默认取值为"tensor"。 + - 
`input_output`:设置成0,表示Dump出算子的输入和算子的输出;设置成1,表示Dump出算子的输入;设置成2,表示Dump出算子的输出。 + - `kernels`:该项可以配置两种格式: + 1. 算子的名称列表。指定算子需要先设置保存图文件的环境变量来保存图,再从保存的图文件中获取算子名称。保存图文件的环境变量请参考昇腾社区文档[DUMP_GE_GRAPH](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0011.html) 、[DUMP_GRAPH_LEVEL](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0012.html) 和[DUMP_GRAPH_PATH](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0013.html) 。 + 2. 算子名称的正则表达式。当字符串符合"name-regex(xxx)"格式时,后台则会将其作为正则表达式。例如,"name-regex(Default/.+)"可匹配算子名称以"Default/"开头的所有算子。 + - `support_device`:支持的设备,默认设置成0到7即可;在分布式训练场景下,需要dump个别设备上的数据,可以只在`support_device`中指定需要Dump的设备Id。 + - `statistic_category`: 该属性用于用户配置要保存的统计信息类别,仅在开启了保存统计信息(即`saved_data`设置为"statistic"或"full")时生效。类型为字符串列表,其中的字符串可选值如下: + + - "max": 表示Tensor中元素的最大值; + - "min": 表示Tensor中元素的最小值; + - "avg": 表示Tensor中元素的平均值; + - "count": 表示Tensor中元素的个数; + - "negative zero count": 表示Tensor中小于0的元素个数; + - "positive zero count": 表示Tensor中大于0的元素个数; + - "nan count": 表示Tensor中元素的`Nan`的个数; + - "negative inf count": 表示Tensor中`-Inf`元素的个数; + - "positive inf count": 表示Tensor中`+Inf`元素的个数; + - "zero count": 表示Tensor中元素`0`的个数; + - "md5": 表示Tensor的MD5值; + - "l2norm": 表示Tensor的L2Norm值。 + + 该字段为可选,默认值为["max", "min", "l2norm"]。 + + - `file_format`: dump数据的文件类型,只支持`npy`和`bin`两种取值。设置成`npy`,则dump出的算子张量数据将为host侧格式的npy文件;设置成`bin`,则dump出的数据将为device侧格式的protobuf文件,需要借助转换工具进行处理,详细步骤请参考[Ascend O2模式下数据分析样例](#数据分析样例-1)。默认取值为`bin`。 + - `overflow_number`:指定溢出dump的数据个数。该字段仅在`op_debug_mode`设置为3开启溢出检测功能,且`file_format`设置为`npy`时需要配置,可控制溢出数据按时间序dump,到指定数值后溢出数据不再dump。默认值为0,表示dump全部溢出数据。 2. 设置数据Dump的环境变量。 @@ -578,14 +517,14 @@ MindSpore通过异步Dump提供了Ascend平台上大型网络的调试能力。 可以在训练脚本中设置`set_context(reserve_class_name_in_scope=False)`,避免Dump文件名称过长导致Dump数据文件生成失败。 -4. 参考[异步Dump数据分析样例](#数据分析样例-1)解析Dump数据文件。 +4. 
参考[Ascend O2模式下数据分析样例](#数据分析样例-1)解析Dump数据文件。 > - 若需要dump全量或部分算子,则可以修改json配置文件中的`dump_mode`选项为0或1。 > - 由于Dump速度较慢,在大模型场景下开启Dump会延长不同卡之间的通信间隔时间,从而导致通信算子超时。可以通过调整通信算子的超时时间来解决此问题。对于Ascend后端,可以设置HCCL_EXEC_TIMEOUT环境变量,具体设置方法请参考[昇腾CANN文档](https://www.hiascend.com/document/detail/zh/canncommercial/80RC1/apiref/envvar/envref_07_0072.html)。 ### 数据对象目录和数据文件介绍 -图编译等级不为O0或O1时,Dump目录结构如下所示,主要特征为存在{step_id}目录,代表用户侧的训练轮次: +Ascend O2模式下Dump目录结构如下所示,主要特征为存在{step_id}目录,代表用户侧的训练轮次: ```text {path}/ @@ -602,27 +541,6 @@ MindSpore通过异步Dump提供了Ascend平台上大型网络的调试能力。 acl_dump_{device_id}.json ``` -图编译等级为O0或O1时,Dump目录结构如下所示,此种场景下aclop和aclnn算子的Dump数据会保存于{device_id}目录,"ReduceSum"类通信算子的Dump数据会保存在{iteration_id}目录: - -```text -{path}/ - - {step_id}/ - - {time}/ - - {device_id}/ - - {model_name}/ - - {model_id}/ - - {iteration_id}/ - statistic.csv - {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp} //aclop 算子 - {op_name}.{op_type}.{task_id}.{stream_id}.{timestamp} //aclnn 算子 - mapping.csv - statistic.csv - {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp} //aclop 算子 - {op_name}.{op_type}.{task_id}.{stream_id}.{timestamp} //aclnn 算子 - mapping.csv - acl_dump_{device_id}.json -``` - - `path`:`data_dump.json`配置文件中设置的绝对路径。 - `time`: dump目录的创建时间。 - `device_id`: 卡号。 @@ -636,7 +554,7 @@ MindSpore通过异步Dump提供了Ascend平台上大型网络的调试能力。 - `timestamp`:时间戳。 - `step_id`: 用户侧的训练轮次。 -在{path}目录的`acl_dump_{device_id}.json`文件,是异步Dump在接口调用过程中生成的中间文件,一般情况下无需关注。 +在{path}目录的`acl_dump_{device_id}.json`文件,是Ascend O2模式下Dump在接口调用过程中生成的中间文件,一般情况下无需关注。 其中,溢出文件(`Opdebug.Node_OpDebug.{task_id}.{stream_id}.{timestamp}`文件)只会在开启溢出Dump且检测到溢出时保存。 @@ -651,11 +569,11 @@ Opdebug.Node_OpDebug.{task_id}.{stream_id}.{timestamp}.output.0.json 若配置文件中`file_format`值设置为`npy`,可以直接用`numpy.load`加载。 
-若未配置`file_format`值或`file_format`值为`bin`,启动训练后,异步Dump生成的原始数据文件或溢出检测生成的溢出文件是protobuf格式的文件,需要用到海思Run包中自带的数据解析工具进行解析,详见[如何查看dump数据文件](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devaids/auxiliarydevtool/atlasaccuracy_16_0059.html)。 +若未配置`file_format`值或`file_format`值为`bin`,启动训练后,Ascend O2模式下Dump生成的原始数据文件或溢出检测生成的溢出文件是protobuf格式的文件,需要用到海思Run包中自带的数据解析工具进行解析,详见[如何查看dump数据文件](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devaids/auxiliarydevtool/atlasaccuracy_16_0059.html)。 -数据在Device侧的格式可能和Host侧计算图中的定义不同,异步Dump的bin数据格式为Device侧格式,如果想要转为Host侧格式,可以参考[如何进行dump数据文件Format转换](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devaids/auxiliarydevtool/atlasaccuracy_16_0057.html)。 +数据在Device侧的格式可能和Host侧计算图中的定义不同,Ascend O2模式下Dump的bin数据格式为Device侧格式,如果想要转为Host侧格式,可以参考[如何进行dump数据文件Format转换](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devaids/auxiliarydevtool/atlasaccuracy_16_0057.html)。 -异步Dump生成的数据文件是`bin`文件时,文件命名格式为: +Ascend O2模式下Dump生成的数据文件是`bin`文件时,文件命名格式为: ```text {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp} @@ -665,17 +583,17 @@ Opdebug.Node_OpDebug.{task_id}.{stream_id}.{timestamp}.output.0.json 如果`op_type`和`op_name`中出现了“.”、“/”、“\”、空格时,会转换为下划线表示。 -Dump生成的原始数据文件也可以使用MindSpore Insight的数据解析工具DumpParser解析,DumpParser的使用方式详见[DumpParser介绍](https://gitee.com/mindspore/mindinsight/tree/master/mindinsight/parser) 。MindSpore Insight解析出来的数据格式与同步dump的数据格式完全相同。 +Dump生成的原始数据文件也可以使用MindSpore Insight的数据解析工具DumpParser解析,DumpParser的使用方式详见[DumpParser介绍](https://gitee.com/mindspore/mindinsight/tree/master/mindinsight/parser) 。MindSpore Insight解析出来的数据格式与Ascend O0/O1模式下Dump的数据格式完全相同。 
-若配置`file_format`值为`npy`,则启用异步dump生成的数据文件命名规则与同步Dump相同,可以参考[同步Dump数据文件介绍](#数据对象目录和数据文件介绍),溢出检测生成的溢出文件是`json`格式,溢出文件内容解析可参考[解析算子溢出数据文件](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devguide/appdevg/aclpythondevg/aclpythondevg_0078.html#ZH-CN_TOPIC_0000001781325073__section6864050111619) 。 +若配置`file_format`值为`npy`,则启用Ascend O2模式下Dump生成的数据文件命名规则与Ascend O0/O1模式下Dump相同,可以参考[Ascend O0/O1模式下Dump数据文件介绍](#数据对象目录和数据文件介绍),溢出检测生成的溢出文件是`json`格式,溢出文件内容解析可参考[解析算子溢出数据文件](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devguide/appdevg/aclpythondevg/aclpythondevg_0078.html#ZH-CN_TOPIC_0000001781325073__section6864050111619) 。 -选项`saved_data`只有在`file_format`为"npy"的时候生效。如`saved_data`是"statistic"或者"full"。张量统计数据会落盘到`statistic.csv`。如`saved_data`是"tensor"或者"full"完整张量数据会落盘到`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.{dtype}.npy`。`statistic.csv`的格式与同步Dump相同,可以参考[同步Dump数据文件介绍](#数据对象目录和数据文件介绍)。 +选项`saved_data`只有在`file_format`为"npy"的时候生效。如`saved_data`是"statistic"或者"full"。张量统计数据会落盘到`statistic.csv`。如`saved_data`是"tensor"或者"full"完整张量数据会落盘到`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`。`statistic.csv`的格式与Ascend O0/O1模式下Dump相同,可以参考[Ascend O0/O1模式下Dump数据文件介绍](#数据对象目录和数据文件介绍)。 ### 数据分析样例 -异步Dump不会自动保存`.ir`文件,要想查看`.ir`文件,可以在执行用例前通过MindSpore的IR保存开关`set_context(save_graphs=2)`, 执行用例后查看保存的`trace_code_graph_{xxx}`文件, 可以用vi打开。文件查看方式请参考同步dump的数据分析样例。在图编译等级为O0或O1时,异步Dump保存的算子文件和图文件中的算子名不同,所以此场景不推荐使用异步Dump,建议使用同步Dump。在图编译等级为O2时,由于`.ir`文件中并不是最终执行图,不能保证算子文件和`.ir`文件中的算子名一一对应。保存最终的执行图请参考昇腾社区文档[DUMP_GE_GRAPH](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0011.html) 、[DUMP_GRAPH_LEVEL](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0012.html) 
和[DUMP_GRAPH_PATH](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0013.html) 。 +Ascend O2模式下Dump不会自动保存`.ir`文件,要想查看`.ir`文件,可以在执行用例前通过MindSpore的IR保存开关`set_context(save_graphs=2)`, 执行用例后查看保存的`trace_code_graph_{xxx}`文件, 可以用vi打开。文件查看方式请参考Ascend O0模式下的数据分析样例。Ascend O2模式下,由于`.ir`文件中并不是最终执行图,不能保证算子文件和`.ir`文件中的算子名一一对应。保存最终的执行图请参考昇腾社区文档[DUMP_GE_GRAPH](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0011.html) 、[DUMP_GRAPH_LEVEL](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0012.html) 和[DUMP_GRAPH_PATH](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/apiref/envref/envref_07_0013.html) 。 -通过异步Dump的功能,获取到算子异步Dump生成的数据文件。如果异步Dump配置文件中设置的`file_format`为"npy",可以跳过以下步骤中的1、2,如果没有设置`file_format`,或者设置为"bin",需要先转换成`.npy`格式的文件。 +Ascend O2模式下Dump生成的数据文件可以通过以下3个步骤进行解析。如果Ascend O2模式下Dump配置文件中设置的`file_format`为"npy",可以跳过以下步骤中的1、2,如果没有设置`file_format`,或者设置为"bin",需要先转换成`.npy`格式的文件。 1. 使用run包中提供的`msaccucmp.py`解析Dump出来的文件。不同的环境上`msaccucmp.py`文件所在的路径可能不同,可以通过`find`命令进行查找: @@ -730,13 +648,344 @@ Dump生成的原始数据文件也可以使用MindSpore Insight的数据解析 numpy.load("Conv2D.Default_network-WithLossCell__backbone-AlexNet_conv3-Conv2d_Conv2D-op12.2.7.161243956333802.input.0.32x256x13x13.npy") ``` +## CPU/GPU模式Dump + +### 操作步骤 + +1. 
创建json格式的配置文件,JSON文件的名称和位置可以自定义设置。 + + ```json + { + "common_dump_settings": { + "op_debug_mode": 0, + "dump_mode": 0, + "path": "/absolute_path", + "net_name": "ResNet50", + "iteration": "0|5-8|100-120", + "saved_data": "tensor", + "input_output": 0, + "kernels": ["Default/Conv-op12"], + "support_device": [0,1,2,3,4,5,6,7], + "statistic_category": ["max", "min", "l2norm"] + }, + "e2e_dump_settings": { + "enable": true, + "trans_flag": true + } + } + ``` + + - `common_dump_settings`: + + - `op_debug_mode`:该属性用于算子溢出或算子异常调试,CPU/GPU Dump只支持设置成0,表示保存所有算子或指定算子。 + - `dump_mode`:设置成0,表示Dump出该网络中的所有算子数据;设置成1,表示Dump`"kernels"`里面指定的算子数据或算子类型数据;设置成2,表示使用[mindspore.set_dump](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.set_dump.html) Dump指定对象。仅在op_debug_mode设置为0时支持指定算子dump。 + - `path`:Dump保存数据的绝对路径。 + - `net_name`:自定义的网络名称,例如:"ResNet50"。 + - `iteration`:指定需要Dump数据的迭代。类型为str,用“|”分离要保存的不同区间的step的数据。如"0|5-8|100-120"表示Dump第1个,第6个到第9个, 第101个到第121个step的数据。指定“all”,表示Dump所有迭代的数据。仅在op_debug_mode设置为0或3时支持保存指定迭代,op_debug_mode设置为4时不支持指定迭代。 + - `saved_data`: 指定Dump的数据。类型为str,取值成"tensor",表示Dump出完整张量数据;取值成"statistic",表示只Dump张量的统计信息;取值"full"代表两种都要。统计信息现只支持GPU场景,CPU场景若选"statistic"或"full"便会错误退出。默认取值为"tensor"。保存统计信息仅支持op_debug_mode设置为0的场景。 + - `input_output`:设置成0,表示Dump出算子的输入和算子的输出;设置成1,表示Dump出算子的输入;设置成2,表示Dump出算子的输出。在op_debug_mode设置为4时,只能保存算子输入。 + - `kernels`:该项可以配置三种格式: + 1. 算子的名称列表。开启IR保存开关`set_context(save_graphs=2)`并执行用例,从生成的IR文件`trace_code_graph_{graph_id}`中获取算子名称。详细说明可以参照教程:[如何保存IR](https://www.mindspore.cn/tutorials/zh-CN/master/advanced/error_analysis/mindir.html#如何保存ir)。 + 需要注意的是,是否设置`set_context(save_graphs=2)`可能会导致同一个算子的id不同,所以在Dump指定算子时要在获取算子名称之后保持这一项设置不变。或者也可以在Dump保存的`ms_output_trace_code_graph_{graph_id}.ir`文件中获取算子名称,参考[CPU/GPU模式下Dump数据对象目录](#数据对象目录和数据文件介绍-2)。 + 2. 还可以指定算子类型。当字符串中不带算子scope信息和算子id信息时,后台则认为其为算子类型,例如:"conv"。算子类型的匹配规则为:当发现算子名中包含算子类型字符串时,则认为匹配成功(不区分大小写),例如:"conv" 可以匹配算子 "Conv2D-op1234"、"Conv3D-op1221"。 + 3. 
算子名称的正则表达式。当字符串符合"name-regex(xxx)"格式时,后台则会将其作为正则表达式。例如,"name-regex(Default/.+)"可匹配算子名称以"Default/"开头的所有算子。 + - `support_device`:支持的设备,默认设置成0到7即可;在分布式训练场景下,需要dump个别设备上的数据,可以只在`support_device`中指定需要Dump的设备Id。该配置参数在CPU上无效,因为CPU下没有device这个概念,但是在json格式的配置文件中仍需保留该字段。 + - `statistic_category`: 该属性用于用户配置要保存的统计信息类别,仅在开启了保存统计信息(即`saved_data`设置为"statistic"或"full")时生效。类型为字符串列表,其中的字符串可选值如下: + + - "max": 表示Tensor中元素的最大值; + - "min": 表示Tensor中元素的最小值; + - "avg": 表示Tensor中元素的平均值; + - "count": 表示Tensor中元素的个数; + - "negative zero count": 表示Tensor中小于0的元素个数; + - "positive zero count": 表示Tensor中大于0的元素个数; + - "nan count": 表示Tensor中元素的`Nan`的个数; + - "negative inf count": 表示Tensor中`-Inf`元素的个数; + - "positive inf count": 表示Tensor中`+Inf`元素的个数; + - "zero count": 表示Tensor中元素`0`的个数; + - "md5": 表示Tensor的MD5值; + - "l2norm": 表示Tensor的L2Norm值。 + + CPU/GPU Dump模式只支持host侧统计信息计算。 + 该字段为可选,默认值为["max", "min", "l2norm"]。 + + - `e2e_dump_settings`: + + - `enable`:在CPU/GPU Dump模式下,该字段必须设置为`true`。 + - `trans_flag`:开启格式转换。将设备上的数据格式转换成NCHW格式。若为`True`,则数据会以Host侧的4D格式(NCHW)格式保存;若为`False`,则保留Device侧的数据格式。该配置参数在CPU上无效,因为CPU上没有format转换。默认值:true。 + +2. 设置Dump环境变量。 + + 指定Dump的json配置文件。 + + ```bash + export MINDSPORE_DUMP_CONFIG=${xxx} + ``` + + 其中"xxx"为配置文件的绝对路径,如: + + ```bash + export MINDSPORE_DUMP_CONFIG=/path/to/data_dump.json + ``` + + 如果Dump配置文件没有设置`path`字段或者设置为空字符串,还需要配置环境变量`MS_DIAGNOSTIC_DATA_PATH`。 + + ```bash + export MS_DIAGNOSTIC_DATA_PATH=${yyy} + ``` + + 则“$MS_DIAGNOSTIC_DATA_PATH/debug_dump”就会被当做`path`的值。若Dump配置文件中设置了`path`字段,则仍以该字段的实际取值为准。 + + 注意: + + - 在网络脚本执行前,设置好环境变量;网络脚本执行过程中设置将会不生效。 + - 在分布式场景下,Dump环境变量需要在调用`mindspore.communication.init`之前配置。 + +3. 
启动网络训练脚本。 + + 训练启动后,若正确配置了`MINDSPORE_DUMP_CONFIG`环境变量,则会读取配置文件的内容,并按照Dump配置中指定的数据保存路径保存算子数据。 + GPU环境如果要Dump数据,必须采用非数据下沉模式(设置`model.train`或`DatasetHelper`中的`dataset_sink_mode`参数为`False`),以保证可以获取每个step的Dump数据。 + 若脚本中都不调用`model.train`或`DatasetHelper`,则默认为非数据下沉模式。使用Dump功能将自动生成最终执行图的IR文件。 + + 可以在训练脚本中设置`set_context(reserve_class_name_in_scope=False)`,避免Dump文件名称过长导致Dump数据文件生成失败。 + +4. 通过`numpy.load`读取和解析CPU/GPU模式下Dump数据,参考[CPU/GPU模式下Dump数据文件介绍](#数据对象目录和数据文件介绍-2)。 + +### 数据对象目录和数据文件介绍 + +启动训练后,CPU/GPU模式下Dump保存的数据对象包括最终执行图(`ms_output_trace_code_graph_{graph_id}.ir`文件)以及图中算子的输入和输出数据,数据目录结构如下所示: + +```text +{path}/ + - rank_{rank_id}/ + - .dump_metadata/ + - {net_name}/ + - {graph_id}/ + - {iteration_id}/ + {op_type}.{op_name}.json + statistic.csv + {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy + - constants/ + Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.npy + ... + - graphs/ + ms_output_trace_code_graph_{graph_id}.pb + ms_output_trace_code_graph_{graph_id}.ir + - execution_order/ + ms_execution_order_graph_{graph_id}.csv + ms_global_execution_order_graph_{graph_id}.csv +``` + +- `path`:`data_dump.json`配置文件中设置的绝对路径。 +- `rank_id`: 逻辑卡号。 +- `net_name`:`data_dump.json`配置文件中设置的网络名称。 +- `graph_id`:训练的图标号。 +- `iteration_id`:训练的轮次。 +- `op_type`:算子类型。 +- `op_name`:算子名称。 +- `task_id`:任务标号。 +- `stream_id`:流标号。 +- `timestamp`:时间戳。 +- `input_output_index`:输入或输出标号,例如`output.0`表示该文件是该算子的第1个输出Tensor的数据。 +- `slot`:slot标号。 +- `format`: 数据格式。 +- `data_id`: 常量数据标号。 + +对于多图网络,由于存在控制流,某些子图可能不会被执行,Dump只保存执行过的节点,所以graphs目录下`.pb`文件名中的{graph_id}并不一定在{net_name}下存在对应的{graph_id}目录。 + +只当`saved_data`为"statistic"或者"full"时,才会生成`statistic.csv`,当`saved_data`为"tensor"或者"full"时,才会生成`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`命名的完整张量信息。 + +只当`save_kernel_args`为`True`时,才会生成`{op_type}.{op_name}.json`,保存算子的初始化信息。 + +CPU/GPU模式下Dump生成的数据文件是后缀名为`.npy`的文件,文件命名格式为: + +```text 
+{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy +``` + +CPU/GPU模式下Dump生成的常量数据文件与其他数据文件格式相同,而所有常量数据的{op_type},{task_id},{stream_id},{input_output_index},{slot},{format}不变。 + +```text +Parameter.data-{data_id}.0.0.{timestamp}.output.0.DefaultFormat.npy +``` + +{iteration_id}目录下也可能会保存Parameter开头的文件(weight, bias等参数会保存成Parameter开头的文件)。 + +可以用Numpy的`numpy.load`接口读取数据。 + +CPU/GPU模式下Dump生成的统计数据文件名为`statistic.csv`,此文件存有相同目录下所有落盘张量(文件名为`{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{input_output_index}.{slot}.{format}.npy`)的统计信息。每个张量一行,每行有张量的 Op Type、Op Name、Task ID、Stream ID、Timestamp、IO,Slot、Data Size、Data Type、Shape以及用户配置的统计信息项。注意,如果用Excel来打开此文件,数据可能无法正确显示。请用`vi`、`cat`等命令查看,或者使用Excel自文本导入csv查看。 + +CPU/GPU模式下Dump生成的最终执行图文件后缀名分别为`.pb`和`.ir`,文件命名格式为: + +```text +ms_output_trace_code_graph_{graph_id}.pb +ms_output_trace_code_graph_{graph_id}.ir +``` + +其中以`.ir`为后缀的文件可以通过`vi`命令打开查看。 + +CPU/GPU模式下Dump生成的节点执行序文件后缀名为`.csv`,文件命名格式为: + +```text +ms_execution_order_graph_{graph_id}.csv +``` + +图执行历史文件的后缀为`.csv`,文件名格式为: + +```text +ms_global_execution_order_graph_{graph_id}.csv +``` + +此文件记录该图在训练过程中的执行轮次历史。图编译过程中,一张根图可能产生多张子图,但子图与根图具有相同的执行轮次历史。故与图执行序文件不同,此处仅保存根图的图执行历史文件。 + +`.dump_metadata`记录了训练的原信息,其中`data_dump.json`保存了用户设置的dump配置。 + +### 数据分析样例 + +为了更好地展示使用Dump来保存数据并分析数据的流程,我们提供了一套[完整样例脚本](https://gitee.com/mindspore/docs/tree/master/docs/sample_code/dump) ,CPU/GPU模式下Dump只需要执行 `bash run_sync_dump.sh`。 + +在通过Dump功能将脚本对应的图保存到磁盘上后,会产生最终执行图文件`ms_output_trace_code_graph_{graph_id}.ir`。该文件中保存了对应的图中每个算子的堆栈信息,记录了算子对应的生成脚本。 + +以[AlexNet脚本](https://gitee.com/mindspore/docs/blob/master/docs/sample_code/dump/train_alexnet.py)为例 : + +```python +... 
+def conv(in_channels, out_channels, kernel_size, stride=1, padding=0, pad_mode="valid"): + weight = weight_variable() + return nn.Conv2d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + weight_init=weight, has_bias=False, pad_mode=pad_mode) + + +def fc_with_initialize(input_channels, out_channels): + weight = weight_variable() + bias = weight_variable() + return nn.Dense(input_channels, out_channels, weight, bias) + + +def weight_variable(): + return TruncatedNormal(0.02) + + +class AlexNet(nn.Cell): + """ + Alexnet + """ + + def __init__(self, num_classes=10, channel=3): + super(AlexNet, self).__init__() + self.conv1 = conv(channel, 96, 11, stride=4) + self.conv2 = conv(96, 256, 5, pad_mode="same") + self.conv3 = conv(256, 384, 3, pad_mode="same") + self.conv4 = conv(384, 384, 3, pad_mode="same") + self.conv5 = conv(384, 256, 3, pad_mode="same") + self.relu = nn.ReLU() + self.max_pool2d = nn.MaxPool2d(kernel_size=3, stride=2) + self.flatten = nn.Flatten() + self.fc1 = fc_with_initialize(6 * 6 * 256, 4096) + self.fc2 = fc_with_initialize(4096, 4096) + self.fc3 = fc_with_initialize(4096, num_classes) + + def construct(self, x): + """ + The construct function. + + Args: + x(int): Input of the network. + + Returns: + Tensor, the output of the network. + """ + x = self.conv1(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.conv2(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.conv3(x) + x = self.relu(x) + x = self.conv4(x) + x = self.relu(x) + x = self.conv5(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.flatten(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.relu(x) + x = self.fc3(x) + return x +... 
+```
+
+如果用户想查看脚本中第175行的代码:
+
+```python
+x = self.conv3(x)
+```
+
+执行完训练网络后,可以从最终执行图(`ms_output_trace_code_graph_{graph_id}.ir`文件)中查找到该行代码所对应的多个算子信息,例如Conv2D-op12对应的文件内容如下所示:
+
+```text
+  %20(equivoutput) = Conv2D(%17, %19) {instance name: conv2d} primitive_attrs: {IsFeatureMapInputList: (0), kernel_size: (3, 3), mode: 1, out_channel: 384, input_names: [
+x, w], pri_format: NC1HWC0, pad: (0, 0, 0, 0), visited: true, pad_mod: same, format: NCHW, pad_list: (1, 1, 1, 1), precision_flag: reduce, groups: 1, output_used_num:
+(1), stream_id: 0, stride: (1, 1, 1, 1), group: 1, dilation: (1, 1, 1, 1), output_names: [output], IsFeatureMapOutput: true, ms_function_graph: true}
+    : (, ) -> ()
+    : (, ) -> ()
+    : full_name_with_scope: (Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op12)
+       ...
+       # In file ./train_alexnet.py(175)/        x = self.conv3(x)/
+       ...
+```
+
+以上所示文件内容的各行所表示的含义如下:
+
+- 算子在Host侧(第一行)和Device侧(第二行,有些算子可能不存在)的输入输出情况。从执行图可知,该算子有两个输入(箭头左侧),一个输出(箭头右侧)。
+
+    ```text
+    : (, ) -> ()
+    : (, ) -> ()
+    ```
+
+- 算子名称。从执行图可知,该算子在最终执行图中的完整名称为`Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op12`。
+
+    ```text
+    : (Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op12)
+    ```
+
+- 算子对应的训练脚本代码。通过搜索要查询的训练脚本代码,可以找到多个匹配的算子。
+
+    ```text
+    # In file {Absolute path of model_zoo}/official/cv/alexnet/src/alexnet.py(175)/        x = self.conv3(x)/
+    ```
+
+通过算子名称和输入输出信息,可以查找到唯一对应的Tensor数据文件。比如,若要查看Conv2D-op12算子的第1个输出数据对应的Dump文件,可获取以下信息:
+
+- `operator_name`:`Conv2D-op12`。
+
+- `input_output_index`:`output.0`表示该文件是该算子的第1个输出Tensor的数据。
+
+- `slot`:0,该算子的输出只有一个slot。
+
+在Dump保存的数据对象文件目录下搜索到相应的文件名:
+`Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.npy`。
+
+还原数据的时候,通过执行:
+
+```python
+import numpy
+numpy.load("Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.npy")
+```
+
+生成numpy.array数据。
+
 ## 其它说明
 
 ### 其它dump方法
 
 在一些特殊场景下,可在开发指导下应用GE dump模式。
 
-如果要使能GE 
dump,除了配置环境变量MINDSPORE_DUMP_CONFIG之外,还需要另外配置环境变量ENABLE_MS_GE_DUMP=1,该方式仅支持图编译等级为O2的场景。配置文件的格式和异步Dump相同,op_debug_mode字段不支持配置为4,其余各项参数和异步Dump相同。 +如果要使能GE dump,除了配置环境变量MINDSPORE_DUMP_CONFIG之外,还需要另外配置环境变量ENABLE_MS_GE_DUMP=1,该方式仅支持图编译等级为O2的场景。配置文件的格式和Ascend O2模式下Dump相同,op_debug_mode字段不支持配置为4,其余各项参数和Ascend O2模式下Dump相同。 ```bash export ENABLE_MS_GE_DUMP=1 @@ -757,7 +1006,7 @@ GE dump的目录结构如下: mapping.csv ``` -其中, `path`、`time`、`device_id`、`model_name`、`model_id`、`iteration_id`、`op_type`、`op_name`、`task_id`、`stream_id`、`timestamp`的含义和异步Dump的相同。 +其中, `path`、`time`、`device_id`、`model_name`、`model_id`、`iteration_id`、`op_type`、`op_name`、`task_id`、`stream_id`、`timestamp`的含义和Ascend O2模式下Dump的相同。 该方式在将来会被废弃,不推荐使用。 @@ -767,4 +1016,4 @@ GE dump的目录结构如下: - Dump仅支持bool、int、int8、in16、int32、int64、uint、uint8、uint16、uint32、uint64、float、float16、float32、float64、bfloat16、double、complex64、complex128类型数据的保存。 - complex64和complex128仅支持保存为npy文件,不支持保存为统计值信息。 - Print算子内部有一个输入参数为string类型,string类型不属于Dump支持的数据类型,所以在脚本中包含Print算子时,会有错误日志,这不会影响其它类型数据的保存。 -- 使能异步Dump时,不支持同时使用set_context(ascend_config={"exception_dump": "2")配置轻量异常dump; 支持同时使用set_context(ascend_config={"exception_dump": "1")配置全量异常dump。 +- 使能Ascend O2模式下Dump时,不支持同时使用set_context(ascend_config={"exception_dump": "2"})配置轻量异常dump; 支持同时使用set_context(ascend_config={"exception_dump": "1"})配置全量异常dump。 -- Gitee From 3c0435202be4086cb6ed830bd1caafea9e5197ad Mon Sep 17 00:00:00 2001 From: liuzihan000 Date: Tue, 20 Aug 2024 11:36:06 +0800 Subject: [PATCH 4/7] fix dump doc review comment --- docs/mindspore/source_en/model_train/debug/dump.md | 3 +-- docs/mindspore/source_zh_cn/model_train/debug/dump.md | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/mindspore/source_en/model_train/debug/dump.md b/docs/mindspore/source_en/model_train/debug/dump.md index 9d7906ef77..b4db5f3c4d 100644 --- a/docs/mindspore/source_en/model_train/debug/dump.md +++ b/docs/mindspore/source_en/model_train/debug/dump.md @@ -14,7 +14,7 @@ In different 
modes, the Dump features supported by MindSpore are not entirely th - [Dump in Ascend O2 Mode](#dump-in-ascend-o2-mode) - [Dump in CPU/GPU mode](#dump-in-cpugpu-mode) -> - The differences between Ascend O0, O1, and O2 modes can be found in [the parameter jit_level of the set_context method](https://www.mindspore.cn/docs/en/r2.3.1/api_python/mindspore/mindspore.set_context.html). +> - The differences between Ascend O0, O1, and O2 modes can be found in [the parameter jit_level of the set_context method](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.set_context.html). > > - Dumping constant data is only supported in CPU/GPU mode, while not supported in Ascend O0/O1/O2 mode. > @@ -105,7 +105,6 @@ MindSpore supports different Dump functionalities under various modes, as shown > 1. In terms of statistics, the computing speed of the device is faster than that of the host(currently only supported on Ascend backend), but the host has more statistical indicators than the device. Refer to the `statistic_category` option for details. -> 2. Only supported on the Ascend backend. 
## Dump in Ascend O0/O1 Mode diff --git a/docs/mindspore/source_zh_cn/model_train/debug/dump.md b/docs/mindspore/source_zh_cn/model_train/debug/dump.md index 53f568a202..ccc98ea457 100644 --- a/docs/mindspore/source_zh_cn/model_train/debug/dump.md +++ b/docs/mindspore/source_zh_cn/model_train/debug/dump.md @@ -14,7 +14,7 @@ MindSpore在不同模式下支持的Dump功能不完全相同,需要的配置 - [Ascend下O2模式Dump](#ascend下o2模式dump) - [CPU/GPU模式Dump](#cpugpu-dump) -> - Ascend下O0/O1/O2模式的区别请见[set_context的参数jit_level](https://www.mindspore.cn/docs/zh-CN/r2.3.1/api_python/mindspore/mindspore.set_context.html)。 +> - Ascend下O0/O1/O2模式的区别请见[set_context的参数jit_level](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.set_context.html)。 > > - CPU/GPU模式支持dump常量数据,Ascend O0/O1/O2模式不支持Dump常量数据。 > @@ -104,7 +104,7 @@ MindSpore在不同模式下支持的Dump功能如下表所示: -> 在统计信息方面,device计算速度较host快(目前仅支持Ascend后端),但host统计指标比device多,详见`statistic_category`选项。 +> 1. 在统计信息方面,device计算速度较host快(目前仅支持Ascend后端),但host统计指标比device多,详见`statistic_category`选项。 ## Ascend下O0/O1模式Dump -- Gitee From 2e78c4e53711fcb286bc2b5e5065df49e638e8d7 Mon Sep 17 00:00:00 2001 From: fuchao Date: Thu, 22 Aug 2024 04:33:54 +0800 Subject: [PATCH 5/7] =?UTF-8?q?O2=E6=B5=81=E7=A8=8B=E6=94=AF=E6=8C=81set?= =?UTF-8?q?=20dump?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/mindspore/source_en/model_train/debug/dump.md | 4 ++-- docs/mindspore/source_zh_cn/model_train/debug/dump.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/mindspore/source_en/model_train/debug/dump.md b/docs/mindspore/source_en/model_train/debug/dump.md index b4db5f3c4d..18e67b978e 100644 --- a/docs/mindspore/source_en/model_train/debug/dump.md +++ b/docs/mindspore/source_en/model_train/debug/dump.md @@ -86,7 +86,7 @@ MindSpore supports different Dump functionalities under various modes, as shown set_dump Supported - Not Supported + Supported Not Supported @@ -477,7 +477,7 @@ Generate the numpy.array 
data. - `common_dump_settings`: - `op_debug_mode`: This attribute is used for operator overflow debugging. 0: disable overflow check function; 3: enable overflow check function; 4: enable the lightweight exception dump function. Set it to 0 when Dump data is processed. If it is not set to 0, only the data of the overflow operator or exception operator will be dumped. - - `dump_mode`: 0: all operator data in the network dumped out; 1: dump kernels data in kernels list. When overflow detection is enabled, the setting of this field becomes invalid, and Dump only saves the data of the overflow node. Specified data dump is supported only when "dump_mode' is set to `0`. + - `dump_mode`: 0: all operator data in the network dumped out; 1: dump kernels data in kernels list. When overflow detection is enabled, the setting of this field becomes invalid, and Dump only saves the data of the overflow node. 2: dump target and its contents using [mindspore.set_dump](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.set_dump.html). Specified data dump is supported only when "dump_mode' is set to `0`. - `path`: The absolute path to save Dump data. - `net_name`: The customized net name: "ResNet50". - `iteration`: Specify the iterations to dump, type is string. Use "|" to separate the step data of different intervals to be saved. For example, "0 | 5-8 | 100-120" represents dump the data of the 1st, 6th to 9th, and 101st to 121st steps. If iteration set to "all", data of every iteration will be dumped. Specified iteration dump is supported only when "op_debug_mode" is set to `0`, not supported when when "op_debug_mode" is set to `3` or `4`. 
diff --git a/docs/mindspore/source_zh_cn/model_train/debug/dump.md b/docs/mindspore/source_zh_cn/model_train/debug/dump.md index ccc98ea457..80c43b9527 100644 --- a/docs/mindspore/source_zh_cn/model_train/debug/dump.md +++ b/docs/mindspore/source_zh_cn/model_train/debug/dump.md @@ -86,7 +86,7 @@ MindSpore在不同模式下支持的Dump功能如下表所示: set_dump 支持 - 不支持 + 支持 不支持 @@ -466,7 +466,7 @@ numpy.load("Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.float - `common_dump_settings`: - `op_debug_mode`:该属性用于算子溢出调试,设置成0,表示不开启溢出;设置成3,表示开启溢出检测功能;设置成4,表示开启轻量异常Dump功能。在Dump数据的时候请设置成0,若设置成其他值,则只会Dump溢出算子或异常算子的数据。 - - `dump_mode`:设置成0,表示Dump出该网络中的所有算子数据;设置成1,表示Dump`"kernels"`里面指定的算子数据或算子类型数据。仅在op_debug_mode设置为0时支持指定算子dump。op_debug_mode设置为非0值时,此字段的设置失效,Dump只会保存溢出算子的数据或者异常算子的数据。 + - `dump_mode`:设置成0,表示Dump出该网络中的所有算子数据;设置成1,表示Dump`"kernels"`里面指定的算子数据或算子类型数据。设置成2,表示使用[mindspore.set_dump](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.set_dump.html) Dump指定对象。仅在op_debug_mode设置为0时支持指定算子dump。op_debug_mode设置为非0值时,此字段的设置失效,Dump只会保存溢出算子的数据或者异常算子的数据。 - `path`:Dump保存数据的绝对路径。 - `net_name`:自定义的网络名称,例如:"ResNet50"。 - `iteration`:指定需要Dump的迭代。类型为str,用“|”分离要保存的不同区间的step的数据。如"0|5-8|100-120"表示Dump第1个,第6个到第9个, 第101个到第121个step的数据。指定“all”,表示Dump所有迭代的数据。仅在op_debug_mode设置为0时支持保存指定迭代,op_debug_mode设置为3或4时不支持指定迭代。 -- Gitee From 78037e3dbb943546a22eed3ee049690fd0208c6a Mon Sep 17 00:00:00 2001 From: maning202007 Date: Fri, 30 Aug 2024 10:24:59 +0800 Subject: [PATCH 6/7] Add overflow dump restrict introduce, for not support interger types --- docs/mindspore/source_en/model_train/debug/dump.md | 4 ++-- docs/mindspore/source_zh_cn/model_train/debug/dump.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/mindspore/source_en/model_train/debug/dump.md b/docs/mindspore/source_en/model_train/debug/dump.md index 18e67b978e..88dad1772e 100644 --- a/docs/mindspore/source_en/model_train/debug/dump.md +++ b/docs/mindspore/source_en/model_train/debug/dump.md @@ -137,7 
+137,7 @@ MindSpore supports different Dump functionalities under various modes, as shown - `common_dump_settings`: - - `op_debug_mode`: This attribute is used for operator overflow or operator exception debugging. 0: save all operators or specified operators; 3: only save overflow operators; 4: only save input of the exception operator. Set it to 0 when the data is dumped. If it is not set to 0, only the data of the overflow operator or exception operator will be dumped. Default: 0. + - `op_debug_mode`: This attribute is used for operator overflow or operator exception debugging. 0: save all operators or specified operators; 3: only save overflow operators, this feature only supports floating-point overflow and does not support integer types; 4: only save input of the exception operator. Set it to 0 when the data is dumped. If it is not set to 0, only the data of the overflow operator or exception operator will be dumped. Default: 0. - `dump_mode`: 0: all operator data in the network dumped out; 1: the operator data specified in Dump `"kernels"`; 2: dump target and its contents using [mindspore.set_dump](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.set_dump.html). Specified data dump is supported only when "dump_mode' is set to `0`. - `path`: The absolute path to Dump saved data. - `net_name`: The customized net name: "ResNet50". @@ -476,7 +476,7 @@ Generate the numpy.array data. - `common_dump_settings`: - - `op_debug_mode`: This attribute is used for operator overflow debugging. 0: disable overflow check function; 3: enable overflow check function; 4: enable the lightweight exception dump function. Set it to 0 when Dump data is processed. If it is not set to 0, only the data of the overflow operator or exception operator will be dumped. + - `op_debug_mode`: This attribute is used for operator overflow debugging. 
0: disable overflow check function; 3: enable overflow check function, this feature only supports floating-point overflow and does not support integer types; 4: enable the lightweight exception dump function. Set it to 0 when Dump data is processed. If it is not set to 0, only the data of the overflow operator or exception operator will be dumped. - `dump_mode`: 0: all operator data in the network dumped out; 1: dump kernels data in kernels list. When overflow detection is enabled, the setting of this field becomes invalid, and Dump only saves the data of the overflow node. 2: dump target and its contents using [mindspore.set_dump](https://www.mindspore.cn/docs/en/master/api_python/mindspore/mindspore.set_dump.html). Specified data dump is supported only when "dump_mode' is set to `0`. - `path`: The absolute path to save Dump data. - `net_name`: The customized net name: "ResNet50". diff --git a/docs/mindspore/source_zh_cn/model_train/debug/dump.md b/docs/mindspore/source_zh_cn/model_train/debug/dump.md index 80c43b9527..6950608f8e 100644 --- a/docs/mindspore/source_zh_cn/model_train/debug/dump.md +++ b/docs/mindspore/source_zh_cn/model_train/debug/dump.md @@ -137,7 +137,7 @@ MindSpore在不同模式下支持的Dump功能如下表所示: - `common_dump_settings`: - - `op_debug_mode`:该属性用于算子溢出或算子异常调试,设置成0,表示保存所有算子或指定算子;设置成3,表示只保存溢出算子;设置成4,表示只保存异常算子的输入。在Dump数据的时候请设置成0,若设置成其他值,则只会Dump溢出算子或异常算子的数据。默认值:0。 + - `op_debug_mode`:该属性用于算子溢出或算子异常调试,设置成0,表示保存所有算子或指定算子;设置成3,表示只保存溢出算子,该功能仅支持浮点数溢出,不支持整数类型;设置成4,表示只保存异常算子的输入。在Dump数据的时候请设置成0,若设置成其他值,则只会Dump溢出算子或异常算子的数据。默认值:0。 - `dump_mode`:设置成0,表示Dump出该网络中的所有算子数据;设置成1,表示Dump`"kernels"`里面指定的算子数据或算子类型数据;设置成2,表示使用[mindspore.set_dump](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.set_dump.html) Dump指定对象。仅在op_debug_mode设置为0时支持指定算子dump。 - `path`:Dump保存数据的绝对路径。 - `net_name`:自定义的网络名称,例如:"ResNet50"。 @@ -465,7 +465,7 @@ numpy.load("Conv2D.Conv2D-op12.0.0.1623124369613540.output.0.DefaultFormat.float - `common_dump_settings`: - - 
`op_debug_mode`:该属性用于算子溢出调试,设置成0,表示不开启溢出;设置成3,表示开启溢出检测功能;设置成4,表示开启轻量异常Dump功能。在Dump数据的时候请设置成0,若设置成其他值,则只会Dump溢出算子或异常算子的数据。 + - `op_debug_mode`:该属性用于算子溢出调试,设置成0,表示不开启溢出;设置成3,表示开启溢出检测功能,该功能仅支持浮点数溢出,不支持整数类型;设置成4,表示开启轻量异常Dump功能。在Dump数据的时候请设置成0,若设置成其他值,则只会Dump溢出算子或异常算子的数据。 - `dump_mode`:设置成0,表示Dump出该网络中的所有算子数据;设置成1,表示Dump`"kernels"`里面指定的算子数据或算子类型数据。设置成2,表示使用[mindspore.set_dump](https://www.mindspore.cn/docs/zh-CN/master/api_python/mindspore/mindspore.set_dump.html) Dump指定对象。仅在op_debug_mode设置为0时支持指定算子dump。op_debug_mode设置为非0值时,此字段的设置失效,Dump只会保存溢出算子的数据或者异常算子的数据。 - `path`:Dump保存数据的绝对路径。 - `net_name`:自定义的网络名称,例如:"ResNet50"。 -- Gitee From ce4825f16be670c588531a1394e9c7da6c7244f4 Mon Sep 17 00:00:00 2001 From: fandawei Date: Fri, 30 Aug 2024 16:57:41 +0800 Subject: [PATCH 7/7] add device_stat_precision_mode doc --- docs/mindspore/source_en/model_train/debug/dump.md | 1 + docs/mindspore/source_zh_cn/model_train/debug/dump.md | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/mindspore/source_en/model_train/debug/dump.md b/docs/mindspore/source_en/model_train/debug/dump.md index 88dad1772e..449d3ceb49 100644 --- a/docs/mindspore/source_en/model_train/debug/dump.md +++ b/docs/mindspore/source_en/model_train/debug/dump.md @@ -175,6 +175,7 @@ MindSpore supports different Dump functionalities under various modes, as shown - `enable`: When set to true, enable Synchronous Dump. When set to false or not set, Asynchronous Dump will be used on Ascend. The main difference between the two is that Asynchronous Dump has less impact on the original code execution order. - `trans_flag`: Enable trans flag. Transform the device data format into NCHW. If it is `True`, the data will be saved in the 4D format (NCHW) format on the Host side; if it is `False`, the data format on the Device side will be retained. Default: `True`. - `stat_calc_mode`: Select the backend for statistical calculations. Options are "host" and "device". 
Choosing "device" enables device computation of statistics, currently only effective on Ascend, and supports only min/max/avg/l2norm statistics. + - `device_stat_precision_mode`(optional): Device statistics calculation precision mode, optional "high" and "low". When "high" is selected, avg/l2norm statistics will be calculated using float32, which will increase device memory usage and have higher precision; when "low" is selected, the same type as the original data will be used for calculation, which will occupy less device memory, but may cause statistics overflow when processing large values. The default value is "high". - `sample_mode`(Optional): Setting it to 0 means the sample dump function is not enabled. Enable the sample dump function in graph compilation with optimization level O0 or O1. This field is effective only when "op_debug_mode" is set to `0`, sample dump cannot be enabled in other scene. - `sample_num`(Optional): Used to control the size of sample in sample dump. The default value is 100. - `save_kernel_args`(Optional): When set to true, the initialization information of kernels will be saved. 
diff --git a/docs/mindspore/source_zh_cn/model_train/debug/dump.md b/docs/mindspore/source_zh_cn/model_train/debug/dump.md
index 6950608f8e..c580b483dc 100644
--- a/docs/mindspore/source_zh_cn/model_train/debug/dump.md
+++ b/docs/mindspore/source_zh_cn/model_train/debug/dump.md
@@ -175,6 +175,7 @@ MindSpore在不同模式下支持的Dump功能如下表所示:
     - `enable`:设置成true,表示开启同步Dump;设置成false时,采用异步Dump。不设置该字段时默认值为false,开启异步Dump。两者的区别是异步Dump对原本代码执行过程的影响更小。
     - `trans_flag`:开启格式转换,将设备上的数据格式转换成NCHW格式。若为`True`,则数据会以Host侧的4D格式(NCHW)格式保存;若为`False`,则保留Device侧的数据格式。该配置参数在CPU上无效,因为CPU上没有format转换。默认值:true。
     - `stat_calc_mode`:选择统计信息计算后端,可选"host"和"device"。选择"device"后可以使能device计算统计信息,当前只在Ascend生效,只支持`min/max/avg/l2norm`统计量。
+    - `device_stat_precision_mode`(可选):device统计信息精度模式,可选"high"和"low"。选择"high"时,`avg/l2norm`统计量会使用float32进行计算,会增加device内存占用,精度更高;为"low"时将使用与原始数据相同的类型进行计算,device内存占用较少,但在处理较大数值时可能会导致统计量溢出。默认值为"high"。
    - `sample_mode`(可选):设置成0,表示不开启切片dump功能;设置成1时,在图编译等级为O0或O1的情况下开启切片dump功能。仅在op_debug_mode设置为0时生效,其它场景不会开启切片dump功能。
    - `sample_num`(可选):用于控制切片dump中切片的大小。默认值为100。
    - `same_kernel_args`(可选): 设置成true时,会保存算子的初始化信息。
-- 
Gitee