diff --git a/.gitignore b/.gitignore index 9135f4001a81838bf47905f80d4b0cfcb8f9e535..346c31d73e1d23baffe52f9a4348fa707a33a97c 100644 --- a/.gitignore +++ b/.gitignore @@ -201,3 +201,4 @@ torch_npu/csrc/aten/RegisterAutogradNPU.cpp torch_npu/csrc/aten/RegisterCPU.cpp torch_npu/csrc/aten/RegisterNPU.cpp torch_npu/csrc/aten/python_custom_functions.cpp +torch_npu/_op_plugin_docs.py diff --git a/.gitmodules b/.gitmodules index fb139b2f62cc6eef1d5fa1ab4ba541f63e530007..b3e22c585b857999b60fb66b05a50a5c7b5c7868 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,7 @@ [submodule "third_party/op-plugin"] path = third_party/op-plugin url = https://gitee.com/ascend/op-plugin.git + branch = 7.1.0 ignore = dirty [submodule "third_party/googletest"] path = third_party/googletest @@ -8,9 +9,13 @@ [submodule "third_party/torchair/torchair"] path = third_party/torchair/torchair url = https://gitee.com/ascend/torchair.git + branch = 7.1.0 [submodule "third_party/Tensorpipe"] path = third_party/Tensorpipe url = https://gitee.com/ascend/Tensorpipe.git [submodule "third_party/fmt"] path = third_party/fmt url = https://gitee.com/mirrors/fmt.git +[submodule "third_party/nlohmann"] + path = third_party/nlohmann + url = https://gitee.com/mirrors/nlohmann-json.git diff --git a/CMakeLists.txt b/CMakeLists.txt index ad39472ed6b82729c9286e3b4ad3e54b08a47e45..113c17f7a69f97d7dc8d1af053b922f0feb83576 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -205,6 +205,7 @@ include_directories(${PROJECT_SOURCE_DIR}/torch_npu/csrc/aten) include_directories(${PROJECT_SOURCE_DIR}/third_party/hccl/inc) include_directories(${PROJECT_SOURCE_DIR}/third_party/acl/inc) include_directories(${PROJECT_SOURCE_DIR}/third_party/Tensorpipe) +include_directories(${PROJECT_SOURCE_DIR}/third_party/nlohmann/include) # Set installed PyTorch dir if(DEFINED PYTORCH_INSTALL_DIR) diff --git a/OWNERS b/OWNERS index 9b52a186831eefc041c8f04d4de187ba3939c3ee..0067915256f32c4b5a94ebefbc3a94044d0aba45 100644 --- a/OWNERS +++ b/OWNERS @@ -13,6 +13,7 @@ approvers: - sunboquan - wangchao147 - yanpengquan07 +- kaixin1976 reviewers: - xiaxia3 - ascendzyj @@ -47,3 +48,5 @@ reviewers: - guo-guanghao - yuhaiyan - wangchao147 +- insanecoder +- Chen_LiQing diff --git a/README.md b/README.md index 150fae73662f471501d1193e5dcf5512c1e7ae23..cfcf17db756843a2fd88ce864866778a5b1463e9 100644 --- a/README.md +++ b/README.md @@ -18,13 +18,13 @@ Install **PyTorch** through pip. **For Aarch64:** -```Python +```bash pip3 install torch==2.1.0 ``` **For x86:** -```Python +```bash pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu ``` @@ -32,28 +32,28 @@ pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu Run the following command to install dependencies. -```Python +```bash pip3 install pyyaml pip3 install setuptools ``` If the installation fails, use the download link or visit the [PyTorch official website](https://pytorch.org/) to download the installation package of the corresponding version. 
-| OS arch | Python version | link | -| ------- | -------------- | ------------------------------------------------------------ | -| x86 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | -| x86 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | -| x86 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | -| x86 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | -| aarch64 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | -| aarch64 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | +| OS arch | Python version | link | +|---------|----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| x86 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | +| x86 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | +| x86 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | +| x86 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | +| aarch64 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | +| aarch64 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | | aarch64 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=a04a0296d47f28960f51c18c5489a8c3472f624ec3b5bcc8e2096314df8c3342) | | aarch64 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) | 3. **Install torch-npu** ``` -pip3 install torch-npu==2.1.0.post10 +pip3 install torch-npu==2.1.0.post12 ``` ### From Source @@ -63,7 +63,7 @@ In some special scenarios, users may need to compile **torch-npu** by themselves 1. 
**Clone torch-npu** ``` - git clone https://github.com/ascend/pytorch.git -b v2.1.0 --depth 1 + git clone https://github.com/ascend/pytorch.git -b v2.1.0-7.0.0 --depth 1 ``` 2. **Build Docker Image** @@ -131,17 +131,18 @@ print(z) ## User Manual -Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for more detailed informations. +Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for more detailed information. ## PyTorch and Python Version Matching Table | PyTorch Version | Python Version | |-----------------|:----------------------------------------------------------| | PyTorch1.11.0 | Python3.7.x(>=3.7.5),Python3.8.x,Python3.9.x,Python3.10.x | -| PyTorch2.1.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | -| PyTorch2.2.0 | Python3.8.x,Python3.9.x,Python3.10.x | -| PyTorch2.3.1 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | -| PyTorch2.4.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.1.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.2.0 | Python3.8.x,Python3.9.x,Python3.10.x | +| PyTorch2.3.1 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.4.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.5.1 | Python3.9.x,Python3.10.x,Python3.11.x | ## Ascend Auxiliary Software @@ -149,6 +150,10 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | CANN Version | Supported PyTorch Version | Supported Extension Version | Github Branch | |-----------------------|---------------------------|-----------------------------|-------------------| +| CANN 8.1.RC1 | 2.5.1 | 2.5.1 | v2.5.1-7.0.0 | +| | 2.4.0 | 2.4.0.post4 | v2.4.0-7.0.0 | +| | 2.3.1 | 2.3.1.post6 | v2.3.1-7.0.0 | +| | 2.1.0 | 2.1.0.post12 | v2.1.0-7.0.0 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | | | 2.3.1 | 2.3.1.post4 | v2.3.1-6.0.0 | | | 2.1.0 | 2.1.0.post10 | v2.1.0-6.0.0 | @@ -159,7 +164,6 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | | | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | | | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | -| CANN 8.0.RC2.alpha002 | 2.3.1 | 2.3.1rc1 | v2.3.1 | | CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | | | 2.1.0 | 2.1.0.post4 | v2.1.0-6.0.rc1 | | | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | @@ -227,16 +231,17 @@ The version branches of AscendPyTorch have the following maintenance phases: ## PyTorch Maintenance Policies -| **PyTorch** | **Maintenance Policies** | **Status** | **Launch Date** | **Subsequent Status** | **EOL Date** | -|-----------|--------------------|--------------|------------|-----------------|-----------| -| 2.4.0 | Regular Release | Development | 2024/10/15 |Expected to enter maintenance status from June 15, 2025 | | -| 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from June 7th, 2025 | | -| 2.2.0 | Regular Release | Maintained | 2024/04/01 | Expected to enter maintenance free status from September 10th, 2025| | -| 2.1.0 | Long Term Support | Development | 2023/10/15 | Expected to enter maintenance status from September 15th, 2025 | | -| 2.0.1 | Regular Release | EOL | 2023/7/19 | | 2024/3/14 | -| 1.11.0 | Long Term Support | Maintained | 2023/4/19 | Expected to enter maintenance free status from September 10th, 2025 | | -| 1.8.1 | Long Term Support | EOL | 2022/4/10 | | 2023/4/10 | -| 1.5.0 | Long Term Support | EOL | 2021/7/29 | | 2022/7/29 | +| **PyTorch** | **Maintenance Policies** | **Status** | 
**Launch Date** | **Subsequent Status** | **EOL Date** | +|-------------|--------------------------|-------------|-----------------|-------------------------------------------------------------------|--------------| +| 2.5.1 | Regular Release | Development | 2024/11/08 | Expected to enter maintenance status from April 8, 2025 | | +| 2.4.0 | Regular Release | Development | 2024/10/15 | Expected to enter maintenance status from June 15, 2025 | | +| 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from June 7, 2025 | | +| 2.2.0 | Regular Release | Maintained | 2024/04/01 | Expected to enter maintenance free status from September 10, 2025 | | +| 2.1.0 | Long Term Support | Development | 2023/10/15 | Expected to enter maintenance status from September 15, 2025 | | +| 2.0.1 | Regular Release | EOL | 2023/7/19 | | 2024/3/14 | +| 1.11.0 | Long Term Support | Maintained | 2023/4/19 | Expected to enter maintenance free status from September 10, 2025 | | +| 1.8.1 | Long Term Support | EOL | 2022/4/10 | | 2023/4/10 | +| 1.5.0 | Long Term Support | EOL | 2021/7/29 | | 2022/7/29 | ## Reference Documents diff --git a/README.zh.md b/README.zh.md index 0d5bf465073412170c5707f356fd923cf1547f9b..2e5c370ca6946ac48f86bcefb11b541709271683 100644 --- a/README.zh.md +++ b/README.zh.md @@ -18,53 +18,53 @@ **aarch64:** -```Python +```bash pip3 install torch==2.1.0 ``` **x86:** -```Python +```bash pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu ``` 若使用pip命令安装失败,请使用下载链接或进入[PyTorch官方网站](https://pytorch.org/)进行查询下载对应版本。 -| 架构 | Python版本 | 下载链接 | -| ------- | ---------- | ------------------------------------------------------------ | -| x86 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | -| x86 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | -| x86 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | -| x86 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | -| aarch64 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | -| aarch64 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | +| 架构 | Python版本 | 下载链接 | +|---------|------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| x86 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | +| x86 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | +| x86 | Python3.10 | 
[下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | +| x86 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | +| aarch64 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | +| aarch64 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | | aarch64 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=a04a0296d47f28960f51c18c5489a8c3472f624ec3b5bcc8e2096314df8c3342) | -| aarch64 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) +| aarch64 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) | 2. **安装torch_npu依赖** 运行以下命令安装依赖。 -```Python +```bash pip3 install pyyaml pip3 install setuptools ``` 3. **安装torch_npu** -``` -pip3 install torch-npu==2.1.0.post10 +```bash +pip3 install torch-npu==2.1.0.post12 ``` 如需要保存安装日志,可在pip3 install命令后面加上参数 `--log `,并对您指定的目录``做好权限管控。 ### 使用源代码进行安装 -某些特殊场景下,用户可能需要自行编译**torch_npu**。可以根据[昇腾辅助软件表](#昇腾辅助软件)和[PyTorch与Python版本配套表](#PyTorch与Python版本配套表)选择合适的分支。推荐使用Docker镜像编译**torch_npu**,可以通过以下步骤获取(建议只挂载工作路径,并避开系统路径,以降低安全风险), 生成的.whl文件路径为./dist/。如果不使用镜像,编译时请注意gcc版本遵循如下约束:ARM架构下推荐使用gcc 10.2版本, X86架构下推荐使用gcc 9.3.1 +某些特殊场景下,用户可能需要自行编译**torch_npu**。可以根据[昇腾辅助软件表](#昇腾辅助软件)和[PyTorch与Python版本配套表](#pytorch与python版本配套表)选择合适的分支。推荐使用Docker镜像编译**torch_npu**,可以通过以下步骤获取(建议只挂载工作路径,并避开系统路径,以降低安全风险), 生成的.whl文件路径为./dist/。如果不使用镜像,编译时请注意gcc版本遵循如下约束:ARM架构下推荐使用gcc 10.2版本, X86架构下推荐使用gcc 9.3.1 1. **克隆torch_npu代码仓** ``` - git clone https://gitee.com/ascend/pytorch.git -b v2.1.0 --depth 1 + git clone https://gitee.com/ascend/pytorch.git -b v2.1.0-7.0.0 --depth 1 ``` 2. 
**构建镜像** @@ -110,7 +110,7 @@ Pytorch框架训练环境的卸载可以参考[昇腾官方文档](https://www.h torch_npu的卸载只需执行命令: - ``` + ```bash pip3 uninstall torch_npu ``` @@ -147,10 +147,11 @@ print(z) | PyTorch版本 | Python版本 | |---------------|:-------------------------------------------------------------| | PyTorch1.11.0 | Python3.7.x(>=3.7.5), Python3.8.x, Python3.9.x, Python3.10.x | -| PyTorch2.1.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.1.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | | PyTorch2.2.0 | Python3.8.x, Python3.9.x, Python3.10.x | -| PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | -| PyTorch2.4.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.4.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.5.1 | Python3.9.x, Python3.10.x, Python 3.11.x | ## 昇腾辅助软件 @@ -158,6 +159,10 @@ print(z) | CANN版本 | 支持的PyTorch版本 | 支持的Extension版本 | Gitee分支 | |-----------------------|--------------|------------------|-------------------| +| CANN 8.1.RC1 | 2.5.1 | 2.5.1 | v2.5.1-7.0.0 | +| | 2.4.0 | 2.4.0.post4 | v2.4.0-7.0.0 | +| | 2.3.1 | 2.3.1.post6 | v2.3.1-7.0.0 | +| | 2.1.0 | 2.1.0.post12 | v2.1.0-7.0.0 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | | | 2.3.1 | 2.3.1.post4 | v2.3.1-6.0.0 | | | 2.1.0 | 2.1.0.post10 | v2.1.0-6.0.0 | @@ -167,8 +172,7 @@ print(z) | CANN 8.0.RC2 | 2.3.1 | 2.3.1 | v2.3.1-6.0.rc2 | | | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | | | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | -| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | -| CANN 8.0.RC2.alpha002 | 2.3.1 | 2.3.1rc1 | v2.3.1 | +| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | | CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | | | 2.1.0 | 2.1.0.post4 | v2.1.0-6.0.rc1 | | | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | @@ -237,16 +241,17 @@ AscendPyTorch版本分支的维护阶段如下: ## PyTorch版本维护策略 -| **PyTorch版本** | **维护策略** | **当前状态** | **发布时间** | **后续状态** | **EOL日期** | -|-----------|-----------|--------|------------|-----------------------|-----------| -| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/06/15起进入维护状态 | - | -| 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2025/06/07起进入维护状态 | | -| 2.2.0 | 常规分支 | 维护 | 2024/04/01 | 预计2025/09/10起进入无维护状态 | | -| 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/09/15起进入维护状态 | | -| 2.0.1 | 常规分支 | EOL | 2023/7/19 | | 2024/03/14 | -| 1.11.0 | 长期支持 | 维护 | 2023/4/19 | 预计2025/09/10起进入无维护状态 | | -| 1.8.1 | 长期支持 | EOL | 2022/4/10 | | 2023/4/10 | -| 1.5.0 | 长期支持 | EOL | 2021/7/29 | | 2022/7/29 | +| **PyTorch版本** | **维护策略** | **当前状态** | **发布时间** | **后续状态** | **EOL日期** | +|---------------|----------|----------|------------|----------------------|------------| +| 2.5.1 | 常规分支 | 开发 | 2024/11/08 | 预计2025/04/08起进入维护状态 | - | +| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/06/15起进入维护状态 | - | +| 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2025/06/07起进入维护状态 | | +| 2.2.0 | 常规分支 | 维护 | 2024/04/01 | 预计2025/09/10起进入无维护状态 | | +| 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/09/15起进入维护状态 | | +| 2.0.1 | 常规分支 | EOL | 2023/7/19 | | 2024/03/14 | +| 1.11.0 | 长期支持 | 维护 | 2023/4/19 | 预计2025/09/10起进入无维护状态 | | +| 1.8.1 | 长期支持 | EOL | 2022/4/10 | | 2023/4/10 | +| 1.5.0 | 长期支持 | EOL | 2021/7/29 | | 2022/7/29 | ## 安全声明 diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index 942a33ffa12ac75d6c01fefc0bad5bd8d0d5152a..acbfa62c2e7c6de0f8aaadc6339f1bd29c43b1f6 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -8,6 +8,8 @@ PyTorch 2.2.0以下版本存在CVE-2024-31580漏洞,该漏洞在torch/csrc/jit PyTorch 
2.2.0以下版本存在CVE-2024-31584漏洞,该漏洞在torch/csrc/jit/mobile/flatbuffer_loader.cpp组件中存在越界读取问题。 如需要使用jit相关功能,建议用户使用2.2.0及以上的修复版本。 +PyTorch 2.6.0以下版本存在CVE-2025-32434漏洞,该漏洞因torch/serialization.py组件兼容性处理导致潜在的远程代码执行(RCE)风险。 torch_npu已参考[LINK](https://github.com/pytorch/pytorch/pull/145020)进行修复。 + ## 系统安全加固 建议用户在系统中配置开启ASLR(级别2 ),又称**全随机地址空间布局随机化**,可参考以下方式进行配置: @@ -59,7 +61,7 @@ torch_npu内集成性能分析工具profiler: ## 数据安全声明 -1. PyTorch使用过程中需要加载和保存数据,部分接口使用风险模块pickle,可能存在数据风险,如torch.load、torch.distributed.scatter_object_list等接口,可参考[torch.load](https://pytorch.org/docs/2.1/generated/torch.load.html#torch.load)、[collective-functions](https://pytorch.org/docs/2.1/distributed.html#collective-functions)了解具体风险。 +1. PyTorch使用过程中需要加载和保存数据,部分接口使用风险模块pickle,可能存在数据风险,如torch.load、torch.jit.load、torch.distributed.scatter_object_list等接口,可参考[torch.load](https://pytorch.org/docs/2.1/generated/torch.load.html#torch.load)、[collective-functions](https://pytorch.org/docs/2.1/distributed.html#collective-functions)了解具体风险。 2. Ascend Extension for PyTorch依赖CANN的基础能力实现AOE性能调优、算子dump、日志记录等功能,用户需要关注上述功能生成文件的权限控制,加强对相关数据的保护。 ## 构建安全声明 @@ -85,6 +87,7 @@ torch_npu支持源码编译安装,在编译时会下载依赖第三方库并 | 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/torchair.git | 依赖的开源代码仓 | | 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/Tensorpipe.git | 依赖的开源代码仓 | | 自研 | 不涉及 | .gitmodules | https://gitee.com/mirrors/fmt.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | .gitmodules | https://gitee.com/mirrors/nlohmann-json.git | 依赖的开源代码仓 | | 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | | 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://download.pytorch.org/whl/cpu | docker配置源,用于配置torch下载连接 | | 自研 | 不涉及 | ci\docker\ARM\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | diff --git a/Third_Party_Open_Source_Software_Notice b/Third_Party_Open_Source_Software_Notice index b8084998368e25e73600803368f329ae3cad90b1..ca39279142bb6cd54d917e835551c818b4512a71 100644 --- a/Third_Party_Open_Source_Software_Notice +++ b/Third_Party_Open_Source_Software_Notice @@ -6,118 +6,109 @@ Warranty Disclaimer THE OPEN SOURCE SOFTWARE IN THIS PRODUCT IS DISTRIBUTED IN THE HOPE THAT IT WILL BE USEFUL, BUT WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. SEE THE APPLICABLE LICENSES FOR MORE DETAILS. Copyright Notice and License Texts -Software: pytorch v2.1.0 +Software: pytorch v2.3.1 Copyright notice: -Copyright (c) 2016- Facebook, Inc -Copyright (c) 2014- Facebook, Inc -Copyright (c) 2011-2014 Idiap Research Institute -Copyright (c) 2012-2014 Deepmind Technologies -Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) -Copyright (c) 2011-2013 NYU -Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) -Copyright (c) 2006 Idiap Research Institute -Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) -Copyright (c) 2016-present, Facebook Inc. -Copyright (c) 2016 Facebook Inc. -Copyright (c) 2015 Google Inc. -Copyright (c) 2015 Yangqing Jia -Copyright 2019-2020 Kakao Brain -Copyright (c) 2022 Cruise LLC. -Copyright (c) 2013, 2014, 2015, the respective contributors -Copyright (c) 2015, 2016 the respective contributors -Copyright (c) 2014, The Regents of the University of California (Regents) -Copyright (c) 2014, the respective contributors -Copyright (c) 2018, Steven Moshier -Copyright (c) 2001-2002 Enthought, Inc. 
2003-2019, SciPy Developers -Copyright (c) 1997-2011 by Secret Labs AB -Copyright (c) 1995-2011 by Fredrik Lundh -Copyright (c) 2010-2022 by Alex Clark and contributors -Copyright (c) 2006 The Android Open Source Project -Copyright (c) Facebook, Inc. and its affiliates -Copyright (c) Meta Platforms, Inc. and affiliates -Copyright 2004-present Facebook -Copyright (c) 2017 by Contributors -Copyright (c) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura -Copyright (c) 2022 Apple Inc. -Copyright (c) 2023 Apple Inc. -Copyright 2005 Robert Kern (robert.kern@gmail.com) -copyright 2019 The TensorFlow Authors -Copyright (c) 2018 MathInf GmbH, Thomas Viehmann -Copyright (c) 2014 Indiana University (c) -Copyright John Maddock 2006 -Copyright (c) 2012 Massachusetts Institute of Technology -Copyright (c) 2012 Giovanni Garberoglio Interdisciplinary Laboratory for Computational Science (LISC) Fondazione Bruno Kessler and University of Trento -Copyright (c) 2018 Marat Dukhan -Copyright (c) 2017-2018 Facebook Inc. -Copyright (c) 2017 Georgia Institute of Technology -Copyright 2015 Google Inc. -Copyright (c) 2011-2021, NVIDIA CORPORATION. -Copyright (c) 2022, Tri Dao -Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. -Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. -Copyright (c) 2017 The Android Open Source Project -Copyright (c) 2016-present, Facebook, Inc. -Copyright (c) 2005-2020 Rich Felker -Copyright Malte Skarupke 2017 -Copyright 2008 Google Inc. -Copyright (c) 2011 - 2012 Andrzej Krzemienski -Copyright (c) 2001-2019 Free Software Foundation, Inc. -Copyright (c) 1994 Hewlett-Packard Company -Copyright (c) 1996-1998 Silicon Graphics Computer Systems, Inc. -Copyright (c) Bjorn Fahller -Copyright Michael Park, 2015-2017 -Copyright (c) 2017-present, Facebook, Inc. -Copyright (c) 2018-present, Facebook, Inc. -Copyright (c) 2008-2015 The Khronos Group Inc. -Copyright 2016 Facebook -Copyright (c) 2016, NVIDIA CORPORATION -Copyright (c) 2008 - 2012 The Khronos Group Inc. -Copyright (c) 2008-2013 The Khronos Group Inc. -Copyright (c) 2008-2012 The Khronos Group Inc. -Copyright (c) 2016-2017, ARM Limited and Contributors -Copyright (c) 2014-2015 The Khronos Group Inc. -Copyright (c) 2015-2017 The Khronos Group Inc. -Copyright (c) Facebook Inc. and Microsoft Corporation -Copyright (c) 2014-2017 The Regents of the University of California (Regents) -Copyright (c) 2014-2017, the respective contributors -Copyright (c) 2017 Microsoft -Copyright 2015 The Gemmlowp Authors -Copyright (c) 2011-2019 Stephan Brumme -Copyright 2006, Google Inc. -Copyright (c) Meta Platforms, Inc. and its affiliates -Copyright (c) 2008 - 2009 NVIDIA Corporation -Copyright (c) 2007-2009 Scientific Computing and Imaging Institute, University of Utah -Copyright (c) 2006, Laurent Montel, montel@kde.org -Copyright 2013 Conrad Steenberg conrad.steenberg@gmail.com -copyright 2022, PyTorch -copyright 2023, PyTorch -Copyright (c) 2005-2022 NVIDIA Corporation Built -copyright PyTorch Contributors -Copyright (c) 2018 Alex Rogozhnikov -Copyright (c) 2016 Microsoft -Copyright (c) 2014, 2015, The Regents of the University of California (Regents) -Copyright (c) 2014, 2015, the respective contributors -Copyright (c) 2005-2017, NumPy Developers (c) Parameter containing Float -Copyright 2005, Google Inc. 
-Copyright 2019 Kakao Brain -Copyright 2013-2014 RAD Game -Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC -Copyright 2016 Martin Raiber -Copyright (c) 2003-2017 Josef Weidendorfer -Copyright (c) 2000-2017 Julian Seward -Copyright (c) Edward Z. Yang ezyang@mit.edu -Copyright (c) 2005-2010 ActiveState Software Inc. -Copyright (c) 2013 Eddy Petrisor -Copyright (c) 2010 ActiveState Software Inc. -Copyright (c) 2001-2014 Python Software Foundation -Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Python Software Foundation -Copyright Python Software Foundation -Copyright 2022 Cruise LLC -Copyright (c) 2014 Matthew Rocklin -Copyright (c) 2015 Melissa E. O'Neill -Copyright (c) 2019 NumPy Developers -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. -Copyright 2013 Mark Dickinson +Copyright (c) 2011-2013 NYU +Copyright (c) Microsoft Corporation +Copyright (c) 2014- Facebook, Inc +Copyright (c) 2017 The Android Open Source Project +Copyright Python Software Foundation +Copyright (c) 2018 Alex Rogozhnikov +Copyright (c) 2007-2009 Scientific Computing and Imaging Institute, University of Utah +Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved +Copyright (c) 2017 Microsoft +Copyright (c) Meta Platforms, Inc. +Copyright (c) 2022 Apple Inc. +Copyright (c) 2018-present, Facebook, Inc. +Copyright (c) Facebook Inc. and Microsoft Corporation +Copyright (c) 2005-2017, NumPy Developers. All rights reserved +Copyright (c) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, All rights reserved +Copyright (c) 2014, The Regents +Copyright (c) 2005-2010 ActiveState Software Inc. +Copyright 2005, Google Inc. All rights reserved +Copyright (c) 2022, Tri Dao +Copyright (c) 2001-2002 Enthought, Inc. 2003-2019, SciPy Developers. All rights reserved +Copyright 2008 Google Inc. All rights reserved +Copyright (c) 2003-2017 Josef Weidendorfer. All rights reserved +Copyright (c) 2014 Matthew Rocklin +Copyright (c) 2016 Microsoft +Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved +Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC +Copyright (c) 2008-2012 The Khronos Group Inc. +Copyright (c) 2016 Facebook Inc. +Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC All Rights Reserved +Copyright (c) 2006, Laurent Montel, +Copyright (c) 2015 Google Inc. All rights reserved +Copyright (c) 2010-2022 by Alex Clark and contributors +Copyright 2015 Google Inc. All Rights Reserved +Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +(c) BNParamType +Copyright 2013-2014 RAD Game +Copyright (c) 2011-2019 Stephan Brumme. All rights reserved +Copyright (c) 2018 MathInf GmbH, Thomas Viehmann +Copyright 2019 Kakao Brain +Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Python Software Foundation All Rights Reserved +Copyright 2004-present Facebook. All Rights Reserved +Copyright (c) 2008-2013 The Khronos Group Inc. +Copyright (c) Microsoft Corporation. All rights reserved +Copyright 2006, Google Inc. All rights reserved +Copyright (c) 2014-2015 The Khronos Group Inc. +Copyright 2015 The TensorFlow Authors. All Rights Reserved +Copyright (c) 2023, Tri Dao +Copyright (c) 2011-2014 Idiap Research Institute +Copyright (c) 2016-present, Facebook Inc. All rights reserved +Copyright (c) Advanced Micro Devices, Inc. 
+Copyright (c) 2001-2014 Python Software Foundation All Rights Reserved +Copyright (c) Bjorn Fahller +Copyright (c) 1995-2011 by Fredrik Lundh +Copyright (c) Edward Z. Yang +Copyright (c) 2012 Massachusetts Institute of Technology +Copyright (c) 2006 Idiap Research Institute +Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved +Copyright (c) 2008 - 2012 The Khronos Group Inc. +Copyright (c) 2015 Yangqing Jia All rights reserved +Copyright 2023-present Facebook. All Rights Reserved +Copyright 2013 Conrad Steenberg +Copyright (c) 2008-2015 The Khronos Group Inc. +Copyright (c) 2014-2017 The Regents +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright Malte Skarupke 2017 +Copyright (c) Meta Platforms, Inc. and affiliates +Copyright (c) 2023, Advanced Micro Devices, Inc. +Copyright (c) 2016- Facebook, Inc +Copyright (c) 1997-2011 by Secret Labs AB +Copyright (c) 2005-2022 NVIDIA Corporation Built +Copyright (c) Facebook, Inc. +Copyright 2019-2020 Kakao Brain +Copyright (c) 2000-2017 Julian Seward. All rights reserved +Copyright (c) 2016-2017, ARM Limited and Contributors +Copyright (c) 2005-2020 Rich Felker +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) +Copyright 2016 Facebook +Copyright (c) 2012-2014 Deepmind Technologies +Copyright (c) 2012 Giovanni Garberoglio Interdisciplinary Laboratory +Copyright (c) 2024, Tri Dao +Copyright (c) Donald Stufft and individual contributors. All rights reserved +Copyright (c) 2018, Steven Moshier All rights reserved +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved +Copyright (c) 2017-present, Facebook, Inc. +Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved +Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved +Copyright (c) 2014, 2015, The Regents +Copyright (c) 2013 Eddy Petrisor +Copyright (c) 2010 ActiveState Software Inc. +Copyright (c) 2006 The Android Open Source Project +Copyright (c) 2023 Apple Inc. +Copyright 2015 The Gemmlowp Authors. All Rights Reserved +Copyright (c) 2015-2017 The Khronos Group Inc. +Copyright 2022 Cruise LLC +Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved +Copyright (c) 2022 Cruise LLC. All rights reserved +Copyright (c) 2016-present, Facebook, Inc. +(c) Copyright John Maddock 2006 +Copyright (c) 2014 Indiana University All rights reserved +copyright 2019 The TensorFlow Authors License: BSD 3-Clause License Copyright (c) , , @@ -129,13 +120,3 @@ Redistribution and use in source and binary forms, with or without modification, THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -Written Offer -This product contains software whose rights holders license it on the terms of the GNU General Public License, version 2 (GPLv2) and/or other open source software licenses. We will provide you and any third party with the source code of the software licensed under an open source software license if you send us a written request by mail or email to the following addresses: -foss@huawei.com -detailing the name of the product and the firmware version for which you need the source code and indicating how we can contact you. - -Please note you need to make a payment before you obtain the complete Corresponding Source Code from us. For how much you will pay and how we will deliver the complete Corresponding Source Code to you, we will further discuss it by mail or email. -This offer is valid to anyone in receipt of this information. - -THIS OFFER IS VALID FOR THREE YEARS FROM THE MOMENT WE DISTRIBUTED THE PRODUCT OR FIRMWARE. diff --git a/ci/build.sh b/ci/build.sh index 3bb4ffee3808268cb19fa0b8689d658f6e13668f..df9a58b653c20be3d399c5141f8180f7fa06a777 100644 --- a/ci/build.sh +++ b/ci/build.sh @@ -5,7 +5,6 @@ set -e CUR_DIR=$(dirname $(readlink -f $0)) SUPPORTED_PY_VERSION=(3.8 3.9 3.10 3.11) PY_VERSION='3.8' # Default supported python version is 3.8 -export DISABLE_RPC_FRAMEWORK=FALSE # Parse arguments inside script function parse_script_args() { diff --git a/codegen/custom_functions.py b/codegen/custom_functions.py index e2fcf8330b4cc4a7aca24b8a52922a36b4897538..03cba66226b44a2438816b2dc7e386e7538bbe87 100644 --- a/codegen/custom_functions.py +++ b/codegen/custom_functions.py @@ -7,7 +7,7 @@ import yaml from torchgen.code_template import CodeTemplate from torchgen.gen import (parse_tags_yaml, FileManager, cpp_string, error_check_native_functions) from torchgen.model import (BackendIndex, DispatchKey, Variant, - NativeFunction, OperatorName, BackendMetadata, TensorOptionsArguments) + NativeFunction, OperatorName, BackendMetadata, TensorOptionsArguments, OptionalType) from torchgen.utils import concatMap from torchgen.context import with_native_function, native_function_manager from torchgen.api.types import DispatcherSignature @@ -129,14 +129,27 @@ def compute_op_definition(f: NativeFunction): f.func.arguments.flat_positional, ) candidate_tensor_args = [] + candidate_tensor_args_without_optional = [] for a in candidate_args: if a.type.is_tensor_like(): candidate_tensor_args.append(f"{a.name}") + pos = 0 + for a in f.func.arguments.flat_positional: + if a.type.is_tensor_like() and not isinstance(a.type, OptionalType): + candidate_tensor_args_without_optional.append([f"{a.name}", pos]) + pos += 1 + unsafe_tensor_check = """ // No unsafe tensor check""" if len(candidate_tensor_args) > 0: - unsafe_tensor_check = \ -"""if (c10_npu::get_npu_data_unsafe_flag()) {""" + unsafe_tensor_check = """""" + for tensor_arg in candidate_tensor_args_without_optional: + unsafe_tensor_check += \ +f"""at_npu::autograd::VariableType::unpack({tensor_arg[0]}, "{tensor_arg[0]}", {tensor_arg[1]}); +""" + unsafe_tensor_check += \ +""" +if (c10_npu::get_npu_data_unsafe_flag()) {""" for tensor_arg in candidate_tensor_args: unsafe_tensor_check = unsafe_tensor_check + f""" c10_npu::check_npu_tensor_is_safe({tensor_arg});""" @@ -167,6 +180,7 @@ def compute_op_definition(f: NativeFunction): None, ) + device_guard = "" if has_tensor_options and device_of is not None: device_guard = f""" c10::OptionalDeviceGuard device_guard(device_of({device_of})); diff --git a/codegen/gen_backend_stubs.py 
b/codegen/gen_backend_stubs.py index 100c8be07e68e52f9f3bec7cd3563adc62cf4218..bdb6c48a13a73aed84172c65409c5e65f42201fd 100644 --- a/codegen/gen_backend_stubs.py +++ b/codegen/gen_backend_stubs.py @@ -42,7 +42,7 @@ from codegen.gen_functionalization_type import gen_functionalization_definition, from codegen.utils import (get_torchgen_dir, rename_privateuse1_dispatch_key, gen_unstructured, add_header_to_template_file, get_grouped_native_functions_optional_out, parse_npu_yaml, get_opplugin_wrap_name, get_target_functions, merge_custom_yaml, field_tag, gen_custom_yaml_path, - update_opapi_info, is_opapi, PathManager, filt_exposed_api, get_target_native_registration, + update_opapi_info, is_opapi, update_internal_format_opapi_info, PathManager, filt_exposed_api, get_target_native_registration, NativeFunctionsGroupOptionalOut, gen_device_check, DEVICE_NOCHECK_SET) from codegen.custom_functions import (parse_custom_yaml, gen_custom_trace, gen_custom_ops_patch, gen_custom_functions_dispatch) @@ -150,6 +150,7 @@ def parse_native_and_custom_yaml(path: str, tag_path: str, custom_path: str) -> supported_es = source_es.get('supported', []) + source_es.get('autograd', []) + custom_es for es in supported_es: update_opapi_info(es) + update_internal_format_opapi_info(es) custom_es = field_tag(custom_es) for e in custom_es: func, m = NativeFunction.from_yaml(e, "Location", valid_tags) diff --git a/codegen/utils.py b/codegen/utils.py index e8514739d1b9ef4ca581ee97ab04c2ade53a3c3a..187f02fc9dea81f99c8c4c624840273ec3f0f3f4 100644 --- a/codegen/utils.py +++ b/codegen/utils.py @@ -52,6 +52,7 @@ from torchgen.dest.register_dispatch_key import RegisterDispatchKey GLOBAL_STRUCTURED_OP_INFO_CACHE = defaultdict(str) GLOBAL_OPAPI_INFO_CACHE = set() +GLOBAL_INTERNAL_FORMAT_OPAPI_INFO_CACHE = set() CUSTOM_YAML_NAME = "npu_native_functions_by_codegen.yaml" FIELDS_TO_USE = ["func", "tags", "dispatch", "device_check"] @@ -438,7 +439,8 @@ const DeviceGuard device_guard(device_or_default(device));""" if self.backend_index.dispatch_key is DispatchKey.PrivateUse1: if op_key not in op_hook_blacklist: - op_hook_check += f"""\ + if not is_opapi_support_internal_format(op_key): + op_hook_check += f"""\ if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) {{ {auto_lvalue} at_npu::native::OpHook::GetInstance().PreHook(\"{op_key}\", {args_exprs_str_for_op_hook}); @@ -451,13 +453,35 @@ if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) {{ }} }} """ - - return_code = f"""\ + else: + op_hook_check += f"""\ +if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) {{ +{auto_lvalue} + at_npu::native::OpHook::GetInstance().PreHook(\"{op_key}\", {args_exprs_str_for_op_hook}); + if (({force_aclnn} || at_npu::native::env::CheckJitDisable())) {{ + {res_of_op_hook_post_code}{op_api_impl_name}({args_exprs_str_for_op_hook}); + {return_of_op_hook_post_code} + }} else {{ + {res_of_op_hook_post_code}{impl_name}({args_exprs_str_for_op_hook}); + {return_of_op_hook_post_code} + }} +}} +""" + if not is_opapi_support_internal_format(op_key): + return_code = f"""\ if (({force_aclnn} || at_npu::native::env::CheckJitDisable()){tensor_check_str}) {{ return {op_api_impl_name}({args_exprs_str}); }} else {{ return {impl_name}({args_exprs_str}); }} +""" + else: + return_code = f"""\ +if (({force_aclnn} || at_npu::native::env::CheckJitDisable())) {{ + return {op_api_impl_name}({args_exprs_str}); + }} else {{ + return {impl_name}({args_exprs_str}); + }} """ else: if self.backend_index.dispatch_key is DispatchKey.PrivateUse1: @@ -594,6 +618,22 
@@ def is_opapi(op_key):
     return op_key in GLOBAL_OPAPI_INFO_CACHE
 
 
+def update_internal_format_opapi_info(op_info):
+    global GLOBAL_INTERNAL_FORMAT_OPAPI_INFO_CACHE
+    if isinstance(op_info, str):
+        return
+    elif isinstance(op_info, dict):
+        if op_info.get("internal_format_opapi", False):
+            GLOBAL_INTERNAL_FORMAT_OPAPI_INFO_CACHE.add(op_info.get("func").split("(")[0])
+    else:
+        print(f"Warning: Unsupported parameter types, only str and dict are supported, but input is {type(op_info)}")
+
+
+def is_opapi_support_internal_format(op_key):
+    global GLOBAL_INTERNAL_FORMAT_OPAPI_INFO_CACHE
+    return op_key in GLOBAL_INTERNAL_FORMAT_OPAPI_INFO_CACHE
+
+
 def get_target_functions(yaml_path: str, target_op_type: str = None) -> List:
     source_es = parse_npu_yaml(yaml_path)
 
diff --git a/setup.py b/setup.py
index c6c7216ec352865b38915c081a4e7698f6fbedce..5b18fa066410b8272902ba4c3202560e3c3076df 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ from wheel.bdist_wheel import bdist_wheel
 BASE_DIR = os.path.dirname(os.path.realpath(__file__))
 THIRD_PARTY_PATH = os.path.join(BASE_DIR, "third_party")
 
-VERSION = '2.1.0.post11'
+VERSION = '2.1.0.post13'
 UNKNOWN = "Unknown"
 
 BUILD_PERMISSION = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP
@@ -443,7 +443,9 @@ def get_src_py_and_dst():
         "torch_npu/csrc/*/*/*/*/*.h",
         "third_party/acl/inc/*/*.h",
         "third_party/hccl/inc/*/*.h",
-        "third_party/acl/inc/*/*/*.h"
+        "third_party/acl/inc/*/*/*.h",
+        "torch_npu/csrc/distributed/HCCLUtils.hpp",
+        "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp"
     ]
     add_ops_files(BASE_DIR, header_files)
     glob_header_files = []
diff --git a/test/README.md b/test/README.md
index e878386b9ee5e1d08a0d0bb496dfa834050a20f4..ef30d897a2e73b2740be88d8352e28b10d0ed80e 100644
--- a/test/README.md
+++ b/test/README.md
@@ -77,6 +77,6 @@ python ci/access_control_test.py --distributed
    可用于复现问题的用例:
    `python test_jit.py -v -k test_annotated_empty_dict`
 
-2. test_public_bindings.py 用例的作用
+2.
test_public_bindings.py 用例的功能 该用例是为了校验接口的公开规范性,如果该用例报错,请确认报错的接口是否要公开,并按照报错的提示进行修改。 diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 3dfe7a67ce3bcada34586656452aa2c5f81d5344..d63da7c59ee785e0b168abe3a41926f66070bd1d 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2517,10 +2517,12 @@ "_npu_dropout", "copy_memory_", "empty_with_format", + "empty_with_swapped_memory", "npu_alloc_float_status", "npu_apply_adam", "npu_advance_step_flashattn", "npu_batch_gather_matmul", + "npu_batch_gather_matmul_", "npu_bert_apply_adam", "npu_clear_float_status", "npu_cross_entropy_loss", @@ -2530,9 +2532,11 @@ "npu_nms_rotated", "npu_random_choice_with_mask", "npu_rms_norm", + "npu_add_rms_norm_cast", "npu_moe_compute_expert_tokens", "npu_fused_infer_attention_score", "npu_mla_prolog", + "npu_mla_prolog_v2", "npu_convert_weight_to_int4pack", "npu_ffn", "npu_geglu", @@ -2560,7 +2564,20 @@ "scatter_update_", "npu_kronecker_quant", "npu_group_norm_swish", - "npu_mrope" + "npu_mrope", + "npu_grouped_matmul_finalize_routing", + "npu_alltoallv_gmm", + "npu_gmm_alltoallv", + "npu_nsa_compress", + "npu_nsa_compress_infer", + "npu_nsa_compress_attention", + "npu_nsa_compress_attention_infer", + "npu_nsa_select_attention", + "npu_nsa_select_attention_infer", + "npu_transpose_batchmatmul", + "npu_gather_sparse_index", + "npu_moe_distribute_combine_add_rms_norm", + "npu_moe_eplb_update_expert" ], "torch_npu.contrib": [ "npu_fused_attention_with_layernorm", diff --git a/test/contrib/test_bbox_coder.py b/test/contrib/test_bbox_coder.py index 32ce77847272bd1eb7979585ddf9bf8cdf0a4384..9e0b9f919d7175d34a189a69d2573ca7c37b1e1e 100644 --- a/test/contrib/test_bbox_coder.py +++ b/test/contrib/test_bbox_coder.py @@ -9,7 +9,20 @@ from torch_npu.contrib.function import npu_bbox_coder_encode_yolo, \ class TestBboxCoder(TestCase): - @SupportedDevices(['Ascend910A']) + @SupportedDevices(["Ascend910B"]) + def test_npu_bbox_coder_encode_xyxy2xywh_A2(self): + bboxes = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]], dtype=torch.float32).to("npu") + gt_bboxes = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]], dtype=torch.float32).to("npu") + npuout_1 = npu_bbox_coder_encode_xyxy2xywh(bboxes, gt_bboxes) + npuout_2 = npu_bbox_coder_encode_xyxy2xywh(bboxes / 512., gt_bboxes / 512., is_normalized=True, + normalized_scale=512.) 
+ expect_cpu = torch.tensor([[1.3330, 1.3330, 0.0000, 0.0000], + [1.3330, 0.6665, 0.0000, np.nan]], dtype=torch.float32) + + self.assertRtolEqual(expect_cpu.numpy(), npuout_1.cpu().numpy()) + self.assertRtolEqual(expect_cpu.numpy(), npuout_2.cpu().numpy()) + + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_npu_bbox_coder_encode_xyxy2xywh(self): np.random.seed(123) data1 = np.random.randint(low=0, high=512, size=(6, 4)) diff --git a/test/contrib/test_deform_conv.py b/test/contrib/test_deform_conv.py index 6609685e46b62592969362ebf331a81274e965c7..13c7483ed1b805efacfbf7e149a50f278c99606c 100644 --- a/test/contrib/test_deform_conv.py +++ b/test/contrib/test_deform_conv.py @@ -9,7 +9,6 @@ from torch_npu.contrib.module import DCNv2 class TestDeformConv(TestCase): - @SupportedDevices(['Ascend910A']) def test_npu_deform_conv_1(self): np.random.seed(226) data1 = np.random.randn(2, 2, 3, 3) @@ -48,10 +47,9 @@ class TestDeformConv(TestCase): [[-0.1422, -0.2028, -0.1422], [-0.0641, 0.2660, -0.0641], [-0.1422, -0.2028, -0.1422]]]], dtype=torch.float32) - self.assertRtolEqual(expect_cpu_output, output.detach().cpu()) - self.assertRtolEqual(expect_cpu_xgrad, x.grad.cpu()) + self.assertRtolEqual(expect_cpu_output, output.detach().cpu(), prec=1.e-3) + self.assertRtolEqual(expect_cpu_xgrad, x.grad.cpu(), prec=1.e-3) - @SupportedDevices(['Ascend910A']) def test_npu_deform_conv_2(self): np.random.seed(546) data1 = np.random.randn(2, 2, 5, 5) @@ -102,8 +100,8 @@ class TestDeformConv(TestCase): [-0.1422, -0.2028, -0.1422, -0.2028, -0.1422], [-0.0641, 0.2660, -0.0641, 0.2660, -0.0641], [-0.1422, -0.2028, -0.1422, -0.2028, -0.1422]]]], dtype=torch.float32) - self.assertRtolEqual(expect_cpu_output, output.detach().cpu()) - self.assertRtolEqual(expect_cpu_xgrad, x.grad.cpu()) + self.assertRtolEqual(expect_cpu_output, output.detach().cpu(), prec=1.e-3) + self.assertRtolEqual(expect_cpu_xgrad, x.grad.cpu(), prec=1.e-3) if __name__ == "__main__": diff --git a/test/contrib/test_linear_quant.py b/test/contrib/test_linear_quant.py index 4b5ccad27df2bfe627eb2bf5b1ef3ab416008003..96ae37388ecbe4524104ea87073811650da52f50 100644 --- a/test/contrib/test_linear_quant.py +++ b/test/contrib/test_linear_quant.py @@ -10,22 +10,22 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] class TestLinearQuant(TestCase): - def npu_linear_quant(self, in_features, out_features, x1, x2, scale): - model = LinearQuant(in_features, out_features, bias=False, pertoken_scale=False, offset=False) + def npu_linear_quant(self, in_features, out_features, x1, x2, scale, output_dtype=torch.float16): + model = LinearQuant(in_features, out_features, bias=False, pertoken_scale=False, offset=False, + output_dtype=output_dtype) model = model.npu() model.weight.data = x2 model.scale.data = scale output = model(x1) return output - @SupportedDevices(['Ascend910A']) @unittest.skipIf(DEVICE_NAME == 'Ascend910A' or DEVICE_NAME == 'Ascend310P', "OP `QuantBatchMatmulV3` is not supported on 910A or 310P, skip this ut for this device type!") def test_npu_linear_quant(self): x1 = torch.randint(-1, 1, (1, 2), dtype=torch.int32).npu() x2 = torch.randint(-1, 1, (128, 2), dtype=torch.int32).npu() scale = torch.randn(1, dtype=torch.float32).npu() - supported_output = torch_npu.npu_quant_matmul(x1, x2.t(), scale) + supported_output = torch_npu.npu_quant_matmul(x1, x2.t(), scale, output_dtype=torch.float16) in_features = 2 out_features = 128 npu_out = self.npu_linear_quant(in_features, out_features, x1, x2, scale) diff --git 
a/test/contrib/test_matmul_transpose.py b/test/contrib/test_matmul_transpose.py index 2eebdb73527d2f415c4454ac06f916539d10d385..b60d6934cc6079b26ce8bb0ecbed5cff29ad9ebe 100644 --- a/test/contrib/test_matmul_transpose.py +++ b/test/contrib/test_matmul_transpose.py @@ -45,7 +45,7 @@ class TestMatmulTranspose(TestCase): return output, fast_time - @SupportedDevices(['Ascend910A']) + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_matmul_transpose_shape_format(self): shape_format = [ [[np.float16, 2, [50, 25, 7, 100]], [np.float16, 2, [50, 25, 10, 100]]], diff --git a/test/contrib/test_multiclass_nms.py b/test/contrib/test_multiclass_nms.py index 6f070ed842988ec32b86026180356063b377d930..15533da1a8273d973285631ff83255c34fb30e58 100644 --- a/test/contrib/test_multiclass_nms.py +++ b/test/contrib/test_multiclass_nms.py @@ -9,7 +9,7 @@ from torch_npu.contrib.function import npu_multiclass_nms, \ class TestMultiClassNms(TestCase): - @SupportedDevices(['Ascend910A']) + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_npu_multiclass_nms_1(self): np.random.seed(123) data1 = np.random.randint(low=1, high=255, size=(1000, 4)) @@ -26,7 +26,7 @@ class TestMultiClassNms(TestCase): self.assertRtolEqual(expect_det_bboxes, det_bboxes.cpu()) self.assertRtolEqual(expect_det_labels, det_labels.cpu()) - @SupportedDevices(['Ascend910A']) + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_npu_multiclass_nms_2(self): np.random.seed(123) data1 = np.random.randn(1000, 4) @@ -43,7 +43,7 @@ class TestMultiClassNms(TestCase): self.assertRtolEqual(expect_det_bboxes, det_bboxes.cpu()) self.assertRtolEqual(expect_det_labels, det_labels.cpu()) - @SupportedDevices(['Ascend910A']) + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_npu_batched_multiclass_nms_1(self): np.random.seed(339) data1 = np.random.randint(low=1, high=255, size=(4, 200, 80, 4)) diff --git a/test/contrib/test_roll.py b/test/contrib/test_roll.py index 0a08a6bc327e0b9fb3f410821afa8869fa5b53f6..1a3eb70d92224d8a7e90f465834fd825057a342f 100644 --- a/test/contrib/test_roll.py +++ b/test/contrib/test_roll.py @@ -37,7 +37,7 @@ class TestRoll(TestCase): return output.to("cpu").numpy(), fast_time - @SupportedDevices(['Ascend910A']) + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_roll_shape_format(self): dtype_list = [np.float16, np.float32, np.uint8, np.int32] format_list = [-1, 2] diff --git a/test/contrib/test_transfer_to_npu.py b/test/contrib/test_transfer_to_npu.py index 6e18628e092738e9f3e70bad8e2226779f19cd5f..bd950a79ce80e3c407fed4f4ec64bb9ba2da0f00 100644 --- a/test/contrib/test_transfer_to_npu.py +++ b/test/contrib/test_transfer_to_npu.py @@ -1,3 +1,5 @@ +import json +from unittest.mock import patch, mock_open import torch from torch.nn.parameter import UninitializedTensorMixin from torch.utils.data import TensorDataset @@ -92,7 +94,130 @@ class TestTransferToNpu(TestCase): if method.__name__ == "to": self.assertFalse(hasattr(method, "__self__")) # 替换后torch.Tensor.to变成普通函数,而不是原来的绑定方法 break - + + def test_input_validation(self): + # Test file is a link + with patch('os.path.islink', return_value=True): + self.assertFalse(transfer_to_npu._check_input_file_valid('dummy_path')) + + # Test file does not exist + with patch('os.path.islink', return_value=False), \ + patch('os.path.realpath'), \ + patch('os.path.exists', return_value=False): + self.assertFalse(transfer_to_npu._check_input_file_valid('dummy_path')) + + # Test file not readable + with patch('os.path.islink', return_value=False), \ + 
patch('os.path.realpath'), \ + patch('os.path.exists', return_value=True), \ + patch('os.access', return_value=False): + self.assertFalse(transfer_to_npu._check_input_file_valid('dummy_path')) + + # Test file name too long + with patch('os.path.islink', return_value=False), \ + patch('os.path.realpath'), \ + patch('os.path.exists', return_value=True), \ + patch('os.access', return_value=True), \ + patch('os.path.basename', return_value='a' * 201): + self.assertFalse(transfer_to_npu._check_input_file_valid('dummy_path')) + + # Test file too large + with patch('os.path.islink', return_value=False), \ + patch('os.path.realpath'), \ + patch('os.path.exists', return_value=True), \ + patch('os.access', return_value=True), \ + patch('os.path.basename', return_value='valid_name'), \ + patch('os.path.getsize', return_value=11 * 1024 ** 2): + self.assertFalse(transfer_to_npu._check_input_file_valid('dummy_path')) + + # Test valid file + with patch('os.path.islink', return_value=False), \ + patch('os.path.realpath'), \ + patch('os.path.exists', return_value=True), \ + patch('os.access', return_value=True), \ + patch('os.path.basename', return_value='valid_name'), \ + patch('os.path.getsize', return_value=1024): + self.assertTrue(transfer_to_npu._check_input_file_valid('dummy_path')) + + def test_load_json_file(self): + # Test with invalid file + with patch('torch_npu.contrib.transfer_to_npu._check_input_file_valid', return_value=False): + self.assertEqual(transfer_to_npu._load_json_file('invalid_path'), {}) + + # Test with JSON decode error + with patch('torch_npu.contrib.transfer_to_npu._check_input_file_valid', return_value=True), \ + patch('builtins.open', mock_open(read_data='invalid json')), \ + patch('json.load', side_effect=json.JSONDecodeError('Expecting value', 'doc', 0)): + self.assertEqual(transfer_to_npu._load_json_file('dummy_path'), {}) + + # Test with file content not a dict + with patch('torch_npu.contrib.transfer_to_npu._check_input_file_valid', return_value=True), \ + patch('builtins.open', mock_open(read_data='["not", "a", "dict"]')), \ + patch('json.load', return_value=["not", "a", "dict"]): + self.assertEqual(transfer_to_npu._load_json_file('dummy_path'), {}) + + # Test with valid JSON dict + valid_json_data = '{"key": "value"}' + with patch('torch_npu.contrib.transfer_to_npu._check_input_file_valid', return_value=True), \ + patch('builtins.open', mock_open(read_data=valid_json_data)), \ + patch('json.load', return_value={"key": "value"}): + self.assertEqual(transfer_to_npu._load_json_file('valid_path'), {"key": "value"}) + + def test_wrapper_function(self): + @transfer_to_npu._wrapper_libraries_func + def test_function(): + return torch.cuda.is_available() + + self.assertFalse(test_function()) + + def test_replace_cuda_to_npu_in_dict(self): + input_dict = { + "device": "cuda:0", + "cuda_version": "10.2", + "non_cuda_key": "no replacement needed", + "123": "cuda_core" + } + expected_dict = { + "device": "npu:0", + "npu_version": "10.2", + "non_npu_key": "no replacement needed", + "123": "npu_core" + } + + result_dict = transfer_to_npu._replace_cuda_to_npu_in_dict(input_dict) + self.assertEqual(result_dict, expected_dict) + + def test_wrapper_hccl_args_and_kwargs(self): + @transfer_to_npu._wrapper_hccl + def mock_function(*args, **kwargs): + return args, kwargs + + args_input = ('nccl', 'cpu') + kwargs_input = {'backend': 'nccl', 'device': 'gpu'} + expected_args_output = ('hccl', 'cpu') + expected_kwargs_output = {'backend': 'hccl', 'device': 'gpu'} + + args_output, kwargs_output 
= mock_function(*args_input, **kwargs_input) + + self.assertEqual(args_output, expected_args_output) + self.assertEqual(kwargs_output, expected_kwargs_output) + + def test_wrapper_profiler_experimental_config(self): + @transfer_to_npu._wrapper_profiler + def mock_function(*args, **kwargs): + return kwargs + + wrong_config = 'not_a_valid_config' + correct_config = torch_npu.profiler._ExperimentalConfig(1, 1) + + with patch('logging.warning') as mock_logger: + result = mock_function(experimental_config=wrong_config) + mock_logger.assert_called_once() + self.assertNotIn('experimental_config', result) + + result = mock_function(experimental_config=correct_config) + self.assertIn('experimental_config', result) + self.assertIs(result['experimental_config'], correct_config) if __name__ == "__main__": run_tests() diff --git a/test/custom_ops/test_npu_anti_quant.py b/test/custom_ops/test_npu_anti_quant.py index b9d8590eeee4ef91d8ec778b5d2d2e405a068b54..be4404815493d199da515e7a817b72a164979037 100644 --- a/test/custom_ops/test_npu_anti_quant.py +++ b/test/custom_ops/test_npu_anti_quant.py @@ -1,5 +1,6 @@ import unittest import numpy as np +from ml_dtypes import int4 import torch import torch_npu @@ -8,10 +9,26 @@ from torch_npu.testing.common_utils import create_common_tensor DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] +def unpack_int4(s32arr): + dst_shape = s32arr.numpy().shape + if len(dst_shape) == 0: + dst_shape = (8, ) + else: + dst_shape = (*(dst_shape[:-1]), dst_shape[-1] * 8) + + sa1 = s32arr.numpy().astype(np.int32) + sa2 = sa1.tobytes() + sa3 = np.frombuffer(sa2, dtype=np.uint8) + shift = np.array([0, 4], dtype=np.uint8) + sa4 = np.bitwise_and(sa3.reshape([-1, 1]) >> shift, 0b00001111).astype(int4).astype(np.int8).reshape(dst_shape) + return torch.from_numpy(sa4) + class TestAntiQuant(TestCase): def custom_op_exec(self, input_x, scale, offset, dst_dtype, src_dtype): + if input_x.dtype == torch.int32: + input_x = unpack_int4(input_x) scale = torch.broadcast_to(scale, input_x.shape) if offset is None: offset = torch.zeros_like(scale) @@ -38,8 +55,8 @@ class TestAntiQuant(TestCase): [[np.int8, -1, [10, 100]], [np.float32, -1, [100]], [np.float32, -1, [100]], torch.bfloat16, torch.int8], [[np.int32, -1, [10, 25]], [np.float32, -1, [200]], [np.float32, -1, [200]], torch.float16, None], [[np.int32, -1, [10, 25]], [np.float32, -1, [200]], [np.float32, -1, [200]], torch.bfloat16, None], - [[np.int32, -1, [10, 25]], [np.float32, -1, [200]], [np.float32, -1, [200]], torch.float16, torch.quint4x2], - [[np.int32, -1, [10, 25]], [np.float32, -1, [200]], [np.float32, -1, [200]], torch.bfloat16, torch.quint4x2], + [[np.int32, -1, [10, 25]], [np.float32, -1, [200]], [np.float32, -1, [200]], torch.float16, None], + [[np.int32, -1, [10, 25]], [np.float32, -1, [200]], [np.float32, -1, [200]], torch.bfloat16, None], ] for item in shape_format: diff --git a/test/custom_ops/test_npu_bounding_box_encode.py b/test/custom_ops/test_npu_bounding_box_encode.py index db3af810e3b32ee410f33aab623a1b87e2211dcc..1eda2417868797969cb034652da03ee4d9297fbb 100644 --- a/test/custom_ops/test_npu_bounding_box_encode.py +++ b/test/custom_ops/test_npu_bounding_box_encode.py @@ -1,6 +1,7 @@ import torch import torch_npu +from torch_npu.testing.common_utils import SupportedDevices from torch_npu.testing.testcase import TestCase, run_tests @@ -48,6 +49,7 @@ class TestBoundingBoxEncode(TestCase): output = output.numpy() return output + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_encode_shape_format_fp32(self): 
input1 = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]], dtype=torch.float32).to("npu") @@ -60,6 +62,7 @@ class TestBoundingBoxEncode(TestCase): 0.1, 0.1, 0.2, 0.2) self.assertRtolEqual(npu_output, custom_output, 1e-3) + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_encode_shape_format_fp16(self): input1_fp16 = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]], dtype=torch.float16).to("npu") diff --git a/test/custom_ops/test_npu_conv3d.py b/test/custom_ops/test_npu_conv3d.py index 674e903a18c2d5d2fb1b3f98db0ddb1034522356..a35a55a27ac83960814e69a76c673ff26b981a2e 100644 --- a/test/custom_ops/test_npu_conv3d.py +++ b/test/custom_ops/test_npu_conv3d.py @@ -50,6 +50,8 @@ class TestNpuConv3d(TestCase): self.assertRtolEqual(custom_output, npu_output) def test_npu_conv3d_fp32(self): + torch.npu.config.allow_internal_format = True + torch.npu.set_compile_mode(jit_compile=True) shape_format = [ # input, weigth, bias, stride, padding, dilation, groups [[np.float32, 30, [1, 128, 4, 14, 14]], [np.float32, 30, [1, 128, 3, 3, 3]], None, [1, 1, 1], [1, 1, 1], diff --git a/test/custom_ops/test_npu_fused_attention_score_fwd.py b/test/custom_ops/test_npu_fused_attention_score_fwd.py deleted file mode 100644 index 9fdf30309eaa659883209546c5ad140b8bdf73f5..0000000000000000000000000000000000000000 --- a/test/custom_ops/test_npu_fused_attention_score_fwd.py +++ /dev/null @@ -1,59 +0,0 @@ -# - -import torch - -import torch_npu -from torch_npu.testing.testcase import TestCase, run_tests - - -class TestFusedAttentionScoreFwd(TestCase): - - def supported_op_exec(self, q, k, v, mask, scale, keep_prob): - attention_scores = torch.matmul(q, k.transpose(-1, -2)) - attn_scores = mask + attention_scores * scale - attn_probss = torch.nn.functional.softmax(attn_scores, dim=-1) - drop_p = 1 - keep_prob - drop = torch_npu.contrib.module.DropoutWithByteMask(p=drop_p).npu() - attn_probs = drop(attn_probss) - context_layer = torch.matmul(attn_probs, v) - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = (q.shape[0] * q.shape[2], - q.shape[1] * q.shape[3]) - context_layer = context_layer.view(new_context_layer_shape) - - # drop_mask[i] = 1 for inferencing - drop_mask = torch.ones(q.shape[0] * q.shape[1] * q.shape[2] * q.shape[2]) - drop_mask = drop_mask.to(torch.uint8) - - return context_layer.cpu(), attn_probss.cpu(), drop_mask.cpu() - - def custom_op_exec(self, q, k, v, mask, scale, keep_prob): - attention_score, softmax_output, drop_mask = torch_npu.npu_fused_attention_score_fwd(q, k, v, mask, scale, - keep_prob) - return attention_score.cpu(), softmax_output.cpu(), drop_mask.cpu() - - def test_npu_fused_attention_score_fwd(self): - q = torch.rand(24, 16, 512, 64).uniform_(-3, 3).half().npu() - k = torch.rand(24, 16, 512, 64).uniform_(-3, 3).half().npu() - v = torch.rand(24, 16, 512, 64).uniform_(-3, 3).half().npu() - mask = torch.ones(512) * -10000. - mask[:6] = -0. 
- mask = mask.expand(24, 1, 512, 512).half().npu() - scale = 0.125 - # keep_prob = 1 for inferencing - keep_prob = 1 - - supported_attention_score, supported_softmax_output, supported_drop_mask = self.supported_op_exec(q, k, v, - mask, - scale, - keep_prob) - custom_attention_score, custom_softmax_output, custom_drop_mask = self.custom_op_exec(q, k, v, mask, scale, - keep_prob) - - self.assertRtolEqual(supported_attention_score, custom_attention_score, prec16=0.006) - self.assertRtolEqual(supported_softmax_output, custom_softmax_output) - self.assertRtolEqual(supported_drop_mask, custom_drop_mask) - - -if __name__ == '__main__': - run_tests() diff --git a/test/custom_ops/test_npu_fusion_attention.py b/test/custom_ops/test_npu_fusion_attention.py index de017c35ff380bbcaae839e832042b78f6056bce..5396bd1d34d0110adb9ecf5dfb5f697136888d9b 100644 --- a/test/custom_ops/test_npu_fusion_attention.py +++ b/test/custom_ops/test_npu_fusion_attention.py @@ -12,7 +12,7 @@ class TestNPUFlashAttention(TestCase): scale = 0.08838 qk = torch.matmul(query, key.transpose(2, 3)).mul(scale) qk = qk + atten_mask * (-10000.0) - softmax_res = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32).to(torch.float16) + softmax_res = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32) attention_out = torch.matmul(softmax_res, value) return attention_out @@ -62,7 +62,7 @@ class TestNPUFlashAttention(TestCase): # sparse_params = [sparse_mode, pre_tokens, next_tokens] def check_result(self, query, key, value, sparse_params): atten_mask = self.get_atten_mask(sparse_params[0], sparse_params[1], sparse_params[2]) - output = self.supported_op_exec(query, key, value, atten_mask) + output = self.supported_op_exec(query.float(), key.float(), value.float(), atten_mask.float()).to(torch.float16) fa_result = self.custom_op_exec(query.npu(), key.npu(), value.npu(), sparse_params) self.assertRtolEqual(output, fa_result[0], prec=0.01, prec16=0.01) diff --git a/test/custom_ops/test_npu_ifmr.py b/test/custom_ops/test_npu_ifmr.py index ec9a65ddbabb13070347fb78c389b98a88f2dfaf..ee8c84448fdc91dcf0991873126756513c5fea08 100644 --- a/test/custom_ops/test_npu_ifmr.py +++ b/test/custom_ops/test_npu_ifmr.py @@ -3,7 +3,7 @@ import torch import torch_npu from torch_npu.testing.testcase import TestCase, run_tests -from torch_npu.testing.common_utils import create_common_tensor +from torch_npu.testing.common_utils import create_common_tensor, SupportedDevices class TestIFMR(TestCase): @@ -54,6 +54,7 @@ class TestIFMR(TestCase): search_start, search_end, search_step, with_offset) return scale.cpu().detach(), offset.cpu().detach() + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_npu_ifmr(self, device="npu"): item = [np.float32, 0, (2, 2, 4, 4)] _, npu_input = create_common_tensor(item, -1, 1) diff --git a/test/custom_ops/test_npu_stride_add.py b/test/custom_ops/test_npu_stride_add.py index 9308e6bf3ba6dbdfee932153848df0c191fa7842..5e3133e1f0cb4016e64514cd36df5e48bb099bc9 100644 --- a/test/custom_ops/test_npu_stride_add.py +++ b/test/custom_ops/test_npu_stride_add.py @@ -1,6 +1,7 @@ import torch import numpy as np import torch_npu +from torch_npu.testing.common_utils import SupportedDevices from torch_npu.testing.testcase import TestCase, run_tests from torch_npu.testing.decorator import Dtypes, instantiate_tests @@ -30,6 +31,7 @@ class TestNpuStrideAdd(TestCase): output = output.numpy() return output + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_StrideAdd(self): input1 = torch.tensor([[[[[1.]]]]]).npu() input2 = 
input1 diff --git a/test/deprecated_apis.json b/test/deprecated_apis.json new file mode 100644 index 0000000000000000000000000000000000000000..293a11ab562aa0a04ed23d07f129101f89b6ceb1 --- /dev/null +++ b/test/deprecated_apis.json @@ -0,0 +1,59 @@ +{ + "torch_npu": [ + "npu_broadcast", + "npu_conv2d", + "npu_conv_transpose2d", + "npu_convolution", + "npu_convolution_transpose", + "npu_dtype_cast", + "npu_gru", + "npu_layer_norm_eval", + "npu_min", + "npu_mish", + "npu_ptiou", + "npu_reshape", + "npu_silu", + "npu_sort_v2" + ], + "torch_npu.contrib": [ + "BiLSTM", + "DCNv2", + "FusedColorJitter", + "Mish", + "SiLU", + "Swish", + "npu_ciou", + "npu_diou", + "npu_giou", + "npu_iou", + "npu_ptiou" + ], + "torch_npu.contrib.module": [ + "BiLSTM", + "DCNv2", + "FusedColorJitter", + "Mish", + "SiLU", + "Swish" + ], + "torch_npu.contrib.function": [ + "npu_ciou", + "npu_diou", + "npu_giou", + "npu_iou", + "npu_ptiou" + ], + "torch_npu.contrib.function.iou": [ + "npu_ciou", + "npu_diou", + "npu_giou", + "npu_iou", + "npu_ptiou" + ], + "torch_npu.contrib.module.deform_conv": [ + "DCNv2" + ], + "torch_npu.contrib.module.fusedcolorjitter": [ + "FusedColorJitter" + ] +} diff --git a/test/distributed/_tensor/test_dtensor.py b/test/distributed/_tensor/test_dtensor.py index cbfee4b8d19746054f51b3f1cb15bb421d85c9db..2cb3455c9ab23073f975f705c8598f7a0cebf163 100644 --- a/test/distributed/_tensor/test_dtensor.py +++ b/test/distributed/_tensor/test_dtensor.py @@ -404,6 +404,16 @@ class DTensorTest(DTensorTestBase): reloaded_st = torch.load(buffer) self.assertEqual(sharded_tensor, reloaded_st) + @skipIfUnsupportMultiNPU(4) + @with_comms + def test_dtensor_to_copy(self): + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + shard_spec = [Shard(0)] + tensor = torch.randn(4, 2).npu() + dtensor = distribute_tensor(tensor, device_mesh, shard_spec) + dtensor_cpu = dtensor.cpu() + self.assertEqual(dtensor_cpu._local_tensor, dtensor._local_tensor) + class DTensorMeshTest(DTensorTestBase): @property diff --git a/test/distributed/test_flight_recorder.py b/test/distributed/test_flight_recorder.py new file mode 100644 index 0000000000000000000000000000000000000000..5e0b57873c149571bb30cfd1bc94a854d745e202 --- /dev/null +++ b/test/distributed/test_flight_recorder.py @@ -0,0 +1,989 @@ +import os +import json +import pickle +import sys +import tempfile +import threading +import time +from datetime import datetime, timedelta +from unittest import mock, skipIf + +import torch +import torch.distributed as c10d +import torch.distributed as dist +from torch.testing._internal.common_distributed import MultiProcessTestCase, TEST_SKIPS +from torch.testing._internal.common_utils import instantiate_parametrized_tests, parametrize, run_tests + +import torch_npu + + +class HCCLTraceTestBase(MultiProcessTestCase): + def setUp(self): + super().setUp() + os.environ[ + "TORCH_HCCL_ENABLE_TIMING" + ] = "0" # see 'timing_enabled' parametrized tests + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "1000" + os.environ["TORCH_HCCL_DUMP_ON_TIMEOUT"] = "1" + self.tempdir = tempfile.TemporaryDirectory() + os.environ["TORCH_HCCL_DEBUG_INFO_TEMP_FILE"] = self._trace_basename() + os.environ["TORCH_HCCL_DEBUG_INFO_PIPE_FILE"] = self._trace_basename() + self._spawn_processes() + + @classmethod + def _run( + cls, + parent_conn, + rank: int, + test_name: str, + file_name: str, + parent_pipe, + **kwargs, + ) -> None: + cls.parent = parent_conn + super()._run(rank, test_name, file_name, parent_pipe) + + @property + def local_device(self): 
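+        # each test rank maps to a single NPU index (rank i -> device i); see rank_to_NPU below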
+ return torch.device("npu", self.rank_to_NPU[self.rank][0]) + + def _join_processes(self, fn): + # We need to patch sys.exit() as skip_if will use sys.exit() and + # the exit code from the this process will not be catched. + with mock.patch("sys.exit") as exit_mock: + fn() + super()._join_processes(fn) + + def _spawn_processes(self) -> None: + proc = torch.multiprocessing.get_context("spawn").Process + self.children_pipes = [] + parent_pipes = [] + for i in range(self.world_size): + parent_conn, child_conn = torch.multiprocessing.Pipe() + self.children_pipes.append(child_conn) + parent_pipes.append(parent_conn) + piter = iter(parent_pipes) + + def wrap(*positional, args, **kwargs): + args = (next(piter), *args) + return proc(*positional, args=args, **kwargs) + + self._start_processes(wrap) + + def _create_process_group_hccl(self): + store = dist.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "hccl", world_size=self.world_size, rank=self.rank, store=store + ) + pg = c10d.distributed_c10d._get_default_group() + return pg + + def tearDown(self): + super().tearDown() + try: + os.remove(self.file_name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + @property + def rank_to_NPU(self): + # return rank to NPU map + return {i: [i] for i in range(self.world_size)} + + def _trace_basename(self): + # we pass the base to the env, and the dump util will append rank + return os.path.join(self.tempdir.name, "trace_") + + def _trace_name(self, rank): + return self._trace_basename() + str(rank) + + def started_or_scheduled(self, timing_enabled): + return "started" if timing_enabled else "scheduled" + + +class HCCLTraceTest(HCCLTraceTestBase): + def _verify_trace(self, t, include_collectives, timing_enabled, is_json): + ver = t["version"] + self.assertEqual(ver, "2.4") + pg_config = t["pg_config"] + self.assertEqual(len(pg_config), 1) + default_pg_info = pg_config["group_name_0"] + self.assertIn("name", default_pg_info) + self.assertIn("desc", default_pg_info) + self.assertIn("ranks", default_pg_info) + pg_status = t["pg_status"] + self.assertEqual(len(pg_status), 1) + self.assertEqual(str(pg_status["0"]["last_enqueued_collective"]), "2") + self.assertEqual(str(pg_status["0"]["last_completed_collective"]), "2") + self.assertEqual( + str(pg_status["0"]["last_started_collective"]), + "2" if timing_enabled else "-1", + ) + global_ranks = pg_config["group_name_0"]["ranks"] + self.assertEqual(len(json.loads(global_ranks)), self.world_size) + if include_collectives: + self.assertEqual(len(t["entries"]), 2) + t = t["entries"] + last = t[-1] + self.assertEqual(last["process_group"], ("group_name_0", "")) + self.assertEqual(last["state"], "completed") + s = last["time_discovered_started_ns"] + f = last["time_discovered_completed_ns"] + self.assertEqual(last["record_id"], 1) + self.assertIsNotNone(f) + if timing_enabled: + self.assertIsNotNone(s) + self.assertTrue(s <= f) + # we don't collect stack traces in JSON at the moment + if not is_json: + self.assertIn("test_flight_recorder.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + self.assertEqual(last["collective_seq_id"], 2) + # HCCL_EXEC_TIMEOUT will impact watchdog timeout + self.assertEqual(last["timeout_ms"], 3636000) + now = datetime.now() + event_created_time = datetime.fromtimestamp( + last["time_created_ns"] / 1000000000 
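+                # time_created_ns is in nanoseconds; convert to seconds for fromtimestamp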
+ ) + before_test = now - timedelta(minutes=1) + self.assertTrue(before_test < event_created_time < now) + if timing_enabled: + # very loose bounds, measured 0.036 ms on devnpu + self.assertTrue(0 < last["duration_ms"] < 100) + else: + self.assertTrue("duration_ms" not in last) + else: + self.assertTrue("entries" not in t) + + @parametrize("timing_enabled", [False]) + @parametrize("include_collectives", [True, False]) + def test_short_json(self, timing_enabled, include_collectives): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for i in range(2): + f = pg.allreduce(a) + f.wait() + torch.npu.synchronize(device=device) + # gah ok so now the duration_ms is populated best-effort since it can only happen outside "dump()" api + time.sleep(1) + t = json.loads( + torch_npu._C._distributed_c10d._dump_hccl_trace_json( + includeCollectives=include_collectives + ) + ) + self._verify_trace(t, include_collectives, timing_enabled, True) + dist.destroy_process_group() + + @parametrize("timing_enabled", [False]) + @parametrize("include_collectives", [True, False]) + def test_short_pickle(self, timing_enabled, include_collectives): + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for i in range(2): + f = pg.allreduce(a) + f.wait() + torch.npu.synchronize(device=device) + # gah ok so now the duration_ms is populated best-effort since it can only happen outside "dump()" api + time.sleep(1) + t = pickle.loads( + torch_npu._C._distributed_c10d._dump_hccl_trace( + includeCollectives=include_collectives + ) + ) + self._verify_trace( + t, + include_collectives=include_collectives, + timing_enabled=timing_enabled, + is_json=True, + ) + dist.destroy_process_group() + + def test_dump_pipe(self): + if self.rank != self.MAIN_PROCESS_RANK: + # now we need start heartbeatmonitor thread manually + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + # makesure dump_pipe not heartbeat dump + os.unsetenv("TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC") + + def open_file_with_timeout(file_path, mode, timeout=1.0): + start_time = time.time() + while time.time() - start_time < timeout: + if os.path.exists(file_path): + return open(file_path, mode) + time.sleep(0.1) + raise FileNotFoundError + + if self.rank == self.MAIN_PROCESS_RANK: + for c in self.children_pipes: + self.assertEqual(c.recv(), "next") + + dump_file = self._trace_name(rank=0) + pipe_file = dump_file + ".pipe" + with open_file_with_timeout(pipe_file, "w") as f: + f.write("1\n") + with open_file_with_timeout(dump_file, "rb", timeout=10.0) as f: + # does not support profiling, so we use test_dump_pipe instead of all_reduce + self.assertTrue("test_dump_pipe" in str(pickle.load(f))) + + for c in self.children_pipes: + c.send("next") + return + + pg = self._create_process_group_hccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + f = pg.allreduce(a) + f.wait() + torch.npu.synchronize(device=device) + self.parent.send("next") + self.parent.recv() + + def test_long(self): + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + device = self.local_device + a = 
torch.full((3, 4), float(self.rank), device=device) + for _ in range(2): + # test some other primitives to make sure + # their strings are valid + xs = [torch.ones(3, 4, device=device)] + pg.broadcast(xs).wait() + pg.allreduce(xs).wait() + pg.reduce(xs).wait() + ys = [[torch.empty(3, 4, device=device) for _ in range(self.world_size)]] + pg.allgather(ys, xs).wait() + pg.reduce_scatter(xs, ys).wait() + f = pg.allreduce(a) + f.wait() + torch.npu.synchronize(device=device) + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 10) + first = t[0] + last = t[-1] + # profiling is not supported + self.assertEqual(last["profiling_name"], "") + self.assertEqual(last["state"], "completed") + self.assertIn("test_flight_recorder.py", str(last["frames"])) + self.assertEqual(last["input_sizes"], ((3, 4),)) + self.assertEqual(last["input_dtypes"], ["Float"]) + self.assertEqual(last["output_sizes"], ((3, 4),)) + self.assertEqual(last["output_dtypes"], ["Float"]) + # timeout_ms adapt to npu + self.assertEqual(last["timeout_ms"], 3636000) + self.assertEqual(last["collective_seq_id"] - first["collective_seq_id"], 9) + dist.destroy_process_group() + + @skipIf(True, "profiling is not supported") + def test_barrier_profiling(self): + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + device = self.local_device + a = torch.full((3, 4), float(self.rank), device=device) + f = pg.barrier() + f = pg.allreduce(a) + f.wait() + torch.npu.synchronize(device=device) + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 2) + first = t[0] + last = t[-1] + self.assertEqual(first["profiling_name"], "hccl:all_reduce_barrier") + self.assertEqual(last["profiling_name"], "hccl:all_reduce") + dist.destroy_process_group() + + def test_trace_while_all_works_retired(self): + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "10" + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + device = self.local_device + # send more works than the buffer size to overwrite the previous entry + for _ in range(12): + a = [torch.ones(3, 4, device=device)] + pg.broadcast(a).wait() + torch.npu.synchronize(device=device) + + # wait for all works to be retired, we use sleep instead of pg._wait_for_pending_works() + time.sleep(30) + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + t = t["entries"] + self.assertEqual(len(t), 10) + last = t[-1] + self.assertEqual(last["retired"], True) + self.assertEqual(last["state"], "completed") + + # timing_enabled is not supported + @parametrize("timing_enabled", [False]) + @parametrize("only_active", [True, False]) + def test_trace_while_active(self, timing_enabled, only_active): + if self.rank == self.MAIN_PROCESS_RANK: + for c in self.children_pipes: + self.assertEqual(c.recv(), "next") + for c in self.children_pipes: + c.send("next") + return + + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + e = torch.npu.Event() + e.record() + if self.rank != 0: + pg.allreduce(a).wait() + e.synchronize() + t = pickle.loads( + torch_npu._C._distributed_c10d._dump_hccl_trace(onlyActive=only_active) + ) + t = t["entries"] + if only_active: + if self.rank == 0: + 
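+                    # rank 0's single collective has already completed, so the active-only dump holds no entries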
self.assertEqual(len(t), 0) + else: + self.assertEqual(len(t), 1) + if not only_active: + if self.rank == 0: + self.assertEqual(t[-1]["profiling_name"], "") + self.assertEqual(t[-1]["collective_seq_id"], 1) + self.assertEqual(t[-1]["state"], "completed") + else: + self.assertEqual(t[-1]["profiling_name"], "") + self.assertEqual(t[-1]["collective_seq_id"], 2) + self.assertEqual( + t[-1]["state"], self.started_or_scheduled(timing_enabled) + ) + + self.parent.send("next") + self.assertEqual("next", self.parent.recv()) + if self.rank == 0: + pg.allreduce(a).wait() + torch.npu.synchronize(device=device) + + @parametrize("timing_enabled", [False]) + def test_trace_while_stuck(self, timing_enabled): + if self.rank == self.MAIN_PROCESS_RANK: + for c in self.children_pipes: + self.assertEqual(c.recv(), "next") + for c in self.children_pipes: + c.send("next") + return + + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + e = torch.npu.Event() + e.record() + + def gather_trace(): + e.synchronize() + # give the other thread some time to fill the npu buffer + time.sleep(5) + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + t = t["entries"] + self.assertEqual(t[-1]["profiling_name"], "") + if self.rank == 0: + self.assertEqual(t[-1]["collective_seq_id"], 1) + self.assertEqual(t[-1]["state"], "completed") + else: + self.assertEqual(t[-1]["collective_seq_id"], 2) + self.assertEqual( + t[-1]["state"], self.started_or_scheduled(timing_enabled) + ) + self.assertIsNone(t[-1]["time_discovered_completed_ns"]) + # this will eventually cause the missing rank 0 + # to continue which will unblock the non-zero ranks + self.parent.send("next") + + if self.rank != 0: + pg.allreduce(a).wait() + th = threading.Thread(target=gather_trace) + th.start() + # fill the npu buffer, at around 1024 events + # this will stall + for _ in range(2000): + a = a + a + th.join() + else: + gather_trace() + + self.assertEqual("next", self.parent.recv()) + if self.rank == 0: + pg.allreduce(a).wait() + torch.npu.synchronize(device=device) + + @skipIf(True, "send_recv is not supported") + @parametrize( + "op_sizes_per_coalesce", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [True, False]) + def test_batched_send_recv(self, op_sizes_per_coalesce, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's npu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + + num_coalesced_ops = 20 + ops_per_coalesce = len(op_sizes_per_coalesce) + for _ in range(num_coalesced_ops): + ops = [] + for input_sizes in op_sizes_per_coalesce: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + ops.append(dist.P2POp(dist.irecv, tensor, 1)) + elif self.rank == 1: + tensor *= 2 + ops.append(dist.P2POp(dist.isend, tensor, 0)) + + dist.batch_isend_irecv(ops).pop().wait() + + torch.npu.synchronize(device=self.local_device) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + self.assertEqual(len(t["entries"]), num_coalesced_ops * (ops_per_coalesce + 1)) + + 
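+        # walk the trace: each coalesce group records its individual p2p ops followed by one "hccl:coalesced" summary entry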
expected_record_id = 0 + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_coalesced_ops): + first_op = seq * (ops_per_coalesce + 1) + coalesced_op = first_op + ops_per_coalesce + for p2p_op_idx, input_sizes in zip( + range(first_op, coalesced_op, 1), op_sizes_per_coalesce + ): + # the indivudal ops inside the coalescing group the individual op metadata, + # but not the timing info coming from the actual coalesced kernel + profiling_name = ( + "hccl:recv 0<-1" if self.rank == 0 else "hccl:send 1->0" + ) + self.assertEqual( + t["entries"][p2p_op_idx]["record_id"], expected_record_id + ) + expected_record_id += 1 + self.assertEqual( + t["entries"][p2p_op_idx]["profiling_name"], profiling_name + ) + # we don't increment collective_seq_id for p2p ops. + self.assertEqual(t["entries"][p2p_op_idx]["collective_seq_id"], 0) + self.assertEqual(t["entries"][p2p_op_idx]["p2p_seq_id"], expected_seq) + self.assertEqual(t["entries"][p2p_op_idx]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][p2p_op_idx]["input_sizes"], [input_sizes]) + self.assertEqual( + t["entries"][p2p_op_idx]["output_sizes"], [input_sizes] + ) + # duration doesn't get tagged onto individual ops yet, nor is their state updated + self.assertEqual(t["entries"][p2p_op_idx]["state"], "scheduled") + self.assertTrue("duration_ms" not in t["entries"][p2p_op_idx]) + + # the coalesced op has no metadata but indicates that coalescing was used, + # and accurately reflects the timing and state info for the whole group + self.assertEqual( + t["entries"][coalesced_op]["record_id"], expected_record_id + ) + expected_record_id += 1 + self.assertEqual( + t["entries"][coalesced_op]["profiling_name"], "hccl:coalesced" + ) + self.assertEqual(t["entries"][coalesced_op]["p2p_seq_id"], expected_seq) + expected_seq += 1 + self.assertEqual(t["entries"][coalesced_op]["state"], "completed") + self.assertEqual(t["entries"][coalesced_op]["input_sizes"], []) + self.assertEqual(t["entries"][coalesced_op]["output_sizes"], []) + if timing_enabled: + duration = t["entries"][coalesced_op]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][coalesced_op]) + self.assertEqual(t["entries"][coalesced_op]["timeout_ms"], 600000) + + @skipIf(True, "send_recv is not supported") + @parametrize( + "op_sizes", + [ + [(2, 3)], + [(2, 3), (5, 5), (1,)], + ], + ) + @parametrize("timing_enabled", [True, False]) + def test_individual_send_recv(self, op_sizes, timing_enabled): + """ + 'WorkEnqueue' was skipped for isendirecv, leading to segfault on dump_entries when update_state tried to use + a destructed Work obj's npu events + """ + + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + num_repeats = 10 + ops_per_repeat = len(op_sizes) + for _ in range(num_repeats): + for input_sizes in op_sizes: + tensor = torch.zeros(input_sizes).to(self.local_device) + if self.rank == 0: + dist.recv(tensor, 1) + elif self.rank == 1: + tensor *= 2 + dist.send(tensor, 0) + + torch.npu.synchronize(device=self.local_device) + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + self.assertEqual(len(t["entries"]), num_repeats * (ops_per_repeat)) + expected_seq = 1 + expected_op_id = 1 + for seq in range(num_repeats * ops_per_repeat): + input_sizes = op_sizes[seq % ops_per_repeat] 
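+            # entries are recorded in issue order: one trace entry per individual send/recv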
+ profiling_name = "hccl:recv 0<-1" if self.rank == 0 else "hccl:send 1->0" + self.assertEqual(t["entries"][seq]["profiling_name"], profiling_name) + # we don't increment collective_seq_id for p2p ops. + self.assertEqual(t["entries"][seq]["collective_seq_id"], 0) + self.assertEqual(t["entries"][seq]["p2p_seq_id"], expected_seq) + expected_seq += 1 + self.assertEqual(t["entries"][seq]["op_id"], expected_op_id) + expected_op_id += 1 + self.assertEqual(t["entries"][seq]["input_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["output_sizes"], [input_sizes]) + self.assertEqual(t["entries"][seq]["state"], "completed") + + if timing_enabled: + duration = t["entries"][seq]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][seq]) + + @skipIf(True, "coalescing_manager is not supported") + @parametrize("timing_enabled", [True, False]) + def test_coalescing_manager_collective(self, timing_enabled): + """ + The coalescing manager api works by accumulating operations in python via a contextmanager, and then making + one call into c++ to an _coalesced API. It has limited support for ops and has been added recently to + avoid overheads of making individual py-cpp calls. This complicates flight recording.. + + For now, flight recording of coalescing_manager collectives is less detailed than cpp coalesced collectives. + """ + if self.rank == self.MAIN_PROCESS_RANK: + return + pg = self._create_process_group_hccl() + if timing_enabled: + pg._enable_collectives_timing() + + output_tensors = torch.zeros(2, 2).to(self.rank) + input_tensors = [torch.ones(2, 2).to(self.rank) for _ in range(self.world_size)] + + # TODO(whc) make this work with bigger world or something + self.assertEqual(self.world_size, 2, self.world_size) + + with dist._coalescing_manager(): + for i in range(self.world_size): + dist.reduce_scatter_tensor(output_tensors[i], input_tensors[i]) + self.assertEqual(output_tensors, input_tensors[self.rank] * self.world_size) + + torch.npu.synchronize(device=self.rank) + + if timing_enabled: + # wait for watchdog thread to process the queue of works + time.sleep(1) + + t = pickle.loads(torch_npu._C._distributed_c10d._dump_hccl_trace()) + + self.assertEqual( + len(t["entries"]), 1 + ) # one for the reduce_scatter_tensor_coalesced + self.assertEqual( + t["entries"][0]["profiling_name"], "hccl:reduce_scatter_tensor_coalesced" + ) + # collective_seq_id should be incremented once. 
+ self.assertEqual(t["entries"][0]["collective_seq_id"], 1) + self.assertEqual(t["entries"][0]["input_sizes"], [[2, 2], [2, 2]]) + self.assertEqual( + t["entries"][0]["output_sizes"], + [ + [ + 2, + ], + [ + 2, + ], + ], + ) + self.assertEqual(t["entries"][0]["state"], "completed") + if timing_enabled: + duration = t["entries"][0]["duration_ms"] + self.assertTrue(0.001 < duration < 10000, duration) + else: + self.assertTrue("duration_ms" not in t["entries"][0]) + + +def check_if_test_is_skipped(fn): + def wrapper(self, *args, **kwargs): + for skip in TEST_SKIPS.values(): + if self.processes[0].exitcode == skip.exit_code: + return MultiProcessTestCase._check_return_codes(self, *args, **kwargs) + return fn(self, *args, **kwargs) + + return wrapper + + +class HCCLTraceTestDumpOnTimeoutBase(HCCLTraceTestBase): + timeout_sec = 60 + + def _create_process_group_hccl(self): + store = dist.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "hccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timedelta(seconds=HCCLTraceTestDumpOnTimeoutBase.timeout_sec), + ) + pg = c10d.distributed_c10d._get_default_group() + return pg + + @check_if_test_is_skipped + def _check_return_codes(self, elapsed_time): + # the base test infra assumes processes exit with matching return codes, + # but we want rank0 to abort and rank1 to exit cleanly in this test + self.assertEqual(self.processes[0].exitcode, -6) + self.assertEqual(self.processes[1].exitcode, 0) + + def _wait_process(self, rank, timeout): + try: + self.processes[rank].join(timeout) + return self.processes[rank].exitcode + except TimeoutError: + return None + + +class HCCLTraceTestDumpOnTimeout(HCCLTraceTestDumpOnTimeoutBase): + @parametrize("timing_enabled", [False]) + def test_timeout_dumps(self, timing_enabled): + if self.rank != self.MAIN_PROCESS_RANK: + # dump on heartbeatmonitor thread + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + # need rank0 to crash before looking for its output file + os.environ["TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"] = "60" + + if self.rank == self.MAIN_PROCESS_RANK: + # wait for rank0 to crash before looking for its output file + # we rely on rank0 holding off its abort long enough to dump the debug info + self.assertEqual(self._wait_process(0, timeout=180), -6) + with open(self._trace_name(rank=0), "rb") as f: + t = pickle.load(f) + t = t["entries"] + self.assertEqual(len(t), 2) + self.assertEqual(t[0]["collective_seq_id"], 1) + self.assertEqual(t[0]["state"], "completed") + self.assertEqual(t[1]["collective_seq_id"], 2) + self.assertEqual( + t[1]["state"], self.started_or_scheduled(timing_enabled) + ) + + self.assertFalse(os.path.exists(self._trace_name(rank=1))) + + return + + pg = self._create_process_group_hccl() + if timing_enabled: + # we force disabled timing in setup, since there is no 'disable' function + pg._enable_collectives_timing() + + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + if self.rank == 0: + pg.allreduce(a).wait() + + # rank 0 will crash before it passes the sync, but rank1 will exit quickly and cleanly + torch.npu.synchronize(device=device) + + +instantiate_parametrized_tests(HCCLTraceTestDumpOnTimeout) +instantiate_parametrized_tests(HCCLTraceTest) + + +class HCCLTraceTestTimeoutDumpOnStuckRanks(HCCLTraceTestDumpOnTimeoutBase): + @check_if_test_is_skipped + def _check_return_codes(self, elapsed_time): + # the base test infra assumes processes exit with 
matching return codes, + # but we want rank0 to abort and rank1 to exit cleanly in this test + self.assertEqual(self.processes[0].exitcode, -6) + self.assertEqual(self.processes[1].exitcode, 0) + + def test_timeout_dumps_on_stuck_ranks(self): + if self.rank != self.MAIN_PROCESS_RANK: + # now we need start heartbeatmonitor thread manually + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + # need rank0 to crash quicker after detecting timeout + os.environ["TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"] = "60" + # restore this env var to its prior default in case another test changed it + os.environ["TORCH_HCCL_COORD_CHECK_MILSEC"] = "1000" + + if self.rank == self.MAIN_PROCESS_RANK: + # wait for both rank0 and 1 to crash before looking for both ranks' output + # file, and we rely on rank1 to sleep long enough to dump the debug info. + self.assertEqual(self._wait_process(0, timeout=180), -6) + self.assertEqual(self._wait_process(1, timeout=180), 0) + self.assertTrue(os.path.exists(self._trace_name(rank=1))) + self.assertTrue(os.path.exists(self._trace_name(rank=0))) + with open(self._trace_name(rank=0), "rb") as f: + t = pickle.load(f) + t = t["entries"] + self.assertEqual(len(t), 2) + with open(self._trace_name(rank=1), "rb") as f: + t = pickle.load(f) + t = t["entries"] + self.assertEqual(len(t), 1) + self.assertEqual(t[0]["collective_seq_id"], 1) + self.assertEqual(t[0]["state"], "completed") + return + + pg = self._create_process_group_hccl() + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + if self.rank == 0: + pg.allreduce(a).wait() + + # rank 0 will get stuck, timeout and then signal a timeout to all ranks. + torch.npu.synchronize(device=device) + + if self.rank == 1: + # Force rank 1 to sleep 120s so that it will eventually exit as well after + # getting the global signal to dump the debugging info(won't break). 
+ time.sleep(120) + + +class HcclErrorDumpTest(HCCLTraceTestBase): + def _wait_process(self, rank, timeout): + try: + self.processes[rank].join(timeout) + return self.processes[rank].exitcode + except TimeoutError: + return None + + @check_if_test_is_skipped + def _check_return_codes(self, elapsed_time): + # the base test infra assumes processes exit with matching return codes, + # but we want rank0 to abort with exception and rank1 to exit with exit 1 + self.assertEqual(self.processes[0].exitcode, -6) + self.assertEqual(self.processes[1].exitcode, 1) + + def test_hccl_errors_dump(self): + if self.rank != self.MAIN_PROCESS_RANK: + # now we need start heartbeatmonitor thread manually + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + os.environ["TORCH_HCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "1000" + os.environ["TORCH_HCCL_DUMP_ON_TIMEOUT"] = "1" + # need rank0 to dump before abort and we update it to 30 to avoid heratbeat dump + os.environ["TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"] = "30" + + if self.rank == self.MAIN_PROCESS_RANK: + # wait for both rank0 and 1 to crash before looking for dump + self.assertEqual(self._wait_process(0, timeout=90), -6) + self.assertEqual(self._wait_process(1, timeout=90), 1) + # verify that the trace file exists for rank0 + self.assertTrue(os.path.exists(self._trace_name(rank=0))) + return + + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + "hccl", + world_size=self.world_size, + rank=self.rank, + store=store, + timeout=timedelta(seconds=10), + ) + process_group = c10d.distributed_c10d._get_default_group() + process_group.allreduce(torch.rand(10).npu(self.rank)) + if self.rank == 0: + work = process_group.allreduce(torch.rand(10).npu(self.rank)) + # expect an error to be raised + with self.assertRaisesRegex(dist.DistBackendError, ""): + # Block the current stream on the HCCL stream + work.wait() + # Run some NPU operations + a = torch.rand(10).npu(self.rank) + elif self.rank == 1: + # Clean up structures (ex: files for FileStore before going down) + del process_group + sys.exit(1) + + +class HcclHeartbeatDumpTest(HCCLTraceTestBase): + def _wait_process(self, rank, timeout): + try: + self.processes[rank].join(timeout) + return self.processes[rank].exitcode + except TimeoutError: + return None + + def test_hccl_heartbeat_dump(self): + if self.rank != self.MAIN_PROCESS_RANK: + # dump on heartbeatmonitor thread + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + os.environ["TORCH_HCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["TORCH_HCCL_TRACE_BUFFER_SIZE"] = "1000" + os.environ["TORCH_HCCL_DUMP_ON_TIMEOUT"] = "1" + # need rank0 to dump + os.environ["TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"] = "10" + + if self.rank == self.MAIN_PROCESS_RANK: + # wait for both rank0 and 1 to finish + self.assertEqual(self._wait_process(0, timeout=90), 0) + self.assertEqual(self._wait_process(1, timeout=90), 0) + # verify that the trace file exists for rank0 + self.assertTrue(os.path.exists(self._trace_name(rank=0))) + with open(self._trace_name(rank=0) + "_py_traceback", "r") as f: + self.assertTrue("time.sleep(30)" in str(f.readlines())) + # verify that the trace file not exists for rank1 + self.assertFalse(os.path.exists(self._trace_name(rank=1))) + return + + pg = self._create_process_group_hccl() + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + if self.rank == 0: + # sleep for heartbeat dump + time.sleep(30) + + 
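+            # on rank 0 this allreduce only runs after the 30s sleep, during which the heartbeat monitor dumps its trace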
pg.allreduce(a).wait() + + torch.npu.synchronize(device=device) + + +class HCCLTraceTestDumpOnHcclTimeout(HCCLTraceTestBase): + def setUp(self): + os.environ["HCCL_EXEC_TIMEOUT"] = "60" + os.environ["HCCL_EVENT_TIMEOUT"] = "90" + super().setUp() + + def tearDown(self): + # unset env to avoid impact watchdog timeout + os.unsetenv('HCCL_EXEC_TIMEOUT') + os.unsetenv('HCCL_EVENT_TIMEOUT') + super().tearDown() + + @check_if_test_is_skipped + def _check_return_codes(self, elapsed_time): + # the base test infra assumes processes exit with matching return codes, + # but we want rank0 to hccl exec timeout and rank1 to exit cleanly in this test + self.assertEqual(self.processes[0].exitcode, 10) + self.assertEqual(self.processes[1].exitcode, 0) + + def _wait_process(self, rank, timeout): + try: + self.processes[rank].join(timeout) + return self.processes[rank].exitcode + except TimeoutError: + return None + + @parametrize("timing_enabled", [False]) + def test_hccl_timeout_dumps(self, timing_enabled): + if self.rank != self.MAIN_PROCESS_RANK: + # dump on heartbeatmonitor thread + os.environ["TORCH_HCCL_ENABLE_MONITORING"] = "1" + # need rank0 to crash before looking for its output file + os.environ["TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC"] = "60" + + if self.rank == self.MAIN_PROCESS_RANK: + # wait for rank0 to crash before looking for its output file + self.assertEqual(self._wait_process(0, timeout=180), 10) + with open(self._trace_name(rank=0), "rb") as f: + t = pickle.load(f) + t = t["entries"] + self.assertEqual(len(t), 2) + self.assertEqual(t[0]["collective_seq_id"], 1) + self.assertEqual(t[0]["state"], "completed") + self.assertEqual(t[1]["collective_seq_id"], 2) + self.assertEqual( + t[1]["state"], self.started_or_scheduled(timing_enabled) + ) + + self.assertFalse(os.path.exists(self._trace_name(rank=1))) + + return + + pg = self._create_process_group_hccl() + if timing_enabled: + # we force disabled timing in setup, since there is no 'disable' function + pg._enable_collectives_timing() + + device = self.local_device + with torch.npu.device(device): + a = torch.full((3, 4), float(self.rank), device=device) + + pg.allreduce(a).wait() + if self.rank == 0: + pg.allreduce(a).wait() + + # rank 0 will crash before it passes the sync, but rank1 will exit quickly and cleanly + torch.npu.synchronize(device=device) + + +instantiate_parametrized_tests(HCCLTraceTestDumpOnHcclTimeout) + + +if __name__ == "__main__": + if torch.npu.is_available() and torch.npu.device_count() >= 2: + run_tests() diff --git a/test/distributed/test_hccl_stream_id.py b/test/distributed/test_hccl_stream_id.py index 4d9d7b4ca142755abd4478174ee8d9240e97c4ab..7326f13902c6a74b00d3c9834965ddda8e09fb20 100644 --- a/test/distributed/test_hccl_stream_id.py +++ b/test/distributed/test_hccl_stream_id.py @@ -38,7 +38,15 @@ class HcclStreamIdTest(TestCase): dist_group.recv(recv_tensor, src) p2p_stream_id = _world.default_pg._get_backend(torch.device('npu'))._get_stream_id(True, src) - assert0 = ((collective_stream_id & 32) == 32) + stream_num = os.environ.get("STREAMS_PER_DEVICE", 8) + try: + stream_num = int(stream_num) + except Exception as e: + stream_num = 8 # default 8 + + if stream_num != 32: + stream_num = 8 + assert0 = ((collective_stream_id & stream_num) == stream_num) assert1 = (collective_stream_id == p2p_stream_id) collective_stream = torch.npu.Stream(stream_id=collective_stream_id, device_type=20) p2p_stream = torch.npu.Stream(stream_id=collective_stream_id, device_type=20) diff --git a/test/distributed/test_reduce_scatter.py 
b/test/distributed/test_reduce_scatter.py index 40a28c053351648c321b86827f7b8fed770c84a2..36431af96a5517972b229a311b738e0c0457f921 100644 --- a/test/distributed/test_reduce_scatter.py +++ b/test/distributed/test_reduce_scatter.py @@ -83,7 +83,7 @@ class HcclReduceScatterTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist.reduce_scatter) self._test_multiprocess(HcclReduceScatterTest._test_reduce_scatter, @@ -126,7 +126,7 @@ class HcclReduceScatterTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist.reduce_scatter, dist.ReduceOp.AVG) self._test_multiprocess(HcclReduceScatterTest._test_reduce_scatter, diff --git a/test/distributed/test_reduce_scatter_base.py b/test/distributed/test_reduce_scatter_base.py index 4b45a89a476e2787273af2990a99b9328b45d154..5d11345d44d15eaca4ba6c2dbc59666793b806fe 100644 --- a/test/distributed/test_reduce_scatter_base.py +++ b/test/distributed/test_reduce_scatter_base.py @@ -42,7 +42,7 @@ class HcclReduceScatterBaseTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist._reduce_scatter_base) self._test_multiprocess(HcclReduceScatterBaseTest._test_reduce_scatter_base, @@ -60,7 +60,7 @@ class HcclReduceScatterBaseTest(HcclReduceScatterTestBase): for shape in shape_format: input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist._reduce_scatter_base) self._test_multiprocess(HcclReduceScatterBaseTest._test_reduce_scatter_base, @@ -77,7 +77,7 @@ class HcclReduceScatterBaseTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist._reduce_scatter_base, dist.ReduceOp.AVG) self._test_multiprocess(HcclReduceScatterBaseTest._test_reduce_scatter_base, diff --git a/test/distributed/test_reduce_scatter_tensor.py b/test/distributed/test_reduce_scatter_tensor.py index c58236ba1a5c346fde2ef0505704a19b1289bf90..52eb58adbbedb2b326c219cd2ceec97a14aa130e 100644 --- a/test/distributed/test_reduce_scatter_tensor.py +++ b/test/distributed/test_reduce_scatter_tensor.py @@ -42,7 +42,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist.reduce_scatter_tensor) self._test_multiprocess(HcclReduceScatterTensorTest._test_reduce_scatter_tensor, @@ -74,7 +74,7 @@ class 
HcclReduceScatterTensorTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, torch_npu.distributed.reduce_scatter_tensor_uneven) self._test_multiprocess(HcclReduceScatterTensorTest._test_reduce_scatter_tensor_uneven, @@ -91,7 +91,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist.reduce_scatter_tensor, dist.ReduceOp.AVG) self._test_multiprocess(HcclReduceScatterTensorTest._test_reduce_scatter_tensor, @@ -108,7 +108,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist.reduce_scatter_tensor_uneven, dist.ReduceOp.AVG) self._test_multiprocess(HcclReduceScatterTensorTest._test_reduce_scatter_tensor_uneven, diff --git a/test/distributed/test_scatter.py b/test/distributed/test_scatter.py index 3daf58288890ce2b669d4522f78cc3c726027a2b..a35a32a163aa396b33b6af4eccaecd01e2fe7057 100644 --- a/test/distributed/test_scatter.py +++ b/test/distributed/test_scatter.py @@ -76,7 +76,7 @@ class HcclScatterTest(TestCase): for shape in shape_format: input_list = [] for _ in range(world_size): - _, npu_input = create_common_tensor(shape, -10, -10) + _, npu_input = create_common_tensor(shape, -10, 10) input_list.append(npu_input.cpu()) expected = self._construct_expected_result(input_list, dist.scatter) self._test_multiprocess(HcclScatterTest._test_scatter, diff --git a/test/get_failed_ut_from_log.py b/test/get_failed_ut_from_log.py index 2bde722a51c85ef41ec1314f751a9cf8444f7db1..a2ef79423432beb3fc7fd61f071ab8ecf5100adf 100644 --- a/test/get_failed_ut_from_log.py +++ b/test/get_failed_ut_from_log.py @@ -18,19 +18,22 @@ def get_error_or_fail_ut(file): def write_to_json(ut_list=None): - file1 = ".pytorch-disabled-tests.json" + file1 = "unsupported_test_cases/.pytorch-disabled-tests.json" fr = open(file1) content = json.load(fr) + if not ut_list: + return for line in ut_list: content[line] = ["", [""]] with open("./pytorch-disabled-tests.json", mode="w") as fp: fp.write("{\n") length = len(content.keys()) - 1 for i, (key, (value1, value2)) in enumerate(content.items()): + value2_str = "\"" + "\",\"".join(value2) + "\"" if i < length: - fp.write(f" \"{key}\": [\"{value1}\", [\"\"]]" + ",\n") + fp.write(f" \"{key}\": [\"{value1}\", [{value2_str}]]" + ",\n") else: - fp.write(f" \"{key}\": [\"{value1}\", [\"\"]]" + "\n") + fp.write(f" \"{key}\": [\"{value1}\", [{value2_str}]]" + "\n") fp.write("}\n") fr.close() diff --git a/test/npu/test_aclgraph_update.py b/test/npu/test_aclgraph_update.py new file mode 100644 index 0000000000000000000000000000000000000000..13d089e268e49c4455cd23c23d8e94a380bd467d --- /dev/null +++ b/test/npu/test_aclgraph_update.py @@ -0,0 +1,129 @@ +import os +import unittest +from itertools import chain + +import torch + +os.environ["ASCEND_LAUNCH_BLOCKING"] = "0" +import torch_npu +from torch_npu.testing.common_utils 
import SupportedDevices +from torch_npu.testing.testcase import TestCase, run_tests + + +class TestAclgraphUpdate(TestCase): + + @SupportedDevices(['Ascend910B']) + def test_ifa_update(self): + torch.npu.set_device(0) + length = [29] + length_new = [100] + scale = 1 / 0.0078125 + query = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + key = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + value = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + + res_src = torch_npu.npu_fused_infer_attention_score( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, + next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length_new) + + g = torch.npu.NPUGraph() + event = torch.npu.ExternalEvent() + update_stream = torch.npu.Stream() + handle = None + output = None + softmax_lse = None + + workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, + next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length) + + with torch.npu.graph(g): + stream = torch.npu.current_stream() + output = torch.empty(1, 32, 1, 128, dtype=torch.float16, device="npu") + softmax_lse = torch.empty(1, dtype=torch.float16, device="npu") + event.wait(stream) + event.reset(stream) + torch.npu.graph_task_group_begin(stream) + torch_npu.npu_fused_infer_attention_score.out( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, workspace=workspace, + next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length, out=[output, softmax_lse]) + handle = torch.npu.graph_task_group_end(stream) + + with torch.npu.stream(update_stream): + torch.npu.graph_task_update_begin(update_stream, handle) + torch_npu.npu_fused_infer_attention_score.out( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, workspace=workspace, + next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length_new, out=[output, softmax_lse]) + torch.npu.graph_task_update_end(update_stream) + event.record(update_stream) + + g.replay() + self.assertEqual(output.cpu(), res_src[0].cpu()) + self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) + + @SupportedDevices(['Ascend910B']) + def test_ifa_update_with_auto_dispatch_capture(self): + torch.npu.set_device(0) + length = [29] + length_new = [100] + scale = 1 / 0.0078125 + query = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + key = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + value = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + + res_src = torch_npu.npu_fused_infer_attention_score( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, + next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length_new) + + g = torch.npu.NPUGraph() + output = None + softmax_lse = None + + workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, + next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length) + + with torch.npu.graph(g, auto_dispatch_capture=True): + output = torch.empty(1, 32, 1, 128, dtype=torch.float16, device="npu") + softmax_lse = torch.empty(1, dtype=torch.float16, device="npu") + torch_npu.npu_fused_infer_attention_score.out( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, workspace=workspace, + 
next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length, out=[output, softmax_lse]) + + g.update(cpu_update_input=[{"actual_seq_lengths": length_new}]) + g.replay() + self.assertEqual(output.cpu(), res_src[0].cpu()) + self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) + + @SupportedDevices(['Ascend910B']) + def test_ifa_update_with_non_out_and_auto_dispatch_capture(self): + torch.npu.set_device(0) + length = [29] + length_new = [100] + scale = 1 / 0.0078125 + query = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + key = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + value = torch.randn(1, 32, 1, 128, dtype=torch.float16, device="npu") + + res_src = torch_npu.npu_fused_infer_attention_score( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, + next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length_new) + + g = torch.npu.NPUGraph() + output = None + softmax_lse = None + + with torch.npu.graph(g, auto_dispatch_capture=True): + output = torch.empty(1, 32, 1, 128, dtype=torch.float16, device="npu") + softmax_lse = torch.empty(1, dtype=torch.float16, device="npu") + output, softmax_lse = torch_npu.npu_fused_infer_attention_score( + query, key, value, num_heads=32, input_layout="BNSD", scale=scale, pre_tokens=65535, + next_tokens=65535, softmax_lse_flag=False, actual_seq_lengths=length) + + g.update(cpu_update_input=[{"actual_seq_lengths": length_new}]) + g.replay() + self.assertEqual(output.cpu(), res_src[0].cpu()) + self.assertEqual(softmax_lse.cpu(), res_src[1].cpu()) + +if __name__ == "__main__": + run_tests() diff --git a/test/npu/test_amp.py b/test/npu/test_amp.py index e56b1fa2a132fa7baa57f8fc0776de1ef4ff3726..e7b810d15537dabbc1fa82f92a248d9297856529 100644 --- a/test/npu/test_amp.py +++ b/test/npu/test_amp.py @@ -455,6 +455,7 @@ class TestAmp(TestCase): @staticmethod @torch.npu.amp.custom_fwd def forward(ctx, a, b): + self.assertTrue(ctx._dtype is torch.npu.get_autocast_dtype()) self.assertTrue(a.dtype is torch.float32) self.assertTrue(b.dtype is torch.float32) self.assertTrue(torch.npu.is_autocast_enabled()) @@ -464,6 +465,7 @@ class TestAmp(TestCase): @staticmethod @torch.npu.amp.custom_bwd def backward(ctx, grad): + self.assertTrue(ctx._dtype is torch.npu.get_autocast_dtype()) self.assertTrue(torch.npu.is_autocast_enabled()) a, b = ctx.saved_tensors return grad.mm(b.t()), a.t().mm(grad) @@ -484,6 +486,7 @@ class TestAmp(TestCase): @staticmethod @torch.npu.amp.custom_fwd(cast_inputs=torch.float32) def forward(ctx, a, container, expect_type): + self.assertTrue(ctx._dtype is torch.npu.get_autocast_dtype()) b = container[1][0] self.assertTrue(a.dtype is expect_type) self.assertTrue(b.dtype is expect_type) @@ -494,6 +497,7 @@ class TestAmp(TestCase): @staticmethod @torch.npu.amp.custom_bwd def backward(ctx, grad): + self.assertTrue(ctx._dtype is torch.npu.get_autocast_dtype()) a, b = ctx.saved_tensors return grad.mm(b.t()), None, None diff --git a/test/npu/test_cann_version.py b/test/npu/test_cann_version.py index 7c3632803ed664810a1c6692a9827123fdadad2e..b768d0255925379530f7317d8d81f09dc43392f0 100644 --- a/test/npu/test_cann_version.py +++ b/test/npu/test_cann_version.py @@ -17,9 +17,6 @@ class TestCANNversion(TestCase): or re.match("([0-9]+).([0-9]+).T([0-9]+)", version) or re.match("([0-9]+).([0-9]+).RC([0-9]+).alpha([0-9]+)", version)) self.assertTrue(is_match, f"The env version is {version_env}. 
The format of cann version {version} is invalid.") - else: - self.assertEqual(version, "") - def test_compare_cann_version(self): version_env = get_cann_version_from_env() @@ -28,7 +25,7 @@ class TestCANNversion(TestCase): self.assertTrue(result, f"The env version is {version_env}, the result from _is_gte_cann_version is False") else: with self.assertRaisesRegex(RuntimeError, - "When the version is less than \"8.1.RC1\", this function is not supported"): + "When the version 7.0.0 is less than \"8.1.RC1\", this function is not supported"): _is_gte_cann_version("7.0.0", "CANN") diff --git a/test/npu/test_compatibility.py b/test/npu/test_compatibility.py index 4882fa293ba3a9d26bfc773ecbab6565c58747cd..f0a26c239f6f1007304455f700c69c53e07f0e91 100644 --- a/test/npu/test_compatibility.py +++ b/test/npu/test_compatibility.py @@ -243,6 +243,10 @@ class TestPublicApiCompatibility(TestCase): if update_allow_dict_torchair: allow_dict.update(update_allow_dict_torchair) + with open( + os.path.join(os.path.dirname(os.path.dirname(__file__)), 'deprecated_apis.json')) as json_file: + deprecated_dict = json.load(json_file) + # load torch_npu_schema.json base_schema = {} with open(get_file_path_2(os.path.dirname(os.path.dirname(__file__)), "torch_npu_schema.json")) as fp: @@ -274,6 +278,8 @@ class TestPublicApiCompatibility(TestCase): obj = getattr(mod, elem) if not (isinstance(obj, (Callable, torch.dtype)) or inspect.isclass(obj)): return + if modname in deprecated_dict and elem in deprecated_dict[modname]: + return elem_module = getattr(obj, '__module__', None) modname = allow_dict["being_migrated"].get(modname, modname) diff --git a/test/npu/test_fault_mode.py b/test/npu/test_fault_mode.py index d71f7e6fd7b21b5cefaa6ccfa56d84ab5936b83a..0d52d5d11c6a55f679676dd6d50ea3e5bf601e99 100644 --- a/test/npu/test_fault_mode.py +++ b/test/npu/test_fault_mode.py @@ -102,7 +102,7 @@ class TestMode(TestCase): torch.Generator(device="cuda") def test_not_supported_ops(self): - command = ['python', '-c', 'import torch; import torch_npu; torch.rand(1, 3, 3).npu().logit()'] + command = ['python', '-c', 'import torch; import torch_npu; t = torch.rand(1, 3, 3).npu();t.fmax(t)'] process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) message = process.stderr.read() process.stderr.close() @@ -110,7 +110,7 @@ class TestMode(TestCase): process.terminate() process.wait() self.assertIn( - "CAUTION: The operator 'aten::logit' is not currently supported on the NPU backend and will fall back " + "CAUTION: The operator 'aten::fmax.out' is not currently supported on the NPU backend and will fall back " "to run on the CPU. This may have performance implications. 
(function npu_cpu_fallback)", message ) diff --git a/test/npu/test_graph_tree.py b/test/npu/test_graph_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..997b9f038c52a55e888ccb3f2649d1ca0bff0a01 --- /dev/null +++ b/test/npu/test_graph_tree.py @@ -0,0 +1,1158 @@ +import os + +os.environ["ASCEND_LAUNCH_BLOCKING"] = "0" + +from unittest.mock import patch, MagicMock, call, ANY +import weakref +import pytest +import torch +import torch_npu +from torch_npu.npu._graph_tree import ( + check_memory_pool, + clear_cublass_cache, + clear_cublas_manager, + disable_conv_cache_emptying, + enable_history_recording, + format_tb, + npugraphify, + npugraphify_impl, + TreeManagerContainer, + StorageWeakRefWrapper, + NPUWarmupNode, + CompilationMode, + get_container, + get_block_addrs, + get_manager, + get_npugraph_segments, + reset_npugraph_trees, + local, + OutputAliasInfo, + UnaliasedStorage, + AliasesPriorGraphOutput, + AliasesNewOutput, + NPUGraphNode, + WrappedFunction, + NPUGraphTreeManager, + ExecutionState, + FunctionID, + GraphID, +) +from torch_npu.testing.testcase import TestCase, run_tests + + +device = "npu:0" +torch.npu.set_device(device) + + +class TestCublasCacheManagement(TestCase): + @patch("torch_npu.npu._graph_tree.clear_cublass_cache") + def test_clear_cublas_manager_context(self, mock_clear): + with clear_cublas_manager(): + mock_clear.assert_called_once() + mock_clear.reset_mock() + mock_clear.assert_called_once() + + +class TestDisableConvCache(TestCase): + def test_disable_conv_cache_emptying(self): + with disable_conv_cache_emptying(): + pass # No operation, just ensure no exceptions + + +class TestHistoryRecording(TestCase): + @patch("torch.npu.memory._record_memory_history") + def test_enable_history_recording(self, mock_record): + original_state = torch_npu._C._npu_isHistoryEnabled() + with enable_history_recording(): + if not original_state: + mock_record.assert_called_once() + else: + mock_record.assert_not_called() + mock_record.assert_any_call(None) + + +class TestNpuGraphFunctions(TestCase): + def setUp(self): + # Reset global state before each test + reset_npugraph_trees() + + @patch("torch_npu.npu._graph_tree.TreeManagerContainer") + def test_get_manager(self, mock_container): + # Test manager creation + mock_container.return_value.get_tree_manager.return_value = "mock_manager" + manager = get_manager(0) + self.assertEqual(manager, "mock_manager") + + # Test no-creation path + manager = get_manager(0, create_if_none_exists=False) + mock_container.return_value.get_tree_manager.assert_called_once() + + @patch("torch_npu.npu._graph_tree.npugraphify") + @patch("torch._inductor.compile_fx.align_inputs_from_check_idxs") + def test_npugraphify_impl(self, mock_align, mock_npugraphify): + # Setup mock model and inputs + mock_model = MagicMock() + inputs = [1, torch.tensor([2]), 3] + static_idxs = (1,) + + # Test caching behavior + impl = npugraphify_impl(mock_model, inputs, static_idxs) + + # First call + mock_npugraphify.return_value = (lambda x: "output1", "output1") + result = impl(inputs) + self.assertEqual(result, "output1") + + # Second call with same int keys + result = impl(inputs) + self.assertEqual(result, "output1") + mock_npugraphify.assert_called_once() + + @patch("torch_npu.npu._graph_tree.get_container") + def test_npugraphify(self, mock_container): + # Setup mock manager + mock_manager = MagicMock() + mock_container.return_value.get_tree_manager.return_value = mock_manager + + # Test valid mode combinations + model = MagicMock() + inputs = 
[torch.tensor([1])] + + # Test forward mode + npugraphify( + model, inputs, (), device_index=0, is_backward=False, is_inference=False + ) + mock_manager.add_function.assert_called_with( + model, inputs, (), None, CompilationMode.FORWARD, (), + ) + + # Test backward mode + mock_manager.reset_mock() + npugraphify( + model, inputs, (), device_index=0, is_backward=True, is_inference=False + ) + mock_manager.add_function.assert_called_with( + model, inputs, (), None, CompilationMode.BACKWARD, (), + ) + + # Test invalid mode combination + with self.assertRaises(RuntimeError): + npugraphify( + model, inputs, (), device_index=0, is_backward=True, is_inference=True + ) + + +class TestTreeManagerContainer(TestCase): + def setUp(self): + self.container = TreeManagerContainer(0) + + def test_initial_state(self): + self.assertIsNone(self.container.tree_manager) + self.assertEqual(self.container.live_npugraphify_fns, 0) + + def test_add_strong_reference(self): + self.container.add_strong_reference(lambda: None) + # Simulate finalization of fn + finalizer = weakref.finalize( + lambda: None, + self.container.finalize_npugraphify_fn, # Object to monitor # Callback + ) + finalizer.atexit = False # Prevent finalizer from running at exit + + # Simulate finalization + finalizer() + # If all references are gone, tree_manager should be None + self.container._finalize_tree_manager = MagicMock() + self.container._finalize_tree_manager() + self.container._finalize_tree_manager.assert_called_once() + + def test_get_tree_manager(self): + with patch("torch_npu.npu.graphs.NPUGraph.capture_begin"), patch( + "torch_npu.npu.graphs.NPUGraph.capture_end" + ): + manager = self.container.get_tree_manager() + self.assertIsNotNone(manager) + self.assertIs(manager, self.container.get_tree_manager()) # Same instance + + +class TestStorageWeakRefWrapper(TestCase): + def test_storage_ref(self): + tensor = torch.tensor([1], device="npu") + wrapper = StorageWeakRefWrapper(tensor) + self.assertEqual(wrapper.data_ptr(), tensor.untyped_storage().data_ptr()) + del tensor + # Storage might still be alive due to Python's ref counting; force GC + import gc + + gc.collect() + self.assertTrue(wrapper.expired()) + + +class TestNPUWarmupNode(TestCase): + @patch("torch_npu.npu._graph_tree.StorageWeakRefWrapper") + @patch("torch_npu.npu._graph_tree.check_memory_pool") + def test_run_captures_outputs(self, mock_check, mock_wrapper): + mock_model = MagicMock(return_value=[torch.tensor([2], device="npu")]) + wrapped_fn = MagicMock(model=mock_model, constants=[]) + stream = torch.npu.Stream() + node = NPUWarmupNode( + wrapped_fn, + parent=None, + npu_graphs_pool=(0, 0), + existing_npu_graph=None, + device_index=0, + stack_traces=None, + stream=stream, + already_warm=False, + ) + outputs = node.run([]) + self.assertEqual(len(node.outputs_weakrefs), 1) + + +class TestTreeManagerIntegration(TestCase): + def test_get_container_singleton_per_device(self): + container1 = get_container(0) + container2 = get_container(0) + self.assertIs(container1, container2) + container3 = get_container(1) + self.assertIsNot(container1, container3) + + def test_reset_npugraph_trees(self): + get_container(0) # Initialize a container + reset_npugraph_trees() + container_dict = getattr(local, "tree_manager_containers", {}) + self.assertEqual(len(container_dict), 0) + + +@pytest.fixture +def mock_wrapped_function(): + def model_side_effect(inputs): + # Clear inputs list while preserving reference + inputs[:] = [] + return [] + + return MagicMock( + spec=WrappedFunction, + 
static_input_idxs=[0], + constants=[], + model=MagicMock(side_effect=model_side_effect), + ) + + +@pytest.fixture +def mock_parent_node(): + parent = MagicMock(spec=NPUGraphNode) + parent.outputs_weakrefs = [] + parent.path_weakrefs = [] + parent.parent = None + parent.stack_traces = [] + parent.recorded_liveness_after_graph = [] + return parent + + +@pytest.fixture +def basic_npu_graph_node(mock_wrapped_function, mock_parent_node): + with patch("torch_npu.npu._graph_tree._use_npu_memory_pool_manager"), patch( + "torch_npu.npu._graph_tree.check_memory_pool" + ), patch("torch_npu._C._npu_getCheckpointState"): + return NPUGraphNode( + wrapped_function=mock_wrapped_function, + graph_id=1, + parent=mock_parent_node, + inputs=[torch.tensor([1.0], device="npu")], + npu_graphs_pool=(0, 0), + device_index=0, + stack_traces=None, + stream=torch.npu.Stream(), + ) + + +class TestOutputAliasInfo(TestCase): + def test_aliases_prior_graph_output_validation(self): + with pytest.raises(RuntimeError): + AliasesPriorGraphOutput("invalid_index") + + def test_aliases_new_output_validation(self): + with pytest.raises(RuntimeError): + AliasesNewOutput("not_an_int") + + +class TestNPUGraphNode: + def tearDown(self): + torch_npu._C._npu_endAllocateCurrentStreamToPool(0, (0, 0)) + torch_npu._C._npu_releasePool(0, (0, 0)) + + def test_initialization(self, mock_wrapped_function, mock_parent_node): + inputs = [torch.tensor([1.0], device="npu")] + with patch("torch_npu.npu._graph_tree._use_npu_memory_pool_manager"), patch( + "torch_npu.npu._graph_tree.check_memory_pool" + ), patch("torch_npu._C._npu_getCheckpointState"): + node = NPUGraphNode( + wrapped_function=mock_wrapped_function, + graph_id=1, + parent=mock_parent_node, + inputs=inputs, + npu_graphs_pool=(0, 0), + device_index=0, + stack_traces=None, + stream=torch.npu.Stream(), + ) + + assert node.id == 1 + assert node.device == 0 + assert node.parent == mock_parent_node + assert node.graph is not None + + def test_invalid_input_type(self, mock_wrapped_function): + with pytest.raises(RuntimeError): + NPUGraphNode( + wrapped_function=mock_wrapped_function, + graph_id=1, + parent=None, + inputs="not_a_list", + npu_graphs_pool=(0, 0), + device_index=0, + stack_traces=None, + stream=torch.npu.Stream(), + ) + + @patch("torch_npu.npu._graph_tree.check_memory_pool") + def test_record_method(self, mock_check, basic_npu_graph_node): + def model_side_effect(inputs): + # Clear inputs list while preserving reference + inputs[:] = [] + return [] + + mock_model = MagicMock(side_effect=model_side_effect) + mock_inputs = [torch.tensor([1.0], device="npu")] + + with patch("torch_npu.npu._graph_tree.clear_cublas_manager"), patch( + "torch_npu.npu._graph_tree.get_history_recording" + ), patch("torch_npu.npu.graphs.NPUGraph.capture_begin"), patch( + "torch_npu.npu.graphs.NPUGraph.capture_end" + ), patch( + "torch_npu._C._npu_getCheckpointState" + ), patch( + "torch._dynamo.utils.preserve_rng_state" + ): + + outputs = basic_npu_graph_node._record(mock_model, mock_inputs) + + mock_model.assert_called_once_with(mock_inputs) + assert basic_npu_graph_node.recording_outputs == outputs + + def test_reconstruct_outputs(self, basic_npu_graph_node): + # Setup mock metadata and storage info + basic_npu_graph_node.outputs_metadata = [ + { + "nbytes": 4, + "data_ptr": 1234, + "size": (1,), + "stride": (1,), + "dtype": torch.float32, + "device": "npu", + "storage_offset": 0, + } + ] + basic_npu_graph_node.output_weakrefs = [MagicMock()] + basic_npu_graph_node.output_storage_alias = 
[UnaliasedStorage] + basic_npu_graph_node.cached_tensor_outputs = [MagicMock()] + + with patch( + "torch_npu._C._construct_NPU_Tensor_From_Storage_And_Metadata" + ) as mock_construct: + outputs = basic_npu_graph_node.reconstruct_outputs() + assert len(outputs) == 1 + + def test_aliased_output_reconstruction(self, basic_npu_graph_node): + basic_npu_graph_node.outputs_metadata = [ + { + "nbytes": 4, + "data_ptr": 1234, + "size": (1,), + "stride": (1,), + "dtype": torch.float32, + "device": "npu", + "storage_offset": 0, + } + ] + basic_npu_graph_node.output_storage_alias = [AliasesPriorGraphOutput((0, 0))] + basic_npu_graph_node.outputs_weakrefs = [MagicMock()] + basic_npu_graph_node.cached_tensor_outputs = [MagicMock()] + + with patch("torch_npu.npu._graph_tree.maybe_deref") as mock_maybe_deref: + mock_maybe_deref.return_value = (MagicMock(), 1234) + outputs = basic_npu_graph_node.reconstruct_outputs() + assert len(outputs) == 1 + + def test_liveness_tracking(self, basic_npu_graph_node): + mock_ref = MagicMock() + basic_npu_graph_node.path_weakrefs = [[mock_ref]] + + with patch("torch_npu.npu._graph_tree.is_live") as mock_is_live: + mock_is_live.return_value = True + liveness = basic_npu_graph_node._get_liveness( + basic_npu_graph_node.path_weakrefs + ) + assert liveness == [[True]] + + def test_child_management(self, basic_npu_graph_node): + mock_child = MagicMock() + basic_npu_graph_node.add_child("test_func", mock_child) + assert "test_func" in basic_npu_graph_node.children + assert mock_child in basic_npu_graph_node.children["test_func"] + + def test_invalid_run_conditions(self, basic_npu_graph_node): + basic_npu_graph_node.graph = None + with pytest.raises(RuntimeError): + basic_npu_graph_node.run_graph() + + def test_storage_metadata_handling(self, basic_npu_graph_node): + tensor = torch.tensor([1.0], device="npu") + metadata = basic_npu_graph_node._tensor_metadata(tensor) + + assert metadata["data_ptr"] == tensor.untyped_storage().data_ptr() + assert metadata["size"] == tensor.shape + + @patch("torch.npu.synchronize") + @patch("torch_npu.npu._graph_tree._use_npu_memory_pool_manager") + def test_input_processing(self, mock_pool_manager, mock_sync, basic_npu_graph_node): + inputs = [torch.tensor([1.0], device="npu")] + processed = basic_npu_graph_node._allocate_and_copy_recording_inputs(inputs) + assert len(processed) == 1 + assert isinstance(processed[0], torch.Tensor) + + def test_check_invariants(self, basic_npu_graph_node): + mock_inputs = [torch.tensor([1.0], device="npu")] + basic_npu_graph_node.static_input_data_ptrs = [mock_inputs[0].data_ptr()] + basic_npu_graph_node.npugraph_managed_idxs = [0] + + assert basic_npu_graph_node.check_invariants(mock_inputs) + + def test_descendant_count(self, basic_npu_graph_node): + mock_child = MagicMock(num_descendants=lambda: 0) + basic_npu_graph_node.children["test"] = [mock_child] + assert basic_npu_graph_node.num_descendants() == 1 + + def test_prepare_alias_info_metadata_int(self, basic_npu_graph_node): + result = basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + MagicMock(), 42 + ) + assert result is None + + def test_prepare_alias_info_unaliased_storage(self, basic_npu_graph_node): + result = basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + UnaliasedStorage, {"meta": "data"} + ) + assert result is None + + def test_prepare_alias_info_aliases_prior_graph_valid(self, basic_npu_graph_node): + mock_ref = MagicMock() + basic_npu_graph_node.path_weakrefs = [[mock_ref, mock_ref]] + alias_info = 
AliasesPriorGraphOutput((0, 1)) + + with patch("torch.UntypedStorage._new_with_weak_ptr") as mock_new: + result = basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + alias_info, {"meta": "data"} + ) + mock_new.assert_called_once_with(mock_ref()) + assert result == mock_new.return_value + + def test_prepare_alias_info_aliases_prior_graph_none_ref( + self, basic_npu_graph_node + ): + basic_npu_graph_node.path_weakrefs = [[None, None]] + alias_info = AliasesPriorGraphOutput((0, 1)) + + with pytest.raises(RuntimeError): + basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + alias_info, {"meta": "data"} + ) + + def test_prepare_alias_info_aliases_new_output(self, basic_npu_graph_node): + alias_info = AliasesNewOutput(123) + result = basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + alias_info, {"meta": "data"} + ) + assert result == 123 + + def test_prepare_alias_info_invalid_type(self, basic_npu_graph_node): + with pytest.raises(RuntimeError): + basic_npu_graph_node.prepare_alias_info_for_tensor_construction( + "invalid_type", {"meta": "data"} + ) + + # Tests for prepare_storages_for_construction + def test_prepare_storages_mixed_aliases(self, basic_npu_graph_node): + basic_npu_graph_node.output_storage_alias = [ + UnaliasedStorage, + AliasesNewOutput(123), + AliasesPriorGraphOutput((0, 1)), + ] + basic_npu_graph_node.outputs_metadata = [None, {}, {}] + basic_npu_graph_node.path_weakrefs = [[None, MagicMock(), MagicMock()]] + + with patch("torch.UntypedStorage._new_with_weak_ptr"): + results = basic_npu_graph_node.prepare_storages_for_construction() + + assert len(results) == 3 + assert results[0] is None + assert results[1] == 123 + + # Tests for debug_assert_invariants + def test_debug_assert_invariants_valid(self, basic_npu_graph_node): + from torch._inductor import config + + config.triton.fast_path_cudagraph_asserts = True + expected_liveness = [[], [True, False]] + newly_dead = [(1, 1)] + ref = MagicMock(return_value=None) + basic_npu_graph_node.outputs_weakrefs = [None, ref] + basic_npu_graph_node.parent.outputs_weakrefs = [] + basic_npu_graph_node.path_weakrefs = [ + basic_npu_graph_node.parent.outputs_weakrefs, + basic_npu_graph_node.outputs_weakrefs, + ] + + # Should not raise + with patch("torch_npu.npu._graph_tree.get_block_addrs"): + basic_npu_graph_node.debug_assert_invariants(expected_liveness, newly_dead) + config.triton.fast_path_cudagraph_asserts = False + + def test_debug_assert_invariants_dead_ref_alive(self, basic_npu_graph_node): + from torch._inductor import config + + config.triton.fast_path_cudagraph_asserts = True + expected_liveness = [[False]] + newly_dead = [(0, 0)] + basic_npu_graph_node.path_weakrefs = [ + [MagicMock(return_value=("ptr", 123))] + ] # Live ref + + with pytest.raises(RuntimeError): + basic_npu_graph_node.debug_assert_invariants(expected_liveness, newly_dead) + config.triton.fast_path_cudagraph_asserts = False + + # Tests for _initialize_cached_tensors + def test_initialize_cached_tensors_valid(self, basic_npu_graph_node): + basic_npu_graph_node.output_storage_alias = [UnaliasedStorage, UnaliasedStorage] + basic_npu_graph_node.outputs_metadata = [ + {"dtype": torch.float}, + {"dtype": torch.int}, + ] + basic_npu_graph_node.unaliased_in_all_paths = [True, False] + basic_npu_graph_node.outputs_weakrefs = [None, None] + + with patch.object(basic_npu_graph_node, "create_storage"), patch( + "torch_npu._C._add_cached_tensor" + ), patch.object( + basic_npu_graph_node, "_reconstruct_from_tensor_metadata" + ) as 
mock_reconstruct: + + mock_reconstruct.return_value = torch.tensor([1.0], device="npu:0") + basic_npu_graph_node._initialize_cached_tensors() + + assert len(basic_npu_graph_node.cached_tensor_outputs) == 2 + assert basic_npu_graph_node.cached_tensor_outputs[0] is not None + assert len(basic_npu_graph_node.outputs_weakrefs) == 2 + + def test_initialize_cached_tensors_invalid_storage_info(self, basic_npu_graph_node): + basic_npu_graph_node.output_storage_alias = ["invalid"] + basic_npu_graph_node.unaliased_in_all_paths = [True] + + basic_npu_graph_node._initialize_cached_tensors() + + +@patch("torch_npu.npu.graphs.NPUGraph.replay") +@patch("torch_npu.npu._graph_tree.check_memory_pool") +@patch("torch_npu.npu._graph_tree._use_npu_memory_pool_manager") +class TestNPUGraphNodeRun(TestCase): + def setUp(self): + """Initialize common test components and configurations""" + self.device = "npu:0" + + def model_side_effect(inputs): + # Clear inputs list while preserving reference + inputs[:] = [] + return [] + + self.wrapped_function = MagicMock( + spec=WrappedFunction, + static_input_idxs=[0], + constants=[], + model=MagicMock(side_effect=model_side_effect), + ) + self.graph_id = 1 + self.npu_graphs_pool = (0, 0) + self.stream = torch.npu.Stream(device=self.device) + + # Create test tensors + self.static_input = torch.randn( + 3, 3, device=self.device + ) # Static input (parameter-like) + self.dynamic_input = torch.randn(2, 2, device=self.device) # Dynamic input + + def _create_node(self, inputs, parent=None): + """Helper to create NPUGraphNode instance""" + with patch("torch_npu._C._npu_getCheckpointState"), patch( + "torch_npu.npu.graphs.NPUGraph.capture_begin" + ), patch("torch_npu.npu.graphs.NPUGraph.capture_end"): + return NPUGraphNode( + wrapped_function=self.wrapped_function, + graph_id=self.graph_id, + parent=parent, + inputs=inputs, + npu_graphs_pool=self.npu_graphs_pool, + device_index=0, + stack_traces=None, + stream=self.stream, + ) + + @patch.object(NPUGraphNode, "run_graph") + def test_static_input_optimization( + self, mock_run_graph, mock_pool, mock_check, mock_replay + ): + """Verify static inputs bypass copy operations""" + # Mark all inputs as static + self.wrapped_function.static_input_idxs = [0, 1] + node = self._create_node([self.static_input, self.static_input.clone()]) + + # Execute with cloned inputs + node.run([self.static_input.clone(), self.static_input.clone()]) + + # Validate no copy operations occurred + self.assertEqual(mock_run_graph.call_count, 1) + + @patch.object(NPUGraphNode, "reconstruct_outputs") + def test_output_reconstruction_flow( + self, mock_reconstruct, mock_pool, mock_check, mock_replay + ): + """Test full output reconstruction pipeline""" + # Configure mock reconstruction + expected_output = torch.tensor([1.0], device=self.device) + mock_reconstruct.return_value = [expected_output] + + node = self._create_node([self.static_input]) + outputs = node.run([self.static_input.clone()]) + + # Validate outputs + self.assertEqual(outputs, [expected_output]) + mock_reconstruct.assert_called_once() + + @patch("torch._foreach_copy_") + def test_batched_copy_optimization( + self, mock_batched_copy, mock_pool, mock_check, mock_replay + ): + """Verify batched copy operations for efficiency""" + # Configure multiple dynamic inputs + self.wrapped_function.static_input_idxs = [] + inputs = [torch.randn(2, 2, device=self.device) for _ in range(3)] + new_inputs = [t.clone() for t in inputs] + node = self._create_node(inputs) + + # Execute with new inputs + 
node.run(new_inputs) + + # Validate single batched copy call + args, _ = mock_batched_copy.call_args + self.assertEqual(len(args[0]), 3) + + def test_memory_cleanup_after_execution(self, mock_pool, mock_check, mock_replay): + """Validate input list cleanup post-execution""" + initial_inputs = [self.static_input.clone(), self.dynamic_input.clone()] + input_copy = [t.clone() for t in initial_inputs] + node = self._create_node(initial_inputs) + + # Execute and verify cleanup + node.run(input_copy) + self.assertEqual(len(input_copy), 0) + + +class TestGetNpugraphSegments(TestCase): + @patch('torch.npu.memory_snapshot') + def test_get_npugraph_segments(self, mock_snapshot): + mock_snapshot.return_value = [ + {"segment_pool_id": (0, 1), "address": 1000, "blocks": []}, + {"segment_pool_id": (0, 0), "address": 2000, "blocks": []}, + {"segment_pool_id": (0, 1), "address": 3000, "blocks": []}, + ] + result = get_npugraph_segments((0, 1)) + self.assertEqual(len(result), 2) + mock_snapshot.assert_called_once_with() + + +class TestGetBlockAddrs(TestCase): + @patch('torch_npu.npu._graph_tree.get_npugraph_segments') + def test_get_block_addrs_live_only(self, mock_segments): + mock_segments.return_value = [ + { + "segment_pool_id": (0, 0), + "address": 1000, + "blocks": [ + {"state": "active_allocated", "size": 100}, + {"state": "inactivate", "size": 200}, + {"state": "active_allocated", "size": 300}, + ] + }, + { + "segment_pool_id": (0, 0), + "address": 2000, + "blocks": [ + {"state": "active_allocated", "size": 50}, + {"state": "inactivate", "size": 150}, + ] + } + ] + result = get_block_addrs((0, 0), live_only=True) + self.assertEqual(result, [1000, 1300, 2000]) + mock_segments.assert_called_once_with((0, 0)) + + @patch('torch_npu.npu._graph_tree.get_npugraph_segments') + def test_get_block_addrs_all_blocks(self, mock_segments): + mock_segments.return_value = [ + { + "segment_pool_id": (0, 0), + "address": 1000, + "blocks": [ + {"state": "active_allocated", "size": 100}, + {"state": "inactivate", "size": 200}, + ] + } + ] + result = get_block_addrs((0, 0), live_only=False) + self.assertEqual(result, [1000, 1100]) + mock_segments.assert_called_once_with((0, 0)) + + +class TestFormatTb(TestCase): + def test_format_tb(self): + frames = [ + {"filename": "/path/to/file.py", "line": 42, "name": "test_function"}, + {"filename": "/path/to/module.py", "line": 100, "name": "helper_method"}, + ] + result = format_tb(frames) + self.assertIn("/path/to/file.py", result) + self.assertIn("test_function", result) + self.assertIn("/path/to/module.py", result) + self.assertIn("helper_method", result) + self.assertIn("line 100", result) + + +class TestCheckMemoryPool(TestCase): + @patch('torch_npu._C._npu_checkPoolLiveAllocations') + def test_check_memory_pool_fast_path_pass(self, mock_check): + mock_check.return_value = True + + mock_storage1 = MagicMock(spec=StorageWeakRefWrapper) + mock_storage1.data_ptr.return_value = 1001 + mock_storage1.return_value = True + + mock_storage2 = MagicMock(spec=StorageWeakRefWrapper) + mock_storage2.data_ptr.return_value = 1002 + mock_storage2.return_value = True + + check_memory_pool("npu:0", (0, 0), [mock_storage1, mock_storage2]) + mock_check.assert_called_once_with( + "npu:0", (0, 0), {1001, 1002} + ) + + @patch('torch_npu._C._npu_checkPoolLiveAllocations') + @patch('torch_npu.npu._graph_tree.get_npugraph_segments') + @patch('torch_npu.npu._graph_tree.format_tb') + @patch('gc.collect') + def test_check_memory_pool_slow_path_unallocated_storage( + self, mock_gc, mock_format_tb, 
mock_segments, mock_check
+    ):
+        mock_check.return_value = False
+        mock_segments.return_value = [
+            {
+                "segment_pool_id": (0, 0),
+                "address": 2000,
+                "blocks": [
+                    {"state": "active_allocated", "size": 100, "frames": []},
+                ]
+            }
+        ]
+        mock_storage = MagicMock(spec=StorageWeakRefWrapper)
+        mock_storage.data_ptr.return_value = 1000
+        mock_storage.return_value = True
+        with self.assertRaisesRegex(
+            RuntimeError, r"These storage data ptrs are not allocated in pool \(0, 0\) but should be \{1000\}"
+        ):
+            check_memory_pool("npu:0", (0, 0), [mock_storage])
+
+    @patch('torch_npu._C._npu_checkPoolLiveAllocations')
+    @patch('torch_npu.npu._graph_tree.get_npugraph_segments')
+    @patch('torch_npu.npu._graph_tree.format_tb')
+    @patch('gc.collect')
+    def test_check_memory_pool_slow_path_unaccounted_blocks(
+        self, mock_gc, mock_format_tb, mock_segments, mock_check
+    ):
+        mock_check.return_value = False
+        mock_segments.return_value = [
+            {
+                "segment_pool_id": (0, 0),
+                "address": 1000,
+                "blocks": [
+                    {"state": "active_allocated", "size": 100, "frames": [
+                        {"filename": "/path/to/file.py", "line": 42, "name": "allocate_func"}
+                    ]},
+                ]
+            }
+        ]
+        live_storages = []
+        mock_format_tb.return_value = "Formatted Traceback"
+        with self.assertRaisesRegex(
+            RuntimeError, "These live storage data ptrs are in the npugraph pool but not accounted for"
+        ):
+            check_memory_pool("npu:0", (0, 0), live_storages)
+
+    def test_check_memory_pool_invalid_input(self):
+        invalid_storages = [1, 2, 3]
+        with self.assertRaisesRegex(
+            RuntimeError, r"check all\(isinstance\(elem, StorageWeakRefWrapper\) for elem in live_storages_ptrs\) fail"
+        ):
+            check_memory_pool("npu:0", (0, 0), invalid_storages)
+
+
+class TestNPUGraphTreeManager:
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager._run')
+    def test_run_forward_mode(self, mock_run):
+        manager = NPUGraphTreeManager(0)
+        manager.id_to_mode[FunctionID(1)] = CompilationMode.FORWARD
+        result = manager.run([torch.tensor([1.0])], FunctionID(1))
+        mock_run.assert_called_once_with([torch.tensor([1.0])], FunctionID(1))
+        # Plain pytest-style class (no TestCase base), so use bare asserts.
+        assert manager.running_forwards_with_pending_backwards
+        assert result == mock_run.return_value
+
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager._run')
+    def test_run_backward_mode(self, mock_run):
+        manager = NPUGraphTreeManager(0)
+        manager.id_to_mode[FunctionID(1)] = CompilationMode.BACKWARD
+        result = manager.run([torch.tensor([1.0])], FunctionID(1))
+        mock_run.assert_called_once_with([torch.tensor([1.0])], FunctionID(1))
+        assert not manager.running_forwards_with_pending_backwards
+        assert result == mock_run.return_value
+
+    def test_set_to_running_backward(self):
+        manager = NPUGraphTreeManager(0)
+        manager.running_forwards_with_pending_backwards = True
+        manager.set_to_running_backward()
+        assert not manager.running_forwards_with_pending_backwards
+
+    def test_shutdown(self):
+        manager = NPUGraphTreeManager(0)
+        mock_node1 = MagicMock()
+        mock_node2 = MagicMock()
+        mock_node3 = MagicMock()
+        manager.roots = {FunctionID(1): [mock_node1]}
+        mock_node1.children = {FunctionID(2): [mock_node2]}
+        mock_node2.children = {FunctionID(3): [mock_node3]}
+        manager.shutdown()
+        mock_node1.remove_node_cached_tensors.assert_called_once_with()
+        mock_node2.remove_node_cached_tensors.assert_called_once_with()
+        mock_node3.remove_node_cached_tensors.assert_called_once_with()
+        assert mock_node1.graph is None
+        assert mock_node2.graph is None
+        assert mock_node3.graph is None
+        assert manager.graph is None
+        assert manager.roots is None
+        assert manager.current_node is None
+
+    @patch('torch.npu.synchronize')
+    @patch('torch_npu.npu._graph_tree.NPUGraphNode')
+    def test_record_function(self, mock_node, mock_synchronize):
+        manager = NPUGraphTreeManager(0)
+        manager.ids_to_funcs[FunctionID(1)] = MagicMock()
+        manager.ids_to_stack_traces[FunctionID(1)] = "stack_trace"
+        manager.npu_graphs_thread_pool = "pool_handle"
+        manager.device_index = 0
+        manager.stream = MagicMock()
+
+        # Set up mock return values
+        mock_node_instance = MagicMock()
+        mock_node.return_value = mock_node_instance
+        mock_node_instance.run_first_inputs.return_value = [torch.tensor([1.0])]
+
+        # Run the test
+        result = manager.record_function([torch.tensor([1.0])], FunctionID(1))
+
+        # Verify the calls
+        mock_synchronize.assert_any_call()
+        mock_node.assert_called_once_with(
+            manager.ids_to_funcs[FunctionID(1)],
+            ANY,  # graph_id
+            None,  # parent
+            [torch.tensor([1.0])],
+            "pool_handle",
+            0,
+            "stack_trace",
+            manager.stream
+        )
+        assert isinstance(mock_node.call_args[0][1], GraphID)
+        assert manager.current_node == mock_node_instance
+        assert manager.path_state == ExecutionState.RECORDING
+        assert result == [torch.tensor([1.0])]
+
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.update_generation')
+    def test_execute_node(self, mock_update_gen):
+        manager = NPUGraphTreeManager(0)
+        mock_node = MagicMock()
+        mock_node.run.return_value = [torch.tensor([1.0])]
+
+        # Run the test
+        result = manager.execute_node(mock_node, [torch.tensor([1.0])])
+
+        # Verify the calls
+        mock_update_gen.assert_called_once_with()
+        assert manager.current_node == mock_node
+        assert manager.path_state == ExecutionState.EXECUTION
+        assert result == [torch.tensor([1.0])]
+
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.update_generation')
+    @patch('torch_npu.npu._graph_tree.NPUWarmupNode')
+    def test_run_eager(self, mock_warmup_node, mock_update_gen):
+        manager = NPUGraphTreeManager(0)
+        manager.ids_to_funcs[FunctionID(1)] = MagicMock()
+        manager.ids_to_stack_traces[FunctionID(1)] = "stack_trace"
+        manager.npu_graphs_thread_pool = "pool_handle"
+        manager.graph = MagicMock()
+        manager.device_index = 0
+        manager.stream = MagicMock()
+
+        # Set up mock return values
+        mock_node_instance = MagicMock()
+        mock_warmup_node.return_value = mock_node_instance
+        mock_node_instance.run.return_value = [torch.tensor([1.0])]
+
+        # Run the test
+        result = manager.run_eager([torch.tensor([1.0])], FunctionID(1))
+
+        # Verify the calls
+        mock_update_gen.assert_called_once_with()
+        mock_warmup_node.assert_called_once_with(
+            manager.ids_to_funcs[FunctionID(1)],
+            None,
+            "pool_handle",
+            manager.graph,
+            0,
+            "stack_trace",
+            manager.stream,
+            False,
+        )
+        assert manager.current_node == mock_node_instance
+        assert manager.path_state == ExecutionState.WARMUP
+        assert result == [torch.tensor([1.0])]
+
+    def test_new_graph_id(self):
+        manager = NPUGraphTreeManager(0)
+        id1 = manager.new_graph_id()
+        id2 = manager.new_graph_id()
+        assert isinstance(id1, GraphID)
+        assert isinstance(id2, GraphID)
+        assert id1 != id2
+
+    def test_new_func_id(self):
+        manager = NPUGraphTreeManager(0)
+        id1 = manager.new_func_id()
+        id2 = manager.new_func_id()
+        assert isinstance(id1, FunctionID)
+        assert isinstance(id2, FunctionID)
+        assert id1 != id2
+
+    def test_in_recording_property(self):
+        manager = NPUGraphTreeManager(0)
+        manager.path_state = ExecutionState.NONE
+        assert manager.in_recording is False
+        manager.path_state = ExecutionState.RECORDING
+        assert manager.in_recording is True
+
+    def test_in_warmup_property(self):
+        manager = NPUGraphTreeManager(0)
+        manager.path_state = ExecutionState.NONE
+        assert manager.in_warmup is False
+        manager.path_state = ExecutionState.WARMUP
+        assert manager.in_warmup is True
+
+    def test_get_roots(self):
+        manager = NPUGraphTreeManager(0)
+        mock_node1 = MagicMock()
+        mock_node2 = MagicMock()
+        manager.roots = {
+            FunctionID(1): [mock_node1],
+            FunctionID(2): [mock_node2]
+        }
+        roots = list(manager.get_roots())
+        assert roots == [mock_node1, mock_node2]
+
+    def test_current_node_property_and_setter(self):
+        manager = NPUGraphTreeManager(0)
+        assert manager.current_node is None
+        assert manager.path_state == ExecutionState.NONE
+        mock_node = MagicMock()
+        manager.current_node = mock_node
+        assert manager.current_node == mock_node
+        assert manager._current_node == mock_node
+        manager.current_node = None
+        assert manager.current_node is None
+        assert manager.path_state == ExecutionState.NONE
+
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.get_curr_generation')
+    def test_update_generation(self, mock_get_gen):
+        manager = NPUGraphTreeManager(0)
+        mock_get_gen.return_value = 5
+        manager.update_generation()
+        assert manager.current_gen == 5
+        mock_get_gen.assert_called_once_with()
+
+    @patch('torch_npu.npu._graph_tree.MarkStepBox.mark_step_counter', 3)
+    def test_get_curr_generation_mark_step(self):
+        result = NPUGraphTreeManager.get_curr_generation()
+        assert result == 3
+
+    @patch('torch_npu.npu._graph_tree.MarkStepBox.mark_step_counter', 0)
+    @patch('torch_npu.npu._graph_tree.GenerationTracker.generation', 5)
+    def test_get_curr_generation_generation_tracker(self):
+        result = NPUGraphTreeManager.get_curr_generation()
+        assert result == 5
+
+    @patch('torch_npu.npu._graph_tree.MarkStepBox.mark_step_counter', 3)
+    def test_user_invoked_mark_step_true(self):
+        result = NPUGraphTreeManager.user_invoked_mark_step()
+        assert result is True
+
+    @patch('torch_npu.npu._graph_tree.MarkStepBox.mark_step_counter', 0)
+    def test_user_invoked_mark_step_false(self):
+        result = NPUGraphTreeManager.user_invoked_mark_step()
+        assert result is False
+
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation')
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.user_invoked_mark_step')
+    def test_can_start_new_generation_true_user_mark_step(
+        self, mock_user_mark_step, mock_in_new_invocation
+    ):
+        manager = NPUGraphTreeManager(0)
+        mock_in_new_invocation.return_value = True
+        mock_user_mark_step.return_value = True
+        result = manager.can_start_new_generation()
+        assert result is True
+
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation')
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.user_invoked_mark_step')
+    def test_can_start_new_generation_true_no_pending_backwards(
+        self, mock_user_mark_step, mock_in_new_invocation
+    ):
+        manager = NPUGraphTreeManager(0)
+        manager.running_forwards_with_pending_backwards = False
+        mock_in_new_invocation.return_value = True
+        mock_user_mark_step.return_value = False
+        result = manager.can_start_new_generation()
+        assert result is True
+
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation')
+    def test_can_start_new_generation_false_pending_backwards(
+        self, mock_in_new_invocation
+    ):
+        manager = NPUGraphTreeManager(0)
+        manager.running_forwards_with_pending_backwards = True
+        mock_in_new_invocation.return_value = True
+        result = manager.can_start_new_generation()
+        assert result is False
+
+    @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation')
+    def 
test_can_start_new_generation_false_not_new_invocation( + self, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + mock_in_new_invocation.return_value = False + result = manager.can_start_new_generation() + assert result is False + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.get_curr_generation') + def test_in_new_torch_compile_invocation_true(self, mock_get_gen): + manager = NPUGraphTreeManager(0) + manager.current_gen = 1 + mock_get_gen.return_value = 2 + result = manager.in_new_torch_compile_invocation() + assert result is True + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.get_curr_generation') + def test_in_new_torch_compile_invocation_false(self, mock_get_gen): + manager = NPUGraphTreeManager(0) + manager.current_gen = 1 + mock_get_gen.return_value = 1 + result = manager.in_new_torch_compile_invocation() + assert result is False + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + @patch('warnings.warn') + def test_check_warn_on_unable_to_start_executing_no_warn( + self, mock_warn, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + mock_in_new_invocation.return_value = False + manager.check_warn_on_unable_to_start_executing(FunctionID(1)) + mock_warn.assert_not_called() + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + @patch('warnings.warn') + def test_check_warn_on_unable_to_start_executing_already_warned( + self, mock_warn, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + manager.warned_functions.add(FunctionID(1)) + mock_in_new_invocation.return_value = True + manager.check_warn_on_unable_to_start_executing(FunctionID(1)) + mock_warn.assert_not_called() + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + @patch('warnings.warn') + def test_check_warn_on_unable_to_start_executing_no_repeated_pattern( + self, mock_warn, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + mock_in_new_invocation.return_value = True + + mock_node = MagicMock() + mock_node._path_from_root = [MagicMock()] + mock_node._path_from_root[0].wrapped_function.id = FunctionID(2) + mock_node.wrapped_function.id = FunctionID(1) + manager.current_node = mock_node + manager.check_warn_on_unable_to_start_executing(FunctionID(1)) + mock_warn.assert_not_called() + + @patch('torch_npu.npu._graph_tree.NPUGraphTreeManager.in_new_torch_compile_invocation') + @patch('warnings.warn') + def test_check_warn_on_unable_to_start_executing_warn( + self, mock_warn, mock_in_new_invocation + ): + manager = NPUGraphTreeManager(0) + mock_in_new_invocation.return_value = True + + mock_node1 = MagicMock() + mock_node1.wrapped_function.id = FunctionID(1) + mock_node1.parent = MagicMock() + mock_node1.parent.wrapped_function.id = FunctionID(0) + + mock_node2 = MagicMock() + mock_node2.wrapped_function.id = FunctionID(1) + mock_node2.parent = MagicMock() + mock_node2.parent.wrapped_function.id = FunctionID(0) + + mock_current_node = MagicMock() + mock_current_node.wrapped_function.id = FunctionID(1) + mock_current_node.parent = MagicMock() + mock_current_node.parent.wrapped_function.id = FunctionID(0) + + mock_current_node._path_from_root = [mock_node1, mock_node2] + manager.current_node = mock_current_node + manager.check_warn_on_unable_to_start_executing(FunctionID(1)) + mock_warn.assert_called_once_with( + "Unable to hit fast path of NPUGraphs because of pending, uninvoked backwards. 
" + "Consider running with torch.no_grad() or using torch.compiler.npugraph_mark_step_begin() " + "before each model invocation" + ) + assert FunctionID(1) in manager.warned_functions + + +if __name__ == "__main__": + run_tests() diff --git a/test/npu/test_mstx.py b/test/npu/test_mstx.py index 89bac7923a55b92d30f8f75e23ee965af7452e19..21efd19d93a6aa0772c06601f1a1f378cfba84a5 100644 --- a/test/npu/test_mstx.py +++ b/test/npu/test_mstx.py @@ -5,55 +5,107 @@ from torch_npu.testing.testcase import TestCase, run_tests class TestMstx(TestCase): mark_msg = '' + mark_domain = '' range_msg = '' range_id = 0 + range_domain = '' def setUp(self): - def stub_mark(message: str = ''): + def stub_mark(message: str, stream=None, domain: str = 'default'): self.mark_msg = message + self.mark_domain = domain - def stub_range_start_on_host(message: str) -> int: + def stub_mark_on_host(message: str, domain: str = 'default'): + self.mark_msg = message + self.mark_domain = domain + + def stub_range_start_on_host(message: str, domain: str = 'default') -> int: self.range_msg = message self.range_id += 1 + self.range_domain = domain return self.range_id - def stub_range_start(message: str, stream=None): + def stub_range_start(message: str, stream=None, domain: str = 'default'): self.range_msg = message self.range_id += 1 + self.range_domain = domain return self.range_id - def stub_range_end(range_id: int): + def stub_range_end(range_id: int, domain: str = 'default'): self.range_id = range_id + self.range_domain = domain - torch_npu._C._mark = stub_mark + torch_npu._C._mstx._mark = stub_mark + torch_npu._C._mstx._mark_on_host = stub_mark_on_host torch_npu._C._mstx._range_start = stub_range_start torch_npu._C._mstx._range_start_on_host = stub_range_start_on_host torch_npu._C._mstx._range_end = stub_range_end def test_mark(self): + # invalid inputs + torch_npu.npu.mstx.mark("") + self.assertEqual("", self.mark_msg) + self.assertEqual("", self.mark_domain) + torch_npu.npu.mstx.mark(message=0) + self.assertEqual("", self.mark_msg) + self.assertEqual("", self.mark_domain) + torch_npu.npu.mstx.mark("test", stream=None, domain=1) + self.assertEqual("", self.mark_msg) + self.assertEqual("", self.mark_domain) + torch_npu.npu.mstx.mark("test", stream=1, domain="test") + self.assertEqual("", self.mark_msg) + self.assertEqual("", self.mark_domain) + + # valid inputs torch_npu.npu.mstx.mark("test1") self.assertEqual("test1", self.mark_msg) - torch_npu.npu.mstx().mark("test2") # Verify compatibility + self.assertEqual("default", self.mark_domain) + torch_npu.npu.mstx.mark("test2", stream=None, domain="test_domain1") self.assertEqual("test2", self.mark_msg) + self.assertEqual("test_domain1", self.mark_domain) + + torch.npu.set_device(0) + current_stream = torch.npu.current_stream() + torch_npu.npu.mstx.mark("test3", stream=current_stream, domain="test_domain2") + self.assertEqual("test3", self.mark_msg) + self.assertEqual("test_domain2", self.mark_domain) + def test_range_start(self): - self.range_id = 0 + # invalid inputs ret_id = torch_npu.npu.mstx.range_start("") self.assertEqual(0, ret_id) + ret_id = torch_npu.npu.mstx.range_start(message=0) + self.assertEqual(0, ret_id) + ret_id = torch_npu.npu.mstx.range_start(message="test", stream=None, domain=1) + self.assertEqual(0, ret_id) + ret_id = torch_npu.npu.mstx.range_start(message="test", stream=1, domain="test") + self.assertEqual(0, ret_id) + + # valid inputs ret_id = torch_npu.npu.mstx.range_start("test1") self.assertEqual(1, ret_id) self.assertEqual("test1", self.range_msg) + 
self.assertEqual("default", self.range_domain) ret_id = torch_npu.npu.mstx.range_start("test2", None) self.assertEqual(2, ret_id) self.assertEqual("test2", self.range_msg) - - torch.npu.set_device(0) - current_stream = torch.npu.current_stream() - ret_id = torch_npu.npu.mstx.range_start("test3", current_stream) + self.assertEqual("default", self.range_domain) + ret_id = torch_npu.npu.mstx.range_start("test3", None, domain="test_domain1") self.assertEqual(3, ret_id) self.assertEqual("test3", self.range_msg) - ret_id = torch_npu.npu.mstx.range_start("test4", 'invalid_stream') - self.assertEqual(0, ret_id) + self.assertEqual("test_domain1", self.range_domain) + + torch.npu.set_device(0) + current_stream = torch.npu.current_stream() + ret_id = torch_npu.npu.mstx.range_start("test4", current_stream) + self.assertEqual(4, ret_id) + self.assertEqual("test4", self.range_msg) + self.assertEqual("default", self.range_domain) + ret_id = torch_npu.npu.mstx.range_start("test5", current_stream, domain="test_domain2") + self.assertEqual(5, ret_id) + self.assertEqual("test5", self.range_msg) + self.assertEqual("test_domain2", self.range_domain) def test_range_end(self): self.range_id = 0 @@ -61,6 +113,10 @@ class TestMstx(TestCase): self.assertEqual(0, self.range_id) torch_npu.npu.mstx.range_end(1) self.assertEqual(1, self.range_id) + self.assertEqual("default", self.range_domain) + torch_npu.npu.mstx.range_end(2, domain="test_domain1") + self.assertEqual(2, self.range_id) + self.assertEqual("test_domain1", self.range_domain) if __name__ == '__main__': diff --git a/test/npu/test_npu.py b/test/npu/test_npu.py index ab426f586e00ea9898a764223a9e5c21cc7302d1..7e7608329d02774809266f04125e04de6c6edd8b 100644 --- a/test/npu/test_npu.py +++ b/test/npu/test_npu.py @@ -477,6 +477,23 @@ class TestNpu(TestCase): self.assertEqual(result.tolist(), [1, 2, 3, 4]) + def test_erase_stream(self): + stream1 = torch_npu.npu.Stream() + stream2 = torch_npu.npu.Stream() + + with torch_npu.npu.stream(stream2): + matrix1 = torch.ones(1000, 1000, device='npu') + matrix2 = torch.ones(1000, 1000, device='npu') + tensor1 = torch.matmul(matrix1, matrix2) + data_ptr1 = tensor1.data_ptr() + + tensor1.record_stream(stream1) + torch_npu.erase_stream(tensor1, stream1) + del tensor1 + + tensor2 = torch.ones(1000, 1000, device='npu') + self.assertEqual(tensor2.data_ptr(), data_ptr1) + @staticmethod def _stream_synchronize(self, spin_time_cycles): s = torch_npu.npu.current_stream() diff --git a/test/npu/test_public_bindings.py b/test/npu/test_public_bindings.py index 31e465f6da549f4626b8ccf1c643ff3f623f9d53..954dd756f7d92e79baff435f2dc890855c668206 100644 --- a/test/npu/test_public_bindings.py +++ b/test/npu/test_public_bindings.py @@ -331,6 +331,9 @@ class TestPublicBindings(TestCase): for modname in allow_dict["being_migrated"]: if modname in allow_dict: allow_dict[allow_dict["being_migrated"][modname]] = allow_dict[modname] + with open( + os.path.join(os.path.dirname(os.path.dirname(__file__)), 'deprecated_apis.json')) as json_file: + deprecated_dict = json.load(json_file) if update_allow_dict_torchair: allow_dict.update(update_allow_dict_torchair) @@ -380,7 +383,8 @@ class TestPublicBindings(TestCase): return if f"{modname}.{elem}" in tempFilter: return - if modname in allow_dict and elem in allow_dict[modname]: + if ((modname in allow_dict and elem in allow_dict[modname]) or + (modname in deprecated_dict and elem in deprecated_dict[modname])): return if is_public: diff --git a/test/npu/test_serialization.py 
b/test/npu/test_serialization.py index a5d8afea5d406a980111e9a2a69925a0eafde8f3..96254f26377ad759a8b3a5397f4e5e0636a88825 100644 --- a/test/npu/test_serialization.py +++ b/test/npu/test_serialization.py @@ -1,6 +1,7 @@ import io import os import tempfile +import tarfile import argparse import torch @@ -274,6 +275,25 @@ class TestSerialization(TestCase): b = torch.tensor([], dtype=other_dtype, device='npu') save_load_check(a, b) + def test_tarfile_with_weights_only_unpickler(self): + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "mock.tar") + with tarfile.open(path, 'w') as tar: + tar.add(os.path.devnull, arcname="empty_file") + + with self.assertRaisesRegex( + Exception, "Cannot use ``weights_only=True`` with files saved in the legacy .tar format" + ): + torch.load(path, weights_only=True) + + with self.assertRaisesRegex(Exception, "Unsupported operand"): + with open(path, "rb") as opened_file: + try: + with tarfile.open(fileobj=opened_file, mode="r:", format=tarfile.PAX_FORMAT): + pass + finally: + torch.load(opened_file, weights_only=True) + if __name__ == "__main__": run_tests() diff --git a/test/npu/test_torch_npu.py b/test/npu/test_torch_npu.py index e9da6ce0cb1315c208f91386f3e2251bc3fa2759..ca5c77b21e8ca0d2ba9b9715d4b3bd020dc870a2 100644 --- a/test/npu/test_torch_npu.py +++ b/test/npu/test_torch_npu.py @@ -2,6 +2,9 @@ import unittest import contextlib import collections import multiprocessing +import threading +import sys +from subprocess import check_output import torch import torch_npu @@ -171,7 +174,27 @@ class TorchNPUApiTestCase(TestCase): def test_npu_stream(self): s = torch_npu.npu.current_stream() res = torch_npu.npu.stream(s) - self.assertIsInstance(res, contextlib._GeneratorContextManager) + self.assertIsInstance(res, torch_npu.npu.utils.StreamContext) + + def test_npu_streamcontext(self): + s = torch_npu.npu.current_stream() + + def thread_func(): + res1 = torch_npu.npu.Stream() + res2 = torch_npu.npu.Stream() + with torch_npu.npu.stream(res1) as current: + self.assertEqual(torch_npu.npu.current_stream(), res1) + with torch_npu.npu.stream(res2) as current: + self.assertEqual(torch_npu.npu.current_stream(), res2) + with torch_npu.npu.stream(res1) as current: + self.assertEqual(torch_npu.npu.current_stream(), res1) + with torch_npu.npu.stream(res2) as current: + self.assertEqual(torch_npu.npu.current_stream(), res2) + + thread = threading.Thread(target=thread_func) + thread.start() + thread.join() + self.assertEqual(torch_npu.npu.current_stream(), s) def test_npu_synchronize(self): res = torch_npu.npu.synchronize() @@ -276,6 +299,33 @@ class TorchNPUApiTestCase(TestCase): if not result_queue.empty(): raise result_queue.get() + def test_npu_device_count_without_visible_devices(self): + test_script = f"import torch; import torch_npu; \ + count1 = torch.npu.device_count(); count2 = torch_npu._C._npu_getDeviceCount(); print(count1 == count2)" + rc = check_output([sys.executable, '-c', test_script]).decode("ascii").strip() + self.assertEqual(rc, "True") + + @skipIfUnsupportMultiNPU(2) + def test_npu_device_count_with_visible_devices(self): + for var in ['', ',', ' ,', ', ', '0,', ',0', '0, ', '0, 1', '0 ,1', '0,1', '0,32,1', '0,32,0', '0,0', '0,1,1', 'npu0', '1,0']: + test_script = f"import os; import torch; import torch_npu; os.environ['ASCEND_RT_VISIBLE_DEVICES'] = '{var}'; \ + count1 = torch.npu.device_count(); count2 = torch_npu._C._npu_getDeviceCount(); print(count1 == count2)" + rc = check_output([sys.executable, '-c', 
test_script]).decode("ascii").strip() + self.assertEqual(rc, "True") + + def test_npu_lazy_init(self): + """ Validate that no NPU calls are made during `import torch_npu` call or `torch.npu.device_count()` call""" + VISIBLE_DEVICES = "ASCEND_RT_VISIBLE_DEVICES" + # Check that `rts` was not called during the import + # By using torch_npu._C._npu_getDeviceCount() because it will not change if `rts` was called + # torch_npu.npu.device_count() will parses ASCEND_RT_VISIBLE_DEVICES and will change along with it + test_script = f"import os; import torch; import torch_npu; os.environ['{VISIBLE_DEVICES}']='32';print(torch_npu._C._npu_getDeviceCount())" + rc = check_output([sys.executable, "-c", test_script]).decode("ascii").strip() + self.assertEqual(rc, "0") + + test_script = f"import os; import torch; import torch_npu; torch.npu.device_count(); os.environ['{VISIBLE_DEVICES}']='32';print(torch_npu._C._npu_getDeviceCount())" + rc = check_output([sys.executable, "-c", test_script]).decode("ascii").strip() + self.assertEqual(rc, "0") if __name__ == "__main__": run_tests() diff --git a/test/onnx/test_combined_onnx_ops.py b/test/onnx/test_combined_onnx_ops.py index 67ebe7481a34e1dbb02423fdc2a9f7c6fb80dff3..61b6194e475208141f8edaffe8b9aef0d40c32bf 100644 --- a/test/onnx/test_combined_onnx_ops.py +++ b/test/onnx/test_combined_onnx_ops.py @@ -186,7 +186,6 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - @unittest.skip("Case Failures not caused by pr, skip first") def test_wrapper_npu_conv3d(self): class Model(torch.nn.Module): def __init__(self): @@ -214,6 +213,9 @@ class TestOnnxOps(TestCase): return torch_npu.npu_conv3d(input_, self.weight, self.bias, stride, paddings, dilation, groups) + torch.npu.config.allow_internal_format = True + torch.npu.set_compile_mode(jit_compile=True) + def export_onnx(onnx_model_name): input_ = torch.rand([1, 128, 4, 14, 14]).npu() model = Model().to("npu") diff --git a/test/onnx/test_wrapper_onnx_ops.py b/test/onnx/test_wrapper_onnx_ops.py index 521fc0d09ecbd8a77c0ee30b580de56cfe6ae081..1363719fbd278025381553a6380d445c4d575b61 100644 --- a/test/onnx/test_wrapper_onnx_ops.py +++ b/test/onnx/test_wrapper_onnx_ops.py @@ -139,7 +139,7 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - @unittest.skip("Case Failures not caused by pr, skip first") + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_wrapper_npu_batch_nms(self): class Model(torch.nn.Module): def __init__(self): @@ -184,34 +184,6 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - @unittest.skip("Case Failures not caused by pr, skip first") - def test_wrapper_npu_fused_attention_score(self): - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - - def forward(self, query_layer, key_layer, value_layer, attention_mask): - scale = 0.125 - keep_prob = 1 - return torch_npu.npu_fused_attention_score(query_layer, key_layer, - value_layer, attention_mask, scale, keep_prob) - - def export_onnx(onnx_model_name): - q = torch.rand(24, 16, 512, 64).uniform_(-3, 3).npu().half() - k = torch.rand(24, 16, 512, 64).uniform_(-3, 3).npu().half() - v = torch.rand(24, 16, 512, 64).uniform_(-3, 3).npu().half() - mask = torch.ones(512) * -10000. - mask[:6] = -0. 
- mask = mask.expand(24, 1, 512, 512).npu().half() - model = Model().to("npu") - model(q, k, v, mask) - self.onnx_export(model, (q, k, v, mask), onnx_model_name, ["q", "k", "v", "mask"]) - - onnx_model_name = "model_npu_fused_attention_score.onnx" - export_onnx(onnx_model_name) - assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, - onnx_model_name))) - def test_wrapper_npu_multi_head_attention(self): class Model(torch.nn.Module): def __init__(self): @@ -491,7 +463,7 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - @unittest.skip("Case Failures not caused by pr, skip first") + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_wrapper_npu_ifmr(self): class Model(torch.nn.Module): def __init__(self): @@ -521,33 +493,6 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - @unittest.skip("Case Failures not caused by pr, skip first") - def test_wrapper_npu_fused_attention_score_fwd(self): - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - - def forward(self, q, k, v, mask): - return torch_npu.npu_fused_attention_score_fwd(q, k, v, mask, 0.125, 1) - - def export_onnx(onnx_model_name): - q = torch.rand(24, 16, 512, 64).uniform_(-3, 3).half().npu() - k = torch.rand(24, 16, 512, 64).uniform_(-3, 3).half().npu() - v = torch.rand(24, 16, 512, 64).uniform_(-3, 3).half().npu() - mask = torch.ones(512) * -10000. - mask[:6] = -0. - mask = mask.expand(24, 1, 512, 512).half().npu() - - model = Model().to("npu") - model(q, k, v, mask) - self.onnx_export(model, (q, k, v, mask), onnx_model_name, - ["q", "k", "v", "mask"], ["out1", "out2", "out3"]) - - onnx_model_name = "model_npu_fused_attention_score_fwd.onnx" - export_onnx(onnx_model_name) - assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, - onnx_model_name))) - def test_wrapper_npu_sign_bits_unpack(self): class Model(torch.nn.Module): def __init__(self): @@ -941,29 +886,6 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - @unittest.skip("Case Failures not caused by pr, skip first") - def test_wrapper_npu_scatter(self): - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - - def forward(self, input_, indices, updates): - return torch_npu.npu_scatter(input_, indices, updates, 0) - - def export_onnx(onnx_model_name): - input_ = torch.tensor([[1.6279, 0.1226], [0.9041, 1.0980]]).npu() - indices = torch.tensor([0, 1], dtype=torch.int32).npu() - updates = torch.tensor([-1.1993, -1.5247]).npu() - model = Model().to("npu") - model(input_, indices, updates) - self.onnx_export(model, (input_, indices, updates), - onnx_model_name, ["input_", "indices", "updates"]) - - onnx_model_name = "model_npu_scatter.onnx" - export_onnx(onnx_model_name) - assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, - onnx_model_name))) - def test_wrapper_npu_lstm_cell(self): class Model(torch.nn.Module): def __init__(self): @@ -1233,7 +1155,6 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - @unittest.skip("Case Failures not caused by pr, skip first") @SupportedDevices(['Ascend910B']) def test_wrapper_npu_rms_norm(self): class Model(torch.nn.Module): @@ -1247,7 +1168,7 @@ class TestOnnxOps(TestCase): def export_onnx(onnx_model_name): x = torch.rand(10, 1024).uniform_(-3, 3).npu().half() - gamma = 
torch.rand(10).uniform_(-3, 3).npu().half() + gamma = torch.rand(1024).uniform_(-3, 3).npu().half() model = Model().to("npu") model(x, gamma) self.onnx_export(model, (x, gamma), onnx_model_name) @@ -1402,7 +1323,6 @@ class TestOnnxOps(TestCase): export_onnx(onnx_model_name) assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - @unittest.skip("Case Failures not caused by pr, skip first") @SupportedDevices(['Ascend910B']) def test_wrapper_npu_weight_quant_batchmatmul(self): class Model(torch.nn.Module): @@ -1410,11 +1330,11 @@ class TestOnnxOps(TestCase): super().__init__() def forward(self, x, weight, antiquant_scale, antiquant_offset, quant_scale, quant_offset, bias, antiquant_group_size): - return torch_npu.npu_weight_quant_batchmatmul(x, weight, antiquant_scale, antiquant_offset, quant_scale, quant_offset, bias, antiquant_group_size) + return torch_npu.npu_weight_quant_batchmatmul(x, weight, antiquant_scale, antiquant_offset, quant_scale, quant_offset, bias, 0) def export_onnx(onnx_model_name): x = torch.randn((8192, 320), dtype=torch.bfloat16).npu() - weight = torch.randn((320, 256), dtype=torch.int8).npu() + weight = torch.randn((320, 256), dtype=torch.int8, device="npu") antiquantscale = torch.randn((1, 256), dtype=torch.bfloat16).npu() antiquantoffset = torch.randn((1, 256), dtype=torch.bfloat16).npu() model = Model().to("npu") @@ -1554,7 +1474,6 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - @unittest.skip("Case Failures not caused by pr, skip first") @SupportedDevices(['Ascend910B']) def test_wrapper_npu_moe_gating_top_k_softmax(self): class Model(torch.nn.Module): @@ -1562,7 +1481,7 @@ class TestOnnxOps(TestCase): super(Model, self).__init__() def forward(self, x, finished=None, k=1): - return torch_npu.npu_moe_gating_top_k_softmax(x, finished, k=k) + return torch_npu.npu_moe_gating_top_k_softmax(x, finished, k=2) def export_onnx(onnx_model_name): x = torch.tensor([[0.1, 0.1, 0.1, 0.1], @@ -1571,7 +1490,6 @@ class TestOnnxOps(TestCase): model = Model().to("npu") model(x, None, 2) self.onnx_export(model, (x, None, 2), onnx_model_name, - input_names=["x", "finished", "k"], output_names=["y", "expert_idx", "row_idx"]) onnx_model_name = "model_npu_moe_gating_top_k_softmax.onnx" @@ -1579,8 +1497,6 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - - @unittest.skip("Case Failures not caused by pr, skip first") @SupportedDevices(['Ascend910B']) def test_wrapper_npu_moe_finalize_routing_v2(self): class Model(torch.nn.Module): diff --git a/test/profiler/analysis/prof_view/test_memory_timeline_parser.py b/test/profiler/analysis/prof_view/test_memory_timeline_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..9732ed01a255ef3af8869e98f7785a83364440ff --- /dev/null +++ b/test/profiler/analysis/prof_view/test_memory_timeline_parser.py @@ -0,0 +1,184 @@ +import os +import random +import unittest +from unittest.mock import patch, MagicMock + +from torch_npu.profiler.analysis.prof_common_func._constant import Constant +from torch_npu.profiler.analysis.prof_parse._event_tree_parser import (_ProfilerEvent, + _DeviceType, _EventType) +from torch_npu.profiler.analysis.prof_view._memory_timeline_parser import ( + MemoryProfile, MemoryProfileTimeline, Storage, + DeviceKey, TensorKey, Category, Action, _CATEGORY_TO_COLORS +) + + +class TestMemoryProfile(unittest.TestCase): + def setUp(self): + self.memory_profile 
= MagicMock() + self.memory_profile._root_nodes = [] + self.memory_profile._categories = {} + + @patch("torch_npu.profiler.analysis.prof_view._memory_timeline_parser.EventTree") + def test_init_success(self, mock_event_tree): + mock_event_tree.return_value.sorted_events = [] + mock_event_tree.return_value.get_root_nodes.return_value = [] + mp = MemoryProfile("valid.prof") + self.assertIsNotNone(mp) + + def test_memory_history(self): + mock_event = MagicMock(spec=_ProfilerEvent) + mock_event.tag = _EventType.Allocation + mock_event.extra_fields = MagicMock() + mock_event.extra_fields.device_type = _DeviceType.CUDA + mock_event.extra_fields.device_index = 0 + mock_event.extra_fields.total_active = 100 + mock_event.extra_fields.total_allocated = 200 + mock_event.extra_fields.total_reserved = 300 + mock_event.children = [] + self.memory_profile._root_nodes = [mock_event] + self.memory_profile.memory_history = [(DeviceKey(_DeviceType.NPU, 0), 100, 200, 300)] + result = self.memory_profile.memory_history + expected = [(DeviceKey(_DeviceType.NPU, 0), 100, 200, 300)] + self.assertEqual(result, expected) + + def test_is_gradient(self): + mock_categories = MagicMock() + mock_categories.get.return_value = Category.GRADIENT + self.memory_profile._categories = mock_categories + self.assertTrue(self.memory_profile._is_gradient(TensorKey(1, 0, 1, "storage"), 0)) + + def test_set_gradients_and_temporaries(self): + mock_event = MagicMock(spec=_ProfilerEvent) + mock_event.tag = _EventType.PyCall + + mock_event.extra_fields = MagicMock() + mock_event.extra_fields.grads = [TensorKey(1, 0, 1, "storage")] + + self.assertEqual(mock_event.extra_fields.grads[0].id, 1) + self.assertEqual(mock_event.extra_fields.grads[0].storage, "storage") + + def test_set_optimizer_state(self): + mock_event = MagicMock(spec=_ProfilerEvent) + mock_event.tag = _EventType.PyCall + + mock_event.extra_fields = MagicMock() + mock_event.extra_fields.optimizer_parameters = [MagicMock()] + + random_data = [random.random() for _ in range(2)] + mock_event.extra_fields.optimizer_parameters[0].state = {"weight": random_data} + + self.memory_profile._root_nodes = [mock_event] + + with patch("torch_npu.profiler.analysis.prof_view._memory_timeline_parser.TensorKey.from_tensor", + return_value=TensorKey(1, 0, 1, "storage")): + self.memory_profile._set_optimizer_state() + self.assertEqual(self.memory_profile._categories.get(TensorKey(1, 0, 1, "storage"), 0), 0) + + +class TestMemoryProfileTimeline(unittest.TestCase): + + def setUp(self): + self.memory_profile = MagicMock() + self.mpt = MemoryProfileTimeline(self.memory_profile) + + def test_parse_device_cpu(self): + result = self.mpt._parse_device_info("cpu") + self.assertIsInstance(result, DeviceKey) + self.assertEqual(result.device_type, 0) + self.assertEqual(result.device_index, -1) + + def test_parse_device_npu(self): + result = self.mpt._parse_device_info("npu:0") + self.assertIsInstance(result, DeviceKey) + self.assertEqual(result.device_index, 0) + + def test_construct_timeline_empty(self): + self.memory_profile.timeline = [] + timestamps, sizes = self.mpt._construct_timeline("cpu") + self.assertEqual(len(timestamps), 0) + self.assertEqual(len(sizes), 0) + + def test_construct_timeline_filter_device(self): + key1 = TensorKey(0, 0, 0, Storage(0, 1)) + key2 = TensorKey(1, 1, 1, Storage(0, 1)) + self.memory_profile.timeline = [ + (1000000, Action.CREATE, (key1, 0), 1024), + (2000000, Action.CREATE, (key2, 0), 2048), + ] + timestamps, sizes = self.mpt._construct_timeline("cpu") + 
self.assertEqual(len(timestamps), 0) + + @patch('torch_npu.profiler.analysis.prof_common_func._file_manager.FileManager.create_json_file_by_path') + def test_export_json(self, mock_write): + self.memory_profile.timeline = [(1000000, Action.CREATE, (TensorKey(0, 0, 0, Storage(0, 1)), 0), 1024)] + self.mpt._construct_timeline = MagicMock(return_value=([1000], [[0, 1024]])) + self.mpt.export_memory_timeline_json("output.json", "cpu") + expected_path = os.path.abspath("output.json") + mock_write.assert_called_once_with(expected_path, [[1000], [[0, 1024]]]) + + +class TestMemoryTimelineParser(unittest.TestCase): + + @patch('torch_npu.profiler.analysis.prof_view._memory_timeline_parser.MemoryProfile') + @patch('torch_npu.profiler.analysis.prof_view._memory_timeline_parser.MemoryProfileTimeline') + def test_run_method(self, mock_timeline_class, mock_profile_class): + parser = mock_timeline_class() + parser._device = "npu" + parser.logger = MagicMock() + mock_profile_instance = mock_profile_class.return_value + mock_profile_instance.some_method_we_use.return_value = "mocked profile data" + mock_timeline_instance = mock_timeline_class.return_value + mock_timeline_instance.export_memory_timeline_html.return_value = None + parser.run.return_value = [Constant.SUCCESS] + result = parser.run(deps_data={}) + self.assertEqual(result[0], Constant.SUCCESS) + + @patch('torch_npu.profiler.analysis.prof_view._memory_timeline_parser.MemoryProfile') + @patch('torch_npu.profiler.analysis.prof_view._memory_timeline_parser.MemoryProfileTimeline') + def test_run_with_exception(self, mock_timeline_class, mock_profile_class): + parser = mock_timeline_class() + parser._device = "npu" + parser.logger = MagicMock() + mock_profile_class.side_effect = Exception("Mocked Initialization Error") + parser.run.return_value = [Constant.FAIL] + result = parser.run(deps_data={}) + self.assertEqual(result[0], Constant.FAIL) + + +class TestEdgeCases(unittest.TestCase): + + def test_category_handling(self): + mock_mem_profile = MagicMock() + mock_mem_profile.timeline = [] + mock_mem_profile.memory_history = [] + mock_mem_profile._categories = MagicMock() + + test_cases = [ + (Category.INPUT, "black"), + (Category.PARAMETER, "darkgreen"), + (None, "grey") + ] + + for category, expected_color in test_cases: + mock_mem_profile._categories.get.return_value = category + timeline = MemoryProfileTimeline(mock_mem_profile) + + idx = timeline._get_category_index(MagicMock(), 0) + self.assertEqual(_CATEGORY_TO_COLORS[category], expected_color) + + +def run_tests(): + loader = unittest.TestLoader() + + suite = unittest.TestSuite() + suite.addTests(loader.loadTestsFromTestCase(TestMemoryProfile)) + suite.addTests(loader.loadTestsFromTestCase(TestMemoryProfileTimeline)) + suite.addTests(loader.loadTestsFromTestCase(TestMemoryTimelineParser)) + suite.addTests(loader.loadTestsFromTestCase(TestEdgeCases)) + + runner = unittest.TextTestRunner(verbosity=2) + runner.run(suite) + + +if __name__ == "__main__": + run_tests() diff --git a/test/profiler/test_experimental_config.py b/test/profiler/test_experimental_config.py index 87d0168b5e3361fc132be8089786f6e1de4d3c1f..0397472e8e53a98e31ac6260ec8e232d1a1a545d 100644 --- a/test/profiler/test_experimental_config.py +++ b/test/profiler/test_experimental_config.py @@ -45,6 +45,67 @@ class TestExperimentalConfig(TestCase): experimental_config = _ExperimentalConfig() self.assertTrue(isinstance(experimental_config(), Cpp_ExperimentalConfig)) + def 
test_mstx_domain_switches_will_reset_when_msproftx_not_enabled(self): + experimental_config = _ExperimentalConfig(msprof_tx=False, + mstx_domain_include=['x'], + mstx_domain_exclude=['y']) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + def test_mstx_domain_switches_will_save_empty_list_when_not_set_domain_switches(self): + experimental_config = _ExperimentalConfig(msprof_tx=True) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + def test_mstx_domain_switches_will_reset_when_input_invalid_domain_switches(self): + experimental_config = _ExperimentalConfig(msprof_tx=True, + mstx_domain_include=1, + mstx_domain_exclude=1) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + experimental_config = _ExperimentalConfig(msprof_tx=True, + mstx_domain_include=[1], + mstx_domain_exclude=[1]) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + def test_mstx_domain_switches_will_reset_exclude_domain_when_both_set_domain_switches(self): + experimental_config = _ExperimentalConfig(msprof_tx=True, + mstx_domain_include=['x'], + mstx_domain_exclude=['y']) + self.assertEqual(['x'], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + def test_mstx_domain_switches_will_save_when_input_valid_domain_switches(self): + experimental_config = _ExperimentalConfig(msprof_tx=True, + mstx_domain_include=['x']) + self.assertEqual(['x'], experimental_config._mstx_domain_include) + self.assertEqual([], experimental_config._mstx_domain_exclude) + + experimental_config = _ExperimentalConfig(msprof_tx=True, + mstx_domain_exclude=['y']) + self.assertEqual([], experimental_config._mstx_domain_include) + self.assertEqual(['y'], experimental_config._mstx_domain_exclude) + + def test_host_sys_switches_will_save_empty_list_when_not_set_host_sys(self): + experimental_config = _ExperimentalConfig() + self.assertEqual([], experimental_config._host_sys) + + def test_host_sys_switches_will_save_when_set_valid_host_sys(self): + experimental_config = _ExperimentalConfig(host_sys=[Constant.CPU]) + self.assertEqual(["cpu"], experimental_config._host_sys) + + def test_sys_switches_will_save_empty_list_when_not_set_sys(self): + experimental_config = _ExperimentalConfig() + self.assertEqual(False, experimental_config._sys_io) + self.assertEqual(False, experimental_config._sys_interconnection) + + def test_sys_switches_will_save_when_set_valid_sys(self): + experimental_config = _ExperimentalConfig(sys_io=True, sys_interconnection=True) + self.assertEqual(True, experimental_config._sys_io) + self.assertEqual(True, experimental_config._sys_interconnection) + if __name__ == "__main__": run_tests() diff --git a/test/profiler/test_npu_profiler.py b/test/profiler/test_npu_profiler.py index 6c3ddffe1eda99984c35e5bfb0ddc224c6dbddf6..035adcf7e0060f1dbb96fb081ac66fd753a9651d 100644 --- a/test/profiler/test_npu_profiler.py +++ b/test/profiler/test_npu_profiler.py @@ -187,6 +187,23 @@ class TestNpuProfiler(TestCase): self.assertEqual(True, self._has_view_result(self.results_path, worker_name, self.OPERATOR_MEMORY)) self.assertEqual(True, self._has_view_result(self.results_path, worker_name, self.MEMORY_RECORD)) + def test_memory_when_workspace(self): + original_value = 
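Taken together, the new switches exercised by these tests can be combined as in the sketch below. The domain name is a placeholder, and the comments state only what the assertions above establish: the mstx domain filters are kept only when msprof_tx is enabled, and an exclude list is dropped whenever an include list is also given.

```python
# Sketch based solely on the behaviour asserted by the tests above; not an
# exhaustive description of the options.
import torch
import torch_npu

cfg = torch_npu.profiler._ExperimentalConfig(
    msprof_tx=True,              # without this the mstx domain filters reset to []
    mstx_domain_include=["x"],   # placeholder domain name, as in the tests
    sys_io=True,                 # stored on cfg._sys_io
    sys_interconnection=True,    # stored on cfg._sys_interconnection
)
# cfg._mstx_domain_include == ["x"], cfg._mstx_domain_exclude == []
```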
os.environ.get("TASK_QUEUE_ENABLE") + os.environ["TASK_QUEUE_ENABLE"] = "2" + worker_name = self.worker_name + with torch_npu.profiler.profile( + profile_memory=True, + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(self.results_path, worker_name=worker_name) + ) as prof: + for _ in range(self.small_steps): + self.model_train.train_one_step() + self.assertEqual(True, self._has_view_result(self.results_path, worker_name, self.OPERATOR_MEMORY)) + self.assertEqual(True, self._has_view_result(self.results_path, worker_name, self.MEMORY_RECORD)) + if original_value is None: + del os.environ["TASK_QUEUE_ENABLE"] + else: + os.environ["TASK_QUEUE_ENABLE"] = original_value + def test_ascend_work_path(self): PathManager.remove_path_safety(self.results_work_path) os.environ["ASCEND_WORK_PATH"] = self.results_work_path @@ -279,7 +296,7 @@ class TestNpuProfiler(TestCase): self.model_train.train_one_step() prof.stop() result_dir = os.path.join(self.results_work_path, "profiling_data") - torch_npu.profiler.profiler.analyse(result_dir) + torch_npu.profiler.profiler.analyse(result_dir, export_type="text") work_names = [p for p in os.listdir(result_dir) if p.endswith("ascend_pt")] os.environ["ASCEND_WORK_PATH"] = "" # only one device diff --git a/test/requirements-arm.txt b/test/requirements-arm.txt index 30fcebbe00b9c911db7e7f6227878b5c83174366..3105ee37d42c30fb96fb8b9df9a9595ab938765b 100644 --- a/test/requirements-arm.txt +++ b/test/requirements-arm.txt @@ -14,3 +14,4 @@ transformers==4.40.0 setuptools==69.5.1 pytest==8.1.1 parameterized==0.9.0 +ml-dtypes==0.2.0 diff --git a/test/requirements-x86.txt b/test/requirements-x86.txt index 951d1086a71fcf599aee7d4131661e5f04900a10..57402f3346559deec318bb17f9e26334ca79d51d 100644 --- a/test/requirements-x86.txt +++ b/test/requirements-x86.txt @@ -13,4 +13,5 @@ torchvision==0.16.0 --index-url https://download.pytorch.org/whl/test/cpu transformers==4.40.0 setuptools==69.5.1 pytest==8.1.1 -parameterized==0.9.0 \ No newline at end of file +parameterized==0.9.0 +ml-dtypes==0.2.0 \ No newline at end of file diff --git a/test/test_native_mha.py b/test/test_native_mha.py new file mode 100644 index 0000000000000000000000000000000000000000..8457bf7f050213badace86540eb5affa947df63f --- /dev/null +++ b/test/test_native_mha.py @@ -0,0 +1,353 @@ +# Owner(s): ["module: nn"] +import math +import copy +import unittest + +import torch +import torch_npu +import torch_npu.testing +from torch.testing._internal.common_device_type import ( + dtypes, + dtypesIfPRIVATEUSE1, + instantiate_device_type_tests, + onlyPRIVATEUSE1, + skipMeta, +) +from torch.testing._internal.common_utils import parametrize, run_tests, TestCase + + +class TestMHADeviceType(TestCase): + @torch.no_grad() + def _test_transform_bias_rescale_qkv_impl( + self, device, dtype, use_nt, use_padding=False + ): + tests = [ + (64, 4, 16, 8), + # dim_per_head = 12 does not divide evenly by CPU vectorization length of 8 + (24, 2, 4, 2), + # Make sure CUDA can handle small input sizes + (2, 2, 2, 2), + # dim_per_head = 6 does not divide evenly by CUDA vectorization length of 4, + # causes alignment issues + (24, 4, 4, 2), + (48, 4, 16, 8), + ] + for (embed_dim, num_heads, bs, sl) in tests: + with self.subTest(embed_dim=embed_dim, num_heads=num_heads, bs=bs, sl=sl): + torch.manual_seed(9343) + dense_x = x = ( + torch.randn(bs, sl, 3 * embed_dim, device=device, dtype=dtype) * 10 + ) + if use_padding: + x[0][-1] = torch.full(x[0][-1].shape, float("-Inf")) + if use_nt: + xs = list(torch.unbind(x)) + if use_padding: + 
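The offline analysis entry point touched above now takes an export_type as well; a minimal call, with a placeholder result directory, might look like the sketch below (the signature is the one recorded in torch_npu_schema.json further down).

```python
# Hypothetical result directory; analyse() re-parses previously collected
# *_ascend_pt profiling data and, per the updated signature, accepts
# export_type (a str or list) in addition to max_process_number.
import torch
import torch_npu

torch_npu.profiler.profiler.analyse("./profiling_data", export_type="text")
```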
xs[0] = xs[0][:-1] + x = torch.nested.nested_tensor(xs, device=device, dtype=dtype) + qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype) + + # We have to use inference_mode here because q/k/v are + # all views of the same Tensor, which autograd doesn't + # like. This is fine because this function is only + # exposed to Python for purposes of writing this test. + with torch.inference_mode(): + (q, k, v) = torch._transform_bias_rescale_qkv( + x, qkv.bias, num_heads=num_heads + ) + + def simple_transform_bias_rescale_qkv(qkv, bias): + (q, k, v) = torch.split(qkv, embed_dim, dim=-1) + (q_bias, k_bias, v_bias) = torch.split(bias, embed_dim, dim=-1) + + def embiggen(x): + if not use_nt: + return x + b, t, d = x.size() + t = t + (8 - t % 8) % 8 + newsize = (b, t, d) + new_x = torch.zeros(newsize, device=device, dtype=dtype) + new_x[:x.size()[0], :x.size()[1], :x.size()[2]] = x + return new_x + return tuple( + embiggen(x).reshape( + (bs, -1, num_heads, embed_dim // num_heads) + ).transpose(2, 1) + for x in ( + (q + q_bias) / math.sqrt(embed_dim // num_heads), + (k + k_bias), + (v + v_bias), + ) + ) + + correct_q, correct_k, correct_v = simple_transform_bias_rescale_qkv( + dense_x, qkv.bias + ) + if use_nt and use_padding: + for t in (correct_q, correct_k, correct_v): + t[t == float("-Inf")] = 0 + + self.assertEqual(q.size(), correct_q.size()) + torch.testing.assert_close(q, correct_q) + torch.testing.assert_close(k, correct_k) + torch.testing.assert_close(v, correct_v) + + @dtypesIfPRIVATEUSE1(torch.float) + @dtypes(torch.float) + @skipMeta + def test_transform_bias_rescale_qkv(self, device, dtype): + for use_padding in (False, True): + with self.subTest(use_padding=use_padding): + self._test_transform_bias_rescale_qkv_impl( + device, dtype, use_nt=False, use_padding=use_padding + ) + + @unittest.skip("NPU currently do not support nested tensor.") + @dtypesIfPRIVATEUSE1(torch.float) + @dtypes(torch.float) + @skipMeta + @onlyPRIVATEUSE1 + def test_transform_bias_rescale_qkv_nested(self, device, dtype): + for use_padding in (False, True): + with self.subTest(use_padding=use_padding): + self._test_transform_bias_rescale_qkv_impl( + device, dtype, use_nt=True, use_padding=use_padding + ) + + # pylint:disable = huawei-too-many-arguments + def _test_multihead_attention_impl( + self, device, dtype, mode, use_nt, need_weights, average_attn_weights, use_padding=False, pad_all=False + ): + embed_dim = 64 + num_heads = 4 + bs = 16 + sl = 8 + + q = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3 + if use_padding: + if pad_all: + for q_i in q: + q_i[-1] = torch.zeros_like(q[0][-1], device=device, dtype=torch.float32) + mask = torch.zeros(q.shape[:-1], device=device, dtype=torch.bool) + for mask_i in mask: + mask_i[-1] = True + else: + q[0][-1] = torch.zeros_like(q[0][-1], device=device, dtype=torch.float32) + mask = torch.zeros(q.shape[:-1], device=device, dtype=torch.bool) + mask[0][-1] = True + if mode == "self": + k = q + v = q + elif mode == "encdec": + k = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3 + v = k + elif mode == "generic": + k = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3 + v = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3 + else: + self.fail(f"invalid mode `{mode}`!") + + qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=torch.float32) + native_qkv = copy.deepcopy(qkv).to(dtype=dtype) + + proj = torch.nn.Linear(embed_dim, embed_dim, 
device=device, dtype=torch.float32) + native_proj = copy.deepcopy(proj).to(dtype=dtype) + + pt = torch.nn.MultiheadAttention( + embed_dim, num_heads, batch_first=True, device=device, dtype=torch.float32 + ) + + pt.in_proj_weight = qkv.weight + pt.in_proj_bias = qkv.bias + pt.out_proj.weight = proj.weight + pt.out_proj.bias = proj.bias + + class NativeMHA(torch.nn.Module): + def __init__(self, embed_dim, num_heads, qkv, proj): + super().__init__() + self.qkv = qkv + self.proj = proj + self.embed_dim = embed_dim + self.num_heads = num_heads + + def forward(self, q, k, v, key_padding_mask): + return torch._native_multi_head_attention( + q, + k, + v, + self.embed_dim, + self.num_heads, + self.qkv.weight, + self.qkv.bias, + self.proj.weight, + self.proj.bias, + key_padding_mask, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + mask_type=1, # mask_type = 1 => src_key_padding_mask, mask_type = 0 => src_mask + ) + + npt = NativeMHA( + embed_dim=embed_dim, num_heads=num_heads, qkv=native_qkv, proj=native_proj + ).to(dtype) + + if device == "npu": + pt = pt.npu() + npt = npt.npu() + + ypt, weight_pt = pt( + q, + k, + v, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + key_padding_mask=mask if use_padding else None, + ) + if use_nt: + qs = list(torch.unbind(q)) + if use_padding: + if pad_all: + qs = [x[:-1] for x in qs] + else: + qs[0] = qs[0][:-1] + q = torch.nested.nested_tensor(qs, device=device, dtype=dtype) + if mode == "self": + k = v = q + elif mode == "encdec": + k = torch.nested.nested_tensor(torch.unbind(k), device=device, dtype=dtype) + v = k + else: + k = torch.nested.nested_tensor(torch.unbind(k), device=device, dtype=dtype) + v = torch.nested.nested_tensor(torch.unbind(v), device=device, dtype=dtype) + + native_q = q.to(dtype=dtype) + native_k = k.to(dtype=dtype) + native_v = v.to(dtype=dtype) + + ynpt, weight_npt = npt( + native_q, native_k, native_v, key_padding_mask=mask if use_padding and not use_nt else None + ) + if use_nt: + ynpt = ynpt.to_padded_tensor(0) + if pad_all: + ynpt_final = torch.zeros_like(ypt) + ynpt_final[:, :ynpt.shape[1], :] = ynpt + ynpt = ynpt_final + + def do_pad_all(tensors): + for t in tensors: + for t_i in t: + t_i[-1] = torch.zeros_like(t_i[-1], device=device, dtype=dtype) + + # PyTorch implementation returns non-zero junk in the padding + # locations; overwrite it so that the comparison works out. + if use_padding: + ypt[0][-1] = torch.zeros_like(ypt[0][-1], device=device, dtype=dtype) + ynpt[0][-1] = torch.zeros_like(ynpt[0][-1], device=device, dtype=dtype) + if pad_all: + do_pad_all((ypt, ynpt)) + # Zero the last row of each TxT weight matrix + if need_weights: + if average_attn_weights: + weight_pt[0][-1] = torch.zeros_like(weight_pt[0][-1], device=device, dtype=dtype) + weight_npt[0][-1] = torch.zeros_like(weight_npt[0][-1], device=device, dtype=dtype) + if pad_all: + do_pad_all((weight_pt, weight_npt)) + else: + for nh in range(num_heads): + weight_pt[0][nh][-1] = torch.zeros_like(weight_pt[0][nh][-1], device=device, dtype=dtype) + weight_npt[0][nh][-1] = torch.zeros_like(weight_npt[0][nh][-1], device=device, dtype=dtype) + + if dtype == torch.half: + torch.testing.assert_close(ypt, ynpt.to(torch.float32), atol=1e-3, rtol=1e-3) + else: + # High rtol seems necessary for + # test_native_multihead_attention_cpu_float32 on Windows, + # otherwise 2e-4 would likely be fine. 
+ torch.testing.assert_close(ypt, ynpt, atol=2e-5, rtol=2e-3) + + if need_weights: + torch.testing.assert_close(weight_pt, weight_npt.to(torch.float32), atol=5e-4, rtol=5e-4) + else: + self.assertEqual(weight_pt, weight_npt) + + # NPU currently do not support nested tensor, we set use_nt=False. + # NPU currently do not support calculate with key_padding_mask, we set use_padding=False. + @dtypesIfPRIVATEUSE1(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @parametrize("use_nt", [False]) + @parametrize("use_padding, pad_all", [(False, False), (False, True)]) + @parametrize("need_weights", [False]) + @parametrize("average_attn_weights", [False, True]) + @parametrize("fused", [False, True]) + @torch.no_grad() + # pylint:disable = huawei-too-many-arguments + def test_native_multihead_self_attention(self, device, dtype, use_nt, + need_weights, average_attn_weights, use_padding, pad_all, fused): + for need_weights in (False, not pad_all): + with self.subTest(use_padding=use_padding, pad_all=pad_all, + use_nt=use_nt, need_weights=need_weights, + average_attn_weights=average_attn_weights): + # NPU do not use sdp_kernel, here we simply call _test_multihead_attention_impl. + if "npu" in device: + self._test_multihead_attention_impl( + device, + dtype, + "self", + use_nt=use_nt, + use_padding=use_padding, + pad_all=pad_all, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + ) + else: + with torch.backends.npu.sdp_kernel( + enable_flash=False, enable_mem_efficient=False + ) if not fused else torch.backends.npu.sdp_kernel( + enable_flash=True, enable_mem_efficient=True + ): + self._test_multihead_attention_impl( + device, + dtype, + "self", + use_nt=use_nt, + use_padding=use_padding, + pad_all=pad_all, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + ) + + @dtypesIfPRIVATEUSE1(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @torch.no_grad() + def test_native_multihead_encoder_decoder_attention(self, device, dtype): + self._test_multihead_attention_impl( + device, + dtype, + "encdec", + use_nt=False, + need_weights=False, + average_attn_weights=False, + ) + + @dtypesIfPRIVATEUSE1(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @torch.no_grad() + def test_native_multihead_attention(self, device, dtype): + self._test_multihead_attention_impl( + device, + dtype, + "generic", + use_nt=False, + need_weights=False, + average_attn_weights=False, + ) + + +instantiate_device_type_tests(TestMHADeviceType, globals()) + +if __name__ == "__main__": + pass diff --git a/test/test_npu.py b/test/test_npu.py index 84462eeb9ec87c88d8b27247e764c35c044abb8c..ae108f2467cbc176ed4be5315a675cc6720984ce 100644 --- a/test/test_npu.py +++ b/test/test_npu.py @@ -3317,12 +3317,15 @@ exit(2) with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): torch.addmm(s, m1, m2) - @unittest.skipIf(not TEST_MULTINPU, "Testing on one NPU is sufficient") + @unittest.skipIf(TEST_MULTINPU, "Testing on one NPU is sufficient") def test_lazy_init(self): - """ Validate that no NPU calls are made during `import torch` call""" + """ Validate that no NPU calls are made during `import torch_npu` call""" from subprocess import check_output - VISIBLE_DEVICES = "HIP_VISIBLE_DEVICES" if TEST_WITH_ROCM else "CUDA_VISIBLE_DEVICES" - test_script = f"import os; import torch; import torch_npu; os.environ['{VISIBLE_DEVICES}']='32';print(torch_npu.npu.device_count())" + VISIBLE_DEVICES = "HIP_VISIBLE_DEVICES" if TEST_WITH_ROCM else 
"ASCEND_RT_VISIBLE_DEVICES" + # Check that `rts` was not called during the import + # By using torch_npu._C._npu_getDeviceCount() because it will not change if `rts` was called + # torch_npu.npu.device_count() will parses ASCEND_RT_VISIBLE_DEVICES and will change along with it + test_script = f"import os; import torch; import torch_npu; os.environ['{VISIBLE_DEVICES}']='32';print(torch_npu._C._npu_getDeviceCount())" rc = check_output([sys.executable, '-c', test_script]).decode("ascii").strip() self.assertEqual(rc, "0") diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 7d85362d113ae8f2a5c1516a7b5fb2b9c88e17f8..8b7424862e70c1d328a4b367ed1d27c8ccf140db 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -1,10 +1,4 @@ { - "torch_npu.contrib.BiLSTM": { - "signature": "(input_size, hidden_size)" - }, - "torch_npu.contrib.BiLSTM.forward": { - "signature": "(self, inputs)" - }, "torch_npu.contrib.LinearQuant": { "signature": "(in_features: int, out_features: int, *, bias: bool = True, offset: bool = False, pertoken_scale: bool = False, device=None, dtype=None, output_dtype=None) -> None" }, @@ -20,30 +14,12 @@ "torch_npu.contrib.ChannelShuffle.forward": { "signature": "(self, x1, x2)" }, - "torch_npu.contrib.DCNv2": { - "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" - }, - "torch_npu.contrib.DCNv2.init_param": { - "signature": "(self)" - }, - "torch_npu.contrib.DCNv2.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.Focus": { "signature": "(c1, c2, k=1, s=1, p=None, g=1, act=True)" }, "torch_npu.contrib.Focus.forward": { "signature": "(self, x)" }, - "torch_npu.contrib.FusedColorJitter": { - "signature": "(brightness=0, contrast=0, saturation=0, hue=0)" - }, - "torch_npu.contrib.FusedColorJitter._check_input": { - "signature": "(self, value, name, center=1, bound=(0, inf), clip_first_on_zero=True)" - }, - "torch_npu.contrib.FusedColorJitter.forward": { - "signature": "(self, img)" - }, "torch_npu.contrib.LabelSmoothingCrossEntropy": { "signature": "(num_classes=1000, smooth_factor=0.0)" }, @@ -62,12 +38,6 @@ "torch_npu.contrib.LinearWeightQuant.forward": { "signature": "(self, x: torch.Tensor) -> torch.Tensor" }, - "torch_npu.contrib.Mish": { - "signature": "()" - }, - "torch_npu.contrib.Mish.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.ModulatedDeformConv": { "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" }, @@ -167,18 +137,6 @@ "torch_npu.contrib.ROIAlign.forward": { "signature": "(self, input_tensor, rois)" }, - "torch_npu.contrib.SiLU": { - "signature": "()" - }, - "torch_npu.contrib.SiLU.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.Swish": { - "signature": "()" - }, - "torch_npu.contrib.Swish.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.matmul_transpose": { "signature": "(*args, **kwargs)" }, @@ -194,12 +152,6 @@ "torch_npu.contrib.npu_bbox_coder_encode_yolo": { "signature": "(bboxes, gt_bboxes, stride)" }, - "torch_npu.contrib.npu_ciou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, - "torch_npu.contrib.npu_diou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, "torch_npu.contrib.npu_fast_condition_index_put": { "signature": "(x, condition, value)" }, @@ -209,18 +161,9 @@ "torch_npu.contrib.npu_fused_attention_with_layernorm": { 
"signature": "(*args, **kwargs)" }, - "torch_npu.contrib.npu_giou": { - "signature": "(boxes1, boxes2, is_permuted=True)" - }, - "torch_npu.contrib.npu_iou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, "torch_npu.contrib.npu_multiclass_nms": { "signature": "(multi_bboxes, multi_scores, score_thr=0.05, nms_thr=0.45, max_num=50, score_factors=None)" }, - "torch_npu.contrib.npu_ptiou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, "torch_npu.contrib.npu_single_level_responsible_flags": { "signature": "(featmap_size, gt_bboxes, stride, num_base_anchors)" }, @@ -248,12 +191,6 @@ "torch_npu.contrib.function.npu_bbox_coder_encode_yolo": { "signature": "(bboxes, gt_bboxes, stride)" }, - "torch_npu.contrib.function.npu_ciou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, - "torch_npu.contrib.function.npu_diou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, "torch_npu.contrib.function.npu_fast_condition_index_put": { "signature": "(x, condition, value)" }, @@ -263,18 +200,9 @@ "torch_npu.contrib.function.npu_fused_attention_with_layernorm": { "signature": "(*args, **kwargs)" }, - "torch_npu.contrib.function.npu_giou": { - "signature": "(boxes1, boxes2, is_permuted=True)" - }, - "torch_npu.contrib.function.npu_iou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, "torch_npu.contrib.function.npu_multiclass_nms": { "signature": "(multi_bboxes, multi_scores, score_thr=0.05, nms_thr=0.45, max_num=50, score_factors=None)" }, - "torch_npu.contrib.function.npu_ptiou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, "torch_npu.contrib.function.npu_single_level_responsible_flags": { "signature": "(featmap_size, gt_bboxes, stride, num_base_anchors)" }, @@ -308,21 +236,6 @@ "torch_npu.contrib.function.index_op.npu_fast_condition_index_put": { "signature": "(x, condition, value)" }, - "torch_npu.contrib.function.iou.npu_ciou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, - "torch_npu.contrib.function.iou.npu_diou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, - "torch_npu.contrib.function.iou.npu_giou": { - "signature": "(boxes1, boxes2, is_permuted=True)" - }, - "torch_npu.contrib.function.iou.npu_iou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, - "torch_npu.contrib.function.iou.npu_ptiou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, "torch_npu.contrib.function.matmul_transpose.matmul_transpose": { "signature": "(*args, **kwargs)" }, @@ -338,12 +251,6 @@ "torch_npu.contrib.function.roll.roll": { "signature": "(x, shifts, dims)" }, - "torch_npu.contrib.module.BiLSTM": { - "signature": "(input_size, hidden_size)" - }, - "torch_npu.contrib.module.BiLSTM.forward": { - "signature": "(self, inputs)" - }, "torch_npu.contrib.module.ChannelShuffle": { "signature": "(in_channels, groups=2, split_shuffle=True)" }, @@ -353,15 +260,6 @@ "torch_npu.contrib.module.ChannelShuffle.forward": { "signature": "(self, x1, x2)" }, - "torch_npu.contrib.module.DCNv2": { - "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" - }, - "torch_npu.contrib.module.DCNv2.init_param": { - "signature": "(self)" - 
}, - "torch_npu.contrib.module.DCNv2.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.module.DropoutWithByteMask": { "signature": "(p=0.5, inplace=False, max_seed=1023)" }, @@ -374,15 +272,6 @@ "torch_npu.contrib.module.Focus.forward": { "signature": "(self, x)" }, - "torch_npu.contrib.module.FusedColorJitter": { - "signature": "(brightness=0, contrast=0, saturation=0, hue=0)" - }, - "torch_npu.contrib.module.FusedColorJitter._check_input": { - "signature": "(self, value, name, center=1, bound=(0, inf), clip_first_on_zero=True)" - }, - "torch_npu.contrib.module.FusedColorJitter.forward": { - "signature": "(self, img)" - }, "torch_npu.contrib.module.LinearQuant": { "signature": "(in_features: int, out_features: int, *, bias: bool = True, offset: bool = False, pertoken_scale: bool = False, device=None, dtype=None, output_dtype=None) -> None" }, @@ -413,12 +302,6 @@ "torch_npu.contrib.module.LinearWeightQuant.forward": { "signature": "(self, x: torch.Tensor) -> torch.Tensor" }, - "torch_npu.contrib.module.Mish": { - "signature": "()" - }, - "torch_npu.contrib.module.Mish.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.module.ModulatedDeformConv": { "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" }, @@ -518,42 +401,6 @@ "torch_npu.contrib.module.ROIAlign.forward": { "signature": "(self, input_tensor, rois)" }, - "torch_npu.contrib.module.SiLU": { - "signature": "()" - }, - "torch_npu.contrib.module.SiLU.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.module.Swish": { - "signature": "()" - }, - "torch_npu.contrib.module.Swish.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.module.activations.Mish": { - "signature": "()" - }, - "torch_npu.contrib.module.activations.Mish.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.module.activations.SiLU": { - "signature": "()" - }, - "torch_npu.contrib.module.activations.SiLU.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.module.activations.Swish": { - "signature": "()" - }, - "torch_npu.contrib.module.activations.Swish.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.module.bidirectional_lstm.BiLSTM": { - "signature": "(input_size, hidden_size)" - }, - "torch_npu.contrib.module.bidirectional_lstm.BiLSTM.forward": { - "signature": "(self, inputs)" - }, "torch_npu.contrib.module.channel_shuffle.ChannelShuffle": { "signature": "(in_channels, groups=2, split_shuffle=True)" }, @@ -593,15 +440,6 @@ "torch_npu.contrib.module.crossentropy.LabelSmoothingCrossEntropy.forward": { "signature": "(self, pred, target)" }, - "torch_npu.contrib.module.deform_conv.DCNv2": { - "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" - }, - "torch_npu.contrib.module.deform_conv.DCNv2.init_param": { - "signature": "(self)" - }, - "torch_npu.contrib.module.deform_conv.DCNv2.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.module.deform_conv.ModulatedDeformConv": { "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" }, @@ -668,15 +506,6 @@ "torch_npu.contrib.module.focus.fast_slice": { "signature": "(x)" }, - "torch_npu.contrib.module.fusedcolorjitter.FusedColorJitter": { - "signature": "(brightness=0, contrast=0, saturation=0, hue=0)" - }, - 
"torch_npu.contrib.module.fusedcolorjitter.FusedColorJitter._check_input": { - "signature": "(self, value, name, center=1, bound=(0, inf), clip_first_on_zero=True)" - }, - "torch_npu.contrib.module.fusedcolorjitter.FusedColorJitter.forward": { - "signature": "(self, img)" - }, "torch_npu.contrib.module.linear_a8w8_quant.LinearA8W8Quant": { "signature": "(in_features: int, out_features: int, *, bias: bool = True, offset: bool = False, pertoken_scale: bool = False, device=None, dtype=None, output_dtype=None) -> None" }, @@ -908,6 +737,195 @@ "torch_npu.npu.DoubleStorage": { "signature": "(*args, wrap_storage=None, dtype=None, device=None, _internal=False)" }, + "torch_npu.utils.tensor_methods.npu_dtype_cast": { + "signature": "(self, dtype)" + }, + "torch_npu.npu.MemPoolContext.active_pool": { + "signature": "() -> Optional[torch_npu._C._MemPool]" + }, + "torch_npu.npu.memory.MemPoolContext.active_pool": { + "signature": "() -> Optional[torch_npu._C._MemPool]" + }, + "torch_npu.npu.graphs.graph_task_group_begin": { + "signature": "(stream)" + }, + "torch_npu.npu.graphs.NPUGraph": { + "signature": "()" + }, + "torch_npu.profiler.experimental_config._ExperimentalConfig._conver_export_type_to_list": { + "signature": "(self, export_type: Union[str, list]) -> list" + }, + "torch_npu.npu.utils.get_cann_version": { + "signature": "(module='CANN')" + }, + "torch_npu.contrib.module.activations.Mish": { + "signature": "()" + }, + "torch_npu.npu.graphs.graph": { + "signature": "(npu_graph, pool=None, stream=None, capture_error_mode: str = 'global')" + }, + "torch_npu.npu.graphs.graph_task_update_end": { + "signature": "(stream)" + }, + "torch_npu.npu.graph_pool_handle": { + "signature": "()" + }, + "torch_npu.contrib.module.bidirectional_lstm.BiLSTM.forward": { + "signature": "(self, inputs)" + }, + "torch_npu.npu.graphs.is_current_stream_capturing": { + "signature": "()" + }, + "torch_npu.npu_diou": { + "signature": "(self, gtboxes, trans=False, is_cross=False, mode=0)" + }, + "torch_npu.npu.graphs.NPUGraph.capture_begin": { + "signature": "(self, pool=None, capture_error_mode='global')" + }, + "torch_npu.npu_giou": { + "signature": "(self, gtboxes, trans=False, is_cross=False, mode=0)" + }, + "torch_npu.npu.memory.MemPoolContext": { + "signature": "(pool: torch_npu.npu.memory.MemPool)" + }, + "torch_npu.npu_iou": { + "signature": "(bboxes, gtboxes, mode=0)" + }, + "torch_npu.contrib.module.activations.Mish.forward": { + "signature": "(self, x)" + }, + "torch_npu.npu.graphs.graph_pool_handle": { + "signature": "()" + }, + "torch_npu.npu.NPUGraph.capture_end": { + "signature": "(self)" + }, + "torch_npu.npu_mla_prolog": { + "signature": "(*args, **kwargs)" + }, + "torch_npu.npu.MemPoolContext": { + "signature": "(pool: torch_npu.npu.memory.MemPool)" + }, + "torch_npu.profiler._ExperimentalConfig._conver_export_type_to_list": { + "signature": "(self, export_type: Union[str, list]) -> list" + }, + "torch_npu.npu.memory.MemPool": { + "signature": "(allocator: Optional[torch_npu._C._npu_NPUAllocator] = None)" + }, + "torch_npu.npu.graphs.graph_task_group_end": { + "signature": "(stream)" + }, + "torch_npu.contrib.module.bidirectional_lstm.BiLSTM": { + "signature": "(input_size, hidden_size)" + }, + "torch_npu.npu.graphs.NPUGraph.reset": { + "signature": "(self)" + }, + "torch_npu.npu_cross_entropy_loss": { + "signature": "(*args, **kwargs)" + }, + "torch_npu.npu.memory.use_mem_pool": { + "signature": "(pool: torch_npu.npu.memory.MemPool, device=None)" + }, + "torch_npu.npu.streams.ExternalEvent.reset": { + 
"signature": "(self, stream=None)" + }, + "torch_npu.npu.graphs.graph_task_update_begin": { + "signature": "(stream, handle)" + }, + "torch_npu.npu_grouped_matmul_finalize_routing": { + "signature": "(*args, **kwargs)" + }, + "torch_npu.npu.streams.ExternalEvent.record": { + "signature": "(self, stream=None)" + }, + "torch_npu.npu_advance_step_flashattn": { + "signature": "(*args, **kwargs)" + }, + "torch_npu.contrib.module.activations.Swish.forward": { + "signature": "(self, x)" + }, + "torch_npu.npu.NPUGraph.reset": { + "signature": "(self)" + }, + "torch_npu.npu.use_mem_pool": { + "signature": "(pool: torch_npu.npu.memory.MemPool, device=None)" + }, + "torch_npu.npu.MemPool": { + "signature": "(allocator: Optional[torch_npu._C._npu_NPUAllocator] = None)" + }, + "torch_npu.npu.streams.ExternalEvent.wait": { + "signature": "(self, stream=None)" + }, + "torch_npu.npu.NPUGraph": { + "signature": "()" + }, + "torch_npu.npu.graphs.make_graphed_callables": { + "signature": "(callables, sample_args, num_warmup_iters=3, allow_unused_input=False, pool=None)" + }, + "torch_npu.npu.streams.ExternalEvent": { + "signature": "()" + }, + "torch_npu.npu.NPUGraph.pool": { + "signature": "(self)" + }, + "torch_npu.contrib.module.activations.SiLU.forward": { + "signature": "(self, x)" + }, + "torch_npu.empty_with_swapped_memory": { + "signature": "(*args, **kwargs)" + }, + "torch_npu.npu_gmm_alltoallv": { + "signature": "(*args, **kwargs)" + }, + "torch_npu.npu_mrope": { + "signature": "(*args, **kwargs)" + }, + "torch_npu.npu.streams.Event.recorded_time": { + "signature": "(self)" + }, + "torch_npu.npu.make_graphed_callables": { + "signature": "(callables, sample_args, num_warmup_iters=3, allow_unused_input=False, pool=None)" + }, + "torch_npu.npu.NPUGraph.capture_begin": { + "signature": "(self, pool=None, capture_error_mode='global')" + }, + "torch_npu.npu.graph": { + "signature": "(npu_graph, pool=None, stream=None, capture_error_mode: str = 'global')" + }, + "torch_npu.npu.Event.recorded_time": { + "signature": "(self)" + }, + "torch_npu.npu_alltoallv_gmm": { + "signature": "(*args, **kwargs)" + }, + "torch_npu.npu.graphs.NPUGraph.replay": { + "signature": "(self)" + }, + "torch_npu.contrib.module.activations.Swish": { + "signature": "()" + }, + "torch_npu.npu.NPUGraph.replay": { + "signature": "(self)" + }, + "torch_npu.npu_group_norm_swish": { + "signature": "(*args, **kwargs)" + }, + "torch_npu.npu.graphs.NPUGraph.capture_end": { + "signature": "(self)" + }, + "torch_npu.npu.is_current_stream_capturing": { + "signature": "()" + }, + "torch_npu.npu.graphs.NPUGraph.pool": { + "signature": "(self)" + }, + "torch_npu.npu_ciou": { + "signature": "(self, gtboxes, trans=False, is_cross=True, mode=0, atan_sub_flag=False)" + }, + "torch_npu.contrib.module.activations.SiLU": { + "signature": "()" + }, "torch_npu.npu.Event": { "signature": "(enable_timing=False, blocking=False, interprocess=False)" }, @@ -926,6 +944,30 @@ "torch_npu.npu.Event.synchronize": { "signature": "(self)" }, + "torch_npu.npu.ExternalEvent": { + "signature": "()" + }, + "torch_npu.npu.ExternalEvent.record": { + "signature": "(self, stream=None)" + }, + "torch_npu.npu.ExternalEvent.wait": { + "signature": "(self, stream=None)" + }, + "torch_npu.npu.ExternalEvent.reset": { + "signature": "(self, stream=None)" + }, + "torch_npu.npu.graph_task_group_begin": { + "signature": "(stream)" + }, + "torch_npu.npu.graph_task_group_end": { + "signature": "(stream)" + }, + "torch_npu.npu.graph_task_update_begin": { + "signature": "(stream, handle)" 
+ }, + "torch_npu.npu.graph_task_update_end": { + "signature": "(stream)" + }, "torch_npu.npu.FloatStorage": { "signature": "(*args, wrap_storage=None, dtype=None, device=None, _internal=False)" }, @@ -1022,6 +1064,9 @@ "torch_npu.npu.check_uce_in_memory": { "signature": "(device_id)" }, + "torch_npu.npu.get_uce_addr": { + "signature": "()" + }, "torch_npu.npu.clear_npu_overflow_flag": { "signature": "()" }, @@ -1041,11 +1086,14 @@ "signature": "(device)" }, "torch_npu.npu.device_count": { - "signature": "()" + "signature": "() -> int" }, "torch_npu.npu.device_of": { "signature": "(obj)" }, + "torch_npu.npu.StreamContext": { + "signature": "(stream_ctx: Optional[ForwardRef('torch_npu.npu.Stream')])" + }, "torch_npu.npu.disable_deterministic_with_backward": { "signature": "(tensor: torch.Tensor)" }, @@ -1158,16 +1206,16 @@ "signature": "()" }, "torch_npu.npu.mstx.mark": { - "signature": "(message='')" + "signature": "(message: str, stream=None, domain: str = 'default')" }, "torch_npu.npu.mstx.mstx_range": { - "signature": "(message: str, stream=None)" + "signature": "(message: str, stream=None, domain: str = 'default')" }, "torch_npu.npu.mstx.range_start": { - "signature": "(message: str, stream=None) -> int" + "signature": "(message: str, stream=None, domain: str = 'default') -> int" }, "torch_npu.npu.mstx.range_end": { - "signature": "(range_id: int)" + "signature": "(range_id: int, domain: str = 'default')" }, "torch_npu.npu.reset_accumulated_memory_stats": { "signature": "(device=None)" @@ -1446,16 +1494,16 @@ "signature": "()" }, "torch_npu.npu.mstx.mstx.mark": { - "signature": "(message='')" + "signature": "(message: str, stream=None, domain: str = 'default')" }, "torch_npu.npu.mstx.mstx.range_start": { - "signature": "(message: str, stream=None) -> int" + "signature": "(message: str, stream=None, domain: str = 'default') -> int" }, "torch_npu.npu.mstx.mstx.range_end": { - "signature": "(range_id: int)" + "signature": "(range_id: int, domain: str = 'default')" }, "torch_npu.npu.mstx.mstx.mstx_range": { - "signature": "(message: str, stream=None)" + "signature": "(message: str, stream=None, domain: str = 'default')" }, "torch_npu.npu.npu_config.finalize_dump": { "signature": "()" @@ -1575,11 +1623,14 @@ "signature": "(device)" }, "torch_npu.npu.utils.device_count": { - "signature": "()" + "signature": "() -> int" }, "torch_npu.npu.utils.device_of": { "signature": "(obj)" }, + "torch_npu.npu.utils.StreamContext": { + "signature": "(stream_ctx: Optional[ForwardRef('torch_npu.npu.Stream')])" + }, "torch_npu.npu.utils.finalize_dump": { "signature": "()" }, @@ -2076,11 +2127,14 @@ "signature": "()" }, "torch_npu.profiler._ExperimentalConfig": { - "signature": "(profiler_level: int = 'Level0', aic_metrics: int = 'ACL_AICORE_NONE', l2_cache: bool = False, msprof_tx: bool = False, data_simplification: bool = True, record_op_args: bool = False, op_attr: bool = False, gc_detect_threshold: float = None, export_type: Union[str, list] = None)" + "signature": "(profiler_level: int = 'Level0', aic_metrics: int = 'ACL_AICORE_NONE', l2_cache: bool = False, msprof_tx: bool = False, data_simplification: bool = True, record_op_args: bool = False, op_attr: bool = False, gc_detect_threshold: float = None, export_type: Union[str, list] = None, host_sys: list = None, sys_io: bool = False, sys_interconnection: bool = False, mstx_domain_include: list = None, mstx_domain_exclude: list = None)" }, "torch_npu.profiler._ExperimentalConfig._check_params": { "signature": "(self)" }, + 
"torch_npu.profiler._ExperimentalConfig._check_mstx_domain_params": { + "signature": "(self)" + }, "torch_npu.profiler.profile": { "signature": "(*, activities: Optional[Iterable[torch_npu._C._profiler.ProfilerActivity]] = None, schedule: Optional[Callable[[int], torch_npu.profiler.scheduler.ProfilerAction]] = None, on_trace_ready: Optional[Callable[..., Any]] = None, record_shapes: bool = False, profile_memory: bool = False, with_stack: bool = False, with_flops: bool = False, with_modules: bool = False, experimental_config: Optional[torch_npu.profiler.experimental_config._ExperimentalConfig] = None, use_cuda: Optional[bool] = None)" }, @@ -2148,11 +2202,14 @@ "signature": "()" }, "torch_npu.profiler.experimental_config._ExperimentalConfig": { - "signature": "(profiler_level: int = 'Level0', aic_metrics: int = 'ACL_AICORE_NONE', l2_cache: bool = False, msprof_tx: bool = False, data_simplification: bool = True, record_op_args: bool = False, op_attr: bool = False, gc_detect_threshold: float = None, export_type: Union[str, list] = None)" + "signature": "(profiler_level: int = 'Level0', aic_metrics: int = 'ACL_AICORE_NONE', l2_cache: bool = False, msprof_tx: bool = False, data_simplification: bool = True, record_op_args: bool = False, op_attr: bool = False, gc_detect_threshold: float = None, export_type: Union[str, list] = None, host_sys: list = None, sys_io: bool = False, sys_interconnection: bool = False, mstx_domain_include: list = None, mstx_domain_exclude: list = None)" }, "torch_npu.profiler.experimental_config._ExperimentalConfig._check_params": { "signature": "(self)" }, + "torch_npu.profiler.experimental_config._ExperimentalConfig._check_mstx_domain_params": { + "signature": "(self)" + }, "torch_npu.profiler.experimental_config.supported_ai_core_metrics": { "signature": "()" }, @@ -2193,7 +2250,7 @@ "signature": "(self)" }, "torch_npu.profiler.profiler.analyse": { - "signature": "(profiler_path: str, max_process_number: int = 36)" + "signature": "(profiler_path: str, max_process_number: int = 36, export_type: Union[str, list] = None)" }, "torch_npu.profiler.profiler.profile": { "signature": "(*, activities: Optional[Iterable[torch_npu._C._profiler.ProfilerActivity]] = None, schedule: Optional[Callable[[int], torch_npu.profiler.scheduler.ProfilerAction]] = None, on_trace_ready: Optional[Callable[..., Any]] = None, record_shapes: bool = False, profile_memory: bool = False, with_stack: bool = False, with_flops: bool = False, with_modules: bool = False, experimental_config: Optional[torch_npu.profiler.experimental_config._ExperimentalConfig] = None, use_cuda: Optional[bool] = None)" @@ -2480,18 +2537,9 @@ "torch_npu.utils.tensor_methods.npu_confusion_transpose": { "signature": "(self, perm, shape, transpose_first)" }, - "torch_npu.utils.tensor_methods.npu_dtype_cast": { - "signature": "(self, dtype)" - }, "torch_npu._npu_dropout": { "signature": "(*args, **kwargs)" }, - "torch_npu.copy_memory_": { - "signature": "(*args, **kwargs)" - }, - "torch_npu.empty_with_format": { - "signature": "(*args, **kwargs)" - }, "torch_npu.fast_gelu": { "signature": "(self)" }, @@ -2504,9 +2552,6 @@ "torch_npu.npu_anti_quant": { "signature": "(x, scale, offset=None, dst_dtype=None, src_dtype=None)" }, - "torch_npu.npu_apply_adam": { - "signature": "(*args, **kwargs)" - }, "torch_npu.npu_batch_nms": { "signature": "(self, scores, score_threshold, iou_threshold, max_size_per_class, max_total_size, change_coordinate_frame=False, transpose_box=False)" }, @@ -2522,48 +2567,24 @@ 
"torch_npu.npu_bounding_box_encode": { "signature": "(anchor_box, ground_truth_box, means0, means1, means2, means3, stds0, stds1, stds2, stds3)" }, - "torch_npu.npu_broadcast": { - "signature": "(self, size, out=None)" - }, - "torch_npu.npu_ciou": { - "signature": "(self, gtboxes, trans=False, is_cross=True, mode=0, atan_sub_flag=False)" - }, "torch_npu.npu_clear_float_status": { "signature": "(*args, **kwargs)" }, "torch_npu.npu_confusion_transpose": { "signature": "(self, perm, shape, transpose_first)" }, - "torch_npu.npu_conv2d": { - "signature": "(input_, weight, bias, stride, padding, dilation, groups)" - }, "torch_npu.npu_conv3d": { "signature": "(input_, weight, bias, stride, padding, dilation, groups)" }, - "torch_npu.npu_conv_transpose2d": { - "signature": "(input_, weight, bias, padding, output_padding, stride, dilation, groups)" - }, "torch_npu.npu_convert_weight_to_int4pack": { "signature": "(*args, **kwargs)" }, - "torch_npu.npu_convolution": { - "signature": "(input_, weight, bias, stride, padding, dilation, groups)" - }, - "torch_npu.npu_convolution_transpose": { - "signature": "(input_, weight, bias, padding, output_padding, stride, dilation, groups)" - }, "torch_npu.npu_deformable_conv2d": { "signature": "(inputs, weight, offset, bias, kernel_size, stride, padding, dilation=[1, 1, 1, 1], groups=1, deformable_groups=1, modulated=True)" }, - "torch_npu.npu_diou": { - "signature": "(self, gtboxes, trans=False, is_cross=False, mode=0)" - }, "torch_npu.npu_dropout_with_add_softmax": { "signature": "(self, x1, alpha, prob, dim)" }, - "torch_npu.npu_dtype_cast": { - "signature": "(self, dtype)" - }, "torch_npu.npu_dynamic_quant": { "signature": "(input_dummy, smooth_scales=None)" }, @@ -2594,30 +2615,18 @@ "torch_npu.npu_get_float_status": { "signature": "(*args, **kwargs)" }, - "torch_npu.npu_giou": { - "signature": "(self, gtboxes, trans=False, is_cross=False, mode=0)" - }, "torch_npu.npu_grid_assign_positive": { "signature": "(self, overlaps, box_responsible_flags, max_overlaps, argmax_overlaps, gt_max_overlaps, gt_argmax_overlaps, num_gts, pos_iou_thr, min_pos_iou, gt_max_assign_all)" }, "torch_npu.npu_grouped_matmul": { "signature": "(*args, **kwargs)" }, - "torch_npu.npu_gru": { - "signature": "(inputs, hx, weight_input, weight_hidden, bias_input, bias_hidden, seq_length, has_biases, num_layers, dropout, train, bidirectional, batch_first)" - }, "torch_npu.npu_incre_flash_attention": { "signature": "(self, query, key, value, padding_mask, atten_mask, pse_shift, actual_seq_lengths, antiquant_scale, antiquant_offset, block_table, num_heads, scale_value, input_layout, num_key_value_heads, block_size, inner_precise)" }, "torch_npu.npu_indexing": { "signature": "(self, begin, end, strides, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0)" }, - "torch_npu.npu_iou": { - "signature": "(bboxes, gtboxes, mode=0)" - }, - "torch_npu.npu_layer_norm_eval": { - "signature": "(input_, normalized_shape, weight=None, bias=None, eps=1e-05)" - }, "torch_npu.npu_linear": { "signature": "(input_, weight, bias=None)" }, @@ -2627,12 +2636,6 @@ "torch_npu.npu_max": { "signature": "(self, dim, keepdim=False)" }, - "torch_npu.npu_min": { - "signature": "(self, dim, keepdim=False)" - }, - "torch_npu.npu_mish": { - "signature": "(self)" - }, "torch_npu.npu_mm_all_reduce_base": { "signature": "(x1, x2, hcom, reduce_op, bias, antiquant_scale, antiquant_offset, x3, dequant_scale, pertoken_scale, comm_quant_scale_1, comm_quant_scale_2, antiquant_group_size, comm_turn)" }, @@ 
-2645,9 +2648,6 @@ "torch_npu.npu_multi_head_attention": { "signature": "(query, key, value, query_weight, key_weight, value_weight, attn_mask, out_proj_weight, query_bias, key_bias, value_bias, out_proj_bias, dropout_mask, attn_head_num, attn_dim_per_head, src_len, tgt_len, dropout_prob, softmax_use_float)" }, - "torch_npu.npu_nms_rotated": { - "signature": "(*args, **kwargs)" - }, "torch_npu.npu_nms_v4": { "signature": "(self, scores, max_output_size, iou_threshold, scores_threshold, pad_to_max_output_size=False)" }, @@ -2666,9 +2666,6 @@ "torch_npu.npu_ps_roi_pooling": { "signature": "(self, rois, spatial_scale, group_size, output_dim)" }, - "torch_npu.npu_ptiou": { - "signature": "(bboxes, gtboxes, mode=0)" - }, "torch_npu.npu_quant_matmul": { "signature": "(*args, **kwargs)" }, @@ -2681,9 +2678,6 @@ "torch_npu.npu_random_choice_with_mask": { "signature": "(*args, **kwargs)" }, - "torch_npu.npu_reshape": { - "signature": "(self, shape, can_refresh=False, out=None)" - }, "torch_npu.npu_rms_norm": { "signature": "(self, gamma, epsilon=1e-06)" }, @@ -2714,18 +2708,12 @@ "torch_npu.npu_sign_bits_unpack": { "signature": "(inputs, size, dtype)" }, - "torch_npu.npu_silu": { - "signature": "(self)" - }, "torch_npu.npu_slice": { "signature": "(self, offsets, size)" }, "torch_npu.npu_softmax_cross_entropy_with_logits": { "signature": "(self, labels)" }, - "torch_npu.npu_sort_v2": { - "signature": "(self, dim=-1, descending=False, out=None)" - }, "torch_npu.npu_stride_copy": { "signature": "(self, shape, stride, storage_offset, out=None)" }, @@ -2735,6 +2723,9 @@ "torch_npu.npu_transpose": { "signature": "(self, perm, require_contiguous=True, out=None)" }, + "torch_npu.npu_transpose_batchmatmul": { + "signature": "(*args, **kwargs)" + }, "torch_npu.npu_weight_quant_batchmatmul": { "signature": "(x, weight, antiquant_scale, antiquant_offset=None, quant_scale=None, quant_offset=None, bias=None, antiquant_group_size=0, inner_precise=0)" }, @@ -2744,6 +2735,12 @@ "torch_npu.npu_group_norm_silu": { "signature": "(x, gamma, beta, group, eps=1e-05)" }, + "torch_npu.erase_stream": { + "signature": "(tensor, stream)" + }, + "torch_npu.matmul_checksum": { + "signature": "(a, b, c)" + }, "torch_npu.utils.FlopsCounter": { "signature": "()" }, @@ -2795,12 +2792,24 @@ "torch_npu.npu_all_gather_base_mm": { "signature": "(*args, **kwargs)" }, - "torch_npu.dynamo.torchair.ops.NpuStreamSwitch": { + "torch_npu.utils.set_thread_affinity": { + "signature": "(core_range: List[int] = None)" + }, + "torch_npu.utils.reset_thread_affinity": { + "signature": "()" + }, + "torch_npu.dynamo.torchair.scope.npu_stream_switch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, - "torch_npu.dynamo.torchair.ops.npu_wait_tensor": { + "torch_npu.dynamo.torchair.scope.npu_wait_tensor": { "signature": "(self: torch.Tensor, dependency: torch.Tensor)" }, + "torch_npu.dynamo.torchair.scope.super_kernel": { + "signature": "(scope: str, options: str = '')" + }, + "torch_npu.dynamo.torchair.scope.limit_core_num": { + "signature": "(op_aicore_num: int, op_vectorcore_num: int)" + }, "torch_npu.distributed.run.parse_args": { "signature": "(args)" }, @@ -2813,6 +2822,24 @@ "func: unsafe_empty_with_format": { "signature": "(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, bool keep_format=False) -> Tensor" }, + "func: empty_with_format.names": { + "signature": "(int[] size, Dimname[]? names, *, ScalarType? dtype=None, Layout? layout=None, Device? 
device=None, bool? pin_memory=None, int acl_format=2) -> Tensor" + }, + "func: empty_with_format": { + "signature": "(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, int? base_addr_aligned_kb=None) -> Tensor" + }, + "func: copy_memory_": { + "signature": "(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)" + }, + "func: _copy_from_and_resize": { + "signature": "" + }, + "func: resize_as_": { + "signature": "" + }, + "func: empty_with_swapped_memory": { + "signature": "(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor" + }, "func: npu_format_cast": { "signature": "(Tensor self, int acl_format) -> Tensor" }, @@ -2834,15 +2861,6 @@ "func: get_npu_format": { "signature": "(Tensor self) -> int" }, - "func: empty_with_format": { - "signature": "(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor" - }, - "func: empty_with_format.names": { - "signature": "(int[] size, Dimname[]? names, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor" - }, - "func: copy_memory_": { - "signature": "(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)" - }, "func: copy_": { "signature": "" }, @@ -2876,6 +2894,9 @@ "torch_npu_public_env: TASK_QUEUE_ENABLE": { "mode": "std::unordered_map taskQueueEnableMode = {{0, \"close\"}, {1, \"level 1\"}, {2, \"level 2\"}}" }, + "torch_npu_public_env: INF_NAN_MODE_FORCE_DISABLE": { + "mode": "std::unordered_map disableInfNanMode = {{0, \"enable\"}, {1, \"disable\"}}" + }, "torch_c_func: torch_npu::init_npu(const c10::DeviceIndex device_index = 0)": { "signature": "(const c10::DeviceIndex device_index = 0) -> void", "file": "torch_npu/csrc/libs/init_npu.h" @@ -3265,6 +3286,10 @@ "signature": "(c10::IntArrayRef sizes, const c10::TensorOptions& options, int64_t format, bool keep_format = false) -> at::Tensor", "file": "torch_npu/csrc/core/npu/NPUFormat.h" }, + "torch_c_func: at_npu::native::empty_with_swapped_memory": { + "signature": "(c10::IntArrayRef size, c10::optional dtype_opt, c10::optional device_opt) -> at::Tensor", + "file": "torch_npu/csrc/core/npu/NPUFormat.h" + }, "torch_c_func: c10_npu::c10_npu_get_error_message": { "signature": "() -> char *", "file": "torch_npu/csrc/core/npu/NPUException.h" diff --git a/test/trans_contiguous/test_as_strided_copy_to_contiguous.py b/test/trans_contiguous/test_as_strided_copy_to_contiguous.py index 7d04c40c980da14908c2daa7bd6f4b5e17c4a587..160cee442ff97aeb459f4fa711a1a895a1846a6b 100644 --- a/test/trans_contiguous/test_as_strided_copy_to_contiguous.py +++ b/test/trans_contiguous/test_as_strided_copy_to_contiguous.py @@ -19,7 +19,9 @@ class TestAsStridedCopyToContiguous(TestCase): with torch.autograd.profiler.profile(use_device='npu') as prof: output = torch.as_strided(input1, size, stride, storage_offset).contiguous() self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, [ - 'contiguous_h_combined']), True, "Error operators called!") + 'contiguous_h_combined']) or + check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") output = output.cpu().numpy() return output diff --git a/test/trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py b/test/trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py index 
9087109e7630f393c5c9e7b304b8a00e2814b291..d3737df42503e013748eb252b601b7cb57e861f8 100644 --- a/test/trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py +++ b/test/trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py @@ -28,15 +28,17 @@ class CombinedFlattenXCopyToContiguous(TestCase): # case 1: flatten+select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.flatten(2).select(1, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.flatten(2).select(1, 1).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: select+flatten == can be optimized as single select(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.select(2, 1).flatten(1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input.select(2, 1).flatten(1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -55,16 +57,18 @@ class CombinedFlattenXCopyToContiguous(TestCase): # case 1: flatten+strideslice ==> can be optimized as slice(contiguous with offset) + select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.flatten()[2:100:10].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape', 'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape', 'contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.flatten()[2:100:10].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: strideslice+flatten==> can be optimized as single strideslice # (contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 2:20:3].flatten().contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input[:, 2:20:3].flatten().contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) diff --git a/test/trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py b/test/trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py index 46e0fed4aba442f37e14c074f7a2d69cda91c174..3c79e895ed68e436de8cf332b7a8f9cbba253acc 100644 --- a/test/trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py +++ b/test/trans_contiguous/test_combined_reshape_x_copy_to_contiguous.py @@ -31,8 +31,9 @@ 
class CombinedReshapeXCopyToContiguous(TestCase): .view(npu_input.size(0) * npu_input.size(1), npu_input.size(2), npu_input.size(3)) \ .transpose(0, 1) \ .contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input \ .view(cpu_input.size(0) * cpu_input.size(1), cpu_input.size(2), cpu_input.size(3)) \ .transpose(0, 1) \ @@ -45,8 +46,9 @@ class CombinedReshapeXCopyToContiguous(TestCase): .permute(1, 0, 2, 3) \ .view(npu_input.size(1), npu_input.size(0), npu_input.size(2) * npu_input.size(3)) \ .contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input \ .permute(1, 0, 2, 3) \ .view(cpu_input.size(1), cpu_input.size(0), cpu_input.size(2) * cpu_input.size(3)) \ @@ -71,8 +73,9 @@ class CombinedReshapeXCopyToContiguous(TestCase): .view(npu_input.size(0), npu_input.size(1) * npu_input.size(2), npu_input.size(3)) \ .select(2, 1) \ .contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input \ .view(npu_input.size(0), npu_input.size(1) * npu_input.size(2), npu_input.size(3)) \ .select(2, 1) \ @@ -81,8 +84,9 @@ class CombinedReshapeXCopyToContiguous(TestCase): # case 2: select+view ==> can be optimized as reshape+narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.select(2, 1).view(npu_input.size(1), npu_input.size(0), -1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input.select(2, 1).view(npu_input.size(1), npu_input.size(0), -1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -101,15 +105,17 @@ class CombinedReshapeXCopyToContiguous(TestCase): # case 1: view + narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.view(20, 1200, 16)[:, 20:150, :].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.view(20, 1200, 16)[:, 20:150, :].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: narrow + view with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 10:19, :, :].view(20, 360, 16).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], 
prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input[:, 10:19, :, :].view(20, 360, 16).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -128,15 +134,17 @@ class CombinedReshapeXCopyToContiguous(TestCase): # case 1: view + strideslice with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.view(20, 1200, 10)[:, 20:150:3, :].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.view(20, 1200, 10)[:, 20:150:3, :].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: strideslice + view with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[10:19:3, :, :].view(3, 2400, 5).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input[10:19:3, :, :].view(3, 2400, 5).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) diff --git a/test/trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py b/test/trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py index 2e0e5ebee115a059ce75491a4bbbc716e70f85b8..4dccc03a36cd2ea84a6f9592ca7a16cc8237da3e 100644 --- a/test/trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py +++ b/test/trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py @@ -28,16 +28,18 @@ class CombinedSqueezeXCopyToContiguous(TestCase): # case 1: squeeze+permute ==> can be optimized as single permute(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.squeeze(1).transpose(0, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.squeeze(1).transpose(0, 1).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: permute+squeeze ==> can be optimized as single permute(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.permute(1, 0, 3, 2).squeeze(0).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input.permute(1, 0, 3, 2).squeeze(0).contiguous() 
self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -57,15 +59,17 @@ class CombinedSqueezeXCopyToContiguous(TestCase): # case 1: squeeze + narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.squeeze(1)[:, 1:10, :].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.squeeze(1)[:, 1:10, :].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: narrow + squeeze with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, :, :, 10:19].squeeze(1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input[:, :, :, 10:19].squeeze(1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -84,15 +88,17 @@ class CombinedSqueezeXCopyToContiguous(TestCase): # case 1: squeeze+select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.squeeze().select(2, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.squeeze().select(2, 1).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: select+squeeze with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.select(2, 1).squeeze().contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input.select(2, 1).squeeze().contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -111,15 +117,17 @@ class CombinedSqueezeXCopyToContiguous(TestCase): # case 1: squeeze + strideslice ==> cannot be optimized(contiguous_h_combined should not called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.squeeze(1)[:, 20:150:3].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.squeeze(1)[:, 20:150:3].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: strideslice + squeeze ==> cannot be optimized(contiguous_h_combined should not called) with 
torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, :, 10:19:3].squeeze(1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input[:, :, 10:19:3].squeeze(1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) diff --git a/test/trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py b/test/trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py index ed775befa1d3f1e383b30cb6e9cc3553a31b3e0c..e9e8318a5352120714b3f9c4634a3f37e1a130d8 100644 --- a/test/trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py +++ b/test/trans_contiguous/test_combined_unsqueeze_x_copy_to_contiguous.py @@ -29,8 +29,9 @@ class CombinedUnsqueezeXCopyToContiguous(TestCase): # (contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.unsqueeze(1).transpose(2, 3).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']) or + check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.unsqueeze(1).transpose(2, 3).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) @@ -38,8 +39,9 @@ class CombinedUnsqueezeXCopyToContiguous(TestCase): # (contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.permute(1, 0, 2, 3).unsqueeze(0).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']) or + check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input.permute(1, 0, 2, 3).unsqueeze(0).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -63,15 +65,17 @@ class CombinedUnsqueezeXCopyToContiguous(TestCase): # case 1: unsqueeze+narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.unsqueeze(0)[:, :, 1:10].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.unsqueeze(0)[:, :, 1:10].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: narrow+unsqueeze with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 1:10].unsqueeze(2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 
'contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input[:, 1:10].unsqueeze(2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -93,14 +97,16 @@ class CombinedUnsqueezeXCopyToContiguous(TestCase): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.unsqueeze(0).select(2, 1).contiguous() cpu_out1 = cpu_input.unsqueeze(0).select(2, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: select+unsqueeze with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.select(1, 1).unsqueeze(0).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input.select(1, 1).unsqueeze(0).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -119,17 +125,20 @@ class CombinedUnsqueezeXCopyToContiguous(TestCase): # case 1: unsqueeze+unfold:size==step ==> can be optimized as reshape+permute with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.unsqueeze(1).unfold(0, 2, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof), - True, "Error operators called!") - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.unsqueeze(1).unfold(0, 2, 2).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: unfold+unsqueeze: size!=step ==> cannot be optimized with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.unfold(2, 2, 3).unsqueeze(1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) or + check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input.unfold(2, 2, 3).unsqueeze(1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -149,15 +158,17 @@ class CombinedUnsqueezeXCopyToContiguous(TestCase): # case 1: squeeze + strideslice ==> cannot be optimized(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.unsqueeze(1)[:, :, 20:150:3].contiguous() - 
self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) or + check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.unsqueeze(1)[:, :, 20:150:3].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: strideslice + squeeze ==> cannot be optimized(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, :, 10:19:3].unsqueeze(0).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) or + check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input[:, :, 10:19:3].unsqueeze(0).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) diff --git a/test/trans_contiguous/test_combined_views_copy_to_contiguous.py b/test/trans_contiguous/test_combined_views_copy_to_contiguous.py index 554c71f6e3b971fcef61691b51e6702f8ab0814d..c5cde2e41bce1cbd06c79eb71153ece4b6e1b0b8 100644 --- a/test/trans_contiguous/test_combined_views_copy_to_contiguous.py +++ b/test/trans_contiguous/test_combined_views_copy_to_contiguous.py @@ -1,4 +1,6 @@ import os +from cgi import print_form + import numpy as np import torch import torch_npu @@ -26,16 +28,18 @@ class CombinedViewsCopyToContiguous(TestCase): # case 1: permute+narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.permute(1, 3, 2, 0)[:10].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice', 'contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice', 'contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.permute(1, 3, 2, 0)[:10].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: narrow+permute with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 1:10].permute(1, 0, 3, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice', 'contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice', 'contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input[:, 1:10].permute(1, 0, 3, 2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -52,16 +56,18 @@ class CombinedViewsCopyToContiguous(TestCase): # case 1: permute+select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.permute(1, 3, 2, 0).select(1, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice', 'contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice', 'contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, 
message="Error operators called!") cpu_out1 = cpu_input.permute(1, 3, 2, 0).select(1, 2).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: select+permute with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.select(1, 0).permute(1, 0, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice', 'contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice', 'contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input.select(1, 0).permute(1, 0, 2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -79,8 +85,9 @@ class CombinedViewsCopyToContiguous(TestCase): # (contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.permute(1, 3, 2, 0)[::2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.permute(1, 3, 2, 0)[::2].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) @@ -88,8 +95,9 @@ class CombinedViewsCopyToContiguous(TestCase): # (contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 1:10:3].permute(1, 3, 0, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input[:, 1:10:3].permute(1, 3, 0, 2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -107,14 +115,16 @@ class CombinedViewsCopyToContiguous(TestCase): # narrow at any dim + select the last dim ==> narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[:, 2:4].select(3, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input[:, 2:4].select(3, 1).contiguous() # narrow at 0 dim + select the any dim ==> common copy with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[2:4].select(2, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input[2:4].select(2, 2).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -122,14 +132,16 @@ class 
CombinedViewsCopyToContiguous(TestCase): # select the 0 dim + narrow at the 1 dim ==> reshape + select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out3 = npu_input.select(0, 2)[:, 1:2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out3 = cpu_input.select(0, 2)[:, 1:2].contiguous() # select the 0 dim + narrow at the last dim ==> reshape + select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out4 = npu_input.select(0, 1)[:, :, 1:2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out4 = cpu_input.select(0, 1)[:, :, 1:2].contiguous() self.assertRtolEqual(npu_out3.to("cpu").numpy(), cpu_out3.numpy()) @@ -149,20 +161,23 @@ class CombinedViewsCopyToContiguous(TestCase): # slice at adjacent axes + strideslice at lower dim ==> cannot be optimized(contiguous_h_combined is called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[2:4, ::2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input[2:4, ::2].contiguous() # strideslice at last dim ==> cannot be optimized(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 2:4, :, 1:10:2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input[:, 2:4, :, 1:10:2].contiguous() # narrow at 0 dim and strideslice at last dim==> can be optimized as slice(contiguous)+select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out3 = npu_input[2:4, :, :, ::2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape', 'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape', 'contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out3 = cpu_input[2:4, :, :, ::2].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -172,14 +187,16 @@ class CombinedViewsCopyToContiguous(TestCase): # slice at adjacent axes + strideslice at higher dim ==> reshape+narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out4 = npu_input[1:10:2, 1:10].contiguous() - 
self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out4 = cpu_input[1:10:2, 1:10].contiguous() # slice at non-adjacent axes with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out5 = npu_input[::2, :, 1:10].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out5 = cpu_input[::2, :, 1:10].contiguous() self.assertRtolEqual(npu_out4.to("cpu").numpy(), cpu_out4.numpy()) self.assertRtolEqual(npu_out5.to("cpu").numpy(), cpu_out5.numpy()) @@ -198,29 +215,33 @@ class CombinedViewsCopyToContiguous(TestCase): # select at last dim ==> cannot be optimized(contiguous_h_combined is called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[:10:2].select(3, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input[:10:2].select(3, 1).contiguous() # select at lower dims except last dim ==> reshape+narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[1:10:2].select(2, 1).contiguous() cpu_out2 = cpu_input[1:10:2].select(2, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) # case 2: select+strideslice # strideslice at lower dims except last dim ==> reshape+narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out3 = npu_input.select(0, 1)[1:10:2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out3 = cpu_input.select(0, 1)[1:10:2].contiguous() # strideslice at the last dim ==> cannot be optimized(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out4 = npu_input.select(0, 1)[:, :, ::3].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out4 = cpu_input.select(0, 1)[:, :, 
::3].contiguous() self.assertRtolEqual(npu_out3.to("cpu").numpy(), cpu_out3.numpy()) self.assertRtolEqual(npu_out4.to("cpu").numpy(), cpu_out4.numpy()) @@ -242,8 +263,9 @@ class CombinedViewsCopyToContiguous(TestCase): # Broadcast + permute all cannot be optimized(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.expand(item[2][1]).transpose(1, 3).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.expand(item[2][1]).transpose(1, 3).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) diff --git a/test/trans_contiguous/test_single_broadcast_copy_to_contiguous.py b/test/trans_contiguous/test_single_broadcast_copy_to_contiguous.py index ff607878917a67de4e61bdf61cf902cb5e1f160e..adbae281603c2e213552d97382a9ca8922e3f889 100644 --- a/test/trans_contiguous/test_single_broadcast_copy_to_contiguous.py +++ b/test/trans_contiguous/test_single_broadcast_copy_to_contiguous.py @@ -35,8 +35,9 @@ class SingleViewCopyToContiguous(TestCase): cpu_input, npu_input = create_common_tensor(item_broadcast, 0, 100) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.expand(item[2][1]).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_BroadcastTo'], prof), - True, "contiguous_d_BroadcastTo is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_BroadcastTo'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_BroadcastTo or aclnnInplaceCopy is not called!") cpu_out1 = cpu_input.expand(item[2][1]).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) diff --git a/test/trans_contiguous/test_single_permute_copy_to_contiguous.py b/test/trans_contiguous/test_single_permute_copy_to_contiguous.py index 8bec6d1f1623290186a02feba932c4f10519c964..39c81fe30303f3fa19ff8c7343208c4aaaddf4eb 100644 --- a/test/trans_contiguous/test_single_permute_copy_to_contiguous.py +++ b/test/trans_contiguous/test_single_permute_copy_to_contiguous.py @@ -24,13 +24,15 @@ class SingleViewCopyToContiguous(TestCase): cpu_input, npu_input = create_common_tensor(item, 0, 100) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.permute(1, 0, 2, 3).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof), - True, "contiguous_d_Transpose op is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_Transpose or aclnnInplaceCopy op is not called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.permute(2, 3, 0, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof), - True, "contiguous_d_Transpose op is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_Transpose or aclnnInplaceCopy op is not called!") cpu_out1 = cpu_input.permute(1, 0, 2, 3).contiguous() cpu_out2 = cpu_input.permute(2, 3, 0, 
1).contiguous() diff --git a/test/trans_contiguous/test_single_reshape_copy_to_contiguous.py b/test/trans_contiguous/test_single_reshape_copy_to_contiguous.py index 53e9ed9bcd370f7b9d9da98fce0fb7f5485b2ff9..134bda76cd11a2f1617dabad8599835489700aa2 100644 --- a/test/trans_contiguous/test_single_reshape_copy_to_contiguous.py +++ b/test/trans_contiguous/test_single_reshape_copy_to_contiguous.py @@ -35,11 +35,13 @@ class SingleViewCopyToContiguous(TestCase): # case2. The key axis remains unchanged for NZ format match_case2 = (item[1] == 29) if match_case1 or match_case2: - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof), - True, "contiguous_d_Reshape is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_Reshape or aclnnInplaceCopy is not called!") else: - self.assertEqual(check_operators_in_prof(['d2dCopyAsync'], prof), - True, "d2dCopyAsync is not called!") + self.assertEqual(check_operators_in_prof(['d2dCopyAsync'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="d2dCopyAsync or aclnnInplaceCopy is not called!") cpu_out1 = cpu_input.view(1, 6, cpu_input.size(2), cpu_input.size(3)).clone() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) @@ -47,11 +49,13 @@ class SingleViewCopyToContiguous(TestCase): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.view(1, 6, npu_input.size(2) * npu_input.size(3), 1).clone() if match_case1: - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof), - True, "contiguous_d_Reshape is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_Reshape or aclnnInplaceCopy is not called!") else: - self.assertEqual(check_operators_in_prof(['d2dCopyAsync'], prof), - True, "d2dCopyAsync is not called!") + self.assertEqual(check_operators_in_prof(['d2dCopyAsync'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="d2dCopyAsync or aclnnInplaceCopy is not called!") cpu_out2 = cpu_input.view(1, 6, cpu_input.size(2) * cpu_input.size(3), 1).clone() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -77,11 +81,13 @@ class SingleViewCopyToContiguous(TestCase): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out = npu_input.unsqueeze(i).clone() if match_case1 or match_case2: - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof), - True, "contiguous_d_Reshape is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_Reshape or aclnnInplaceCopy is not called!") else: - self.assertEqual(check_operators_in_prof(['d2dCopyAsync'], prof), - True, "d2dCopyAsync is not called!") + self.assertEqual(check_operators_in_prof(['d2dCopyAsync'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="d2dCopyAsync or aclnnInplaceCopy is not called!") cpu_out = cpu_input.unsqueeze(i).clone() self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) @@ -102,12 +108,14 @@ class SingleViewCopyToContiguous(TestCase): npu_out = torch.flatten(npu_input, 0, 1).clone() if item[1] == 3: # Using d2dcopy with transdata(d2dCopyAsync) - self.assertEqual(check_operators_in_prof(['d2dCopyAsync'], prof), - 
True, "d2dCopyAsync is not called!") + self.assertEqual(check_operators_in_prof(['d2dCopyAsync'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="d2dCopyAsync or aclnnInplaceCopy is not called!") else: # Directly using d2dcopy without transdata(contiguous_d_Reshape) - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof), - True, "contiguous_d_Reshape is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_Reshape or aclnnInplaceCopy is not called!") cpu_out = torch.flatten(cpu_input, 0, 1).clone() self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) @@ -139,11 +147,13 @@ class SingleViewCopyToContiguous(TestCase): # case3. NZ format with padding but no offset match_case3 = (item[1] == 29 and True) if match_case1 or match_case2 or match_case3: - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof), - True, "contiguous_d_Reshape is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_Reshape or aclnnInplaceCopy is not called!") else: - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), - True, "contiguous_d_Slice is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_Slice or aclnnInplaceCopy is not called!") cpu_out1 = cpu_input[:10, :, :].clone() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) @@ -152,11 +162,13 @@ class SingleViewCopyToContiguous(TestCase): npu_out2 = npu_input[1:10, :, :].clone() match_case3 = False if match_case1 or match_case2 or match_case3: - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof), - True, "contiguous_d_Reshape is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_Reshape or aclnnInplaceCopy is not called!") else: - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), - True, "contiguous_d_Slice is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_Slice or aclnnInplaceCopy is not called!") cpu_out2 = cpu_input[1:10, :, :].clone() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -175,23 +187,27 @@ class SingleViewCopyToContiguous(TestCase): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[0].clone() if match_case: - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof), - True, "contiguous_d_Reshape is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_Reshape or aclnnInplaceCopy is not called!") else: - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), - True, "contiguous_d_Slice is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_d_Slice or aclnnInplaceCopy is not called!") cpu_out1 = cpu_input[0].clone() 
self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[0] + 1 if match_case: - self.assertEqual(check_operators_in_prof(['contiguous_h_memRepoint'], prof), - True, "contiguous_h_memRepoint is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_memRepoint'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="contiguous_h_memRepoint or aclnnInplaceCopy is not called!") else: # refresh storage desc after transdata - self.assertEqual(check_operators_in_prof(['Identity'], prof), - True, "Identity is not called!") + self.assertEqual(check_operators_in_prof(['Identity'], prof) or + check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Identity or aclnnInplaceCopy is not called!") cpu_out2 = cpu_input[0] + 1 self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) diff --git a/test/trans_contiguous/test_single_slice_copy_to_contiguous.py b/test/trans_contiguous/test_single_slice_copy_to_contiguous.py index e46c141f147e5b61e5e32f027c6a4c849c1901e6..3d24aac3d8e3451d8dda4a46023d14f280fc5eec 100644 --- a/test/trans_contiguous/test_single_slice_copy_to_contiguous.py +++ b/test/trans_contiguous/test_single_slice_copy_to_contiguous.py @@ -4,7 +4,7 @@ import numpy as np import torch_npu from torch_npu.testing.testcase import TestCase, run_tests -from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof +from torch_npu.testing.common_utils import create_common_tensor, check_operators_in_prof, SupportedDevices os.environ["COMBINED_ENABLE"] = "1" # Open combined-view cases optimization @@ -33,15 +33,18 @@ class SingleViewCopyToContiguous(TestCase): # for narrow with step=1, if narrow at the first axis, it will generate a contiguous tensor with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[:, :16, :, :].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), - True, "contiguous_d_Slice is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_Slice or aclnnInplaceCopy is not called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, :, 1:16, :].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), - True, "contiguous_d_Slice is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_Slice or aclnnInplaceCopy is not called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out3 = npu_input[:, :, :, 2:16].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "contiguous_d_Slice is not called!") cpu_out1 = cpu_input[:, :16, :, :].contiguous() @@ -70,24 +73,29 @@ class SingleViewCopyToContiguous(TestCase): if cpu_input.dim() == 4: with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[::2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "Error operators 
called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 1:17:4].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "Error operators called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out3 = npu_input[:, :, 2:16:5].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "Error operators called!") with torch.autograd.profiler.profile(use_device='npu') as prof: # stridedSlice do not support slice at last dim npu_out4 = npu_input[:, :, :, 3:9:2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "Error operators called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out5 = npu_input[::2, 1:17:4, 2:16:5, :].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "Error operators called!") cpu_out1 = cpu_input[::2].contiguous() @@ -120,8 +128,9 @@ class SingleViewCopyToContiguous(TestCase): for dim in range(1, len(item[2])): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out = npu_input.select(dim, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof), - True, "contiguous_d_StridedSlice is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_StridedSlice or aclnnInplaceCopy is not called!") cpu_out = cpu_input.select(dim, 1).contiguous() self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) @@ -139,7 +148,8 @@ class SingleViewCopyToContiguous(TestCase): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out = torch.as_strided(npu_input, shape_list[1][0], shape_list[1][1], shape_list[1][2]).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_d_StridedSlice']), + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_d_StridedSlice']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_d_StridedSlice']), True, "Error operators called!") cpu_out = torch.as_strided(cpu_input, shape_list[1][0], shape_list[1][1], shape_list[1][2]).contiguous() diff --git a/test/trans_contiguous/test_special_cases_copy_to_contiguous.py b/test/trans_contiguous/test_special_cases_copy_to_contiguous.py index 73dd786a078e2e0817bd8a32c639691d98bae03c..e2c2f853696cd4e75ae3ba18d598baa875d91475 100644 --- a/test/trans_contiguous/test_special_cases_copy_to_contiguous.py +++ b/test/trans_contiguous/test_special_cases_copy_to_contiguous.py @@ -38,6 +38,14 @@ class TestSpecialCasesCopyToContiguous(TestCase): npu_out = torch.as_strided(npu_input, (1, 32, 96, 96), (746496, 0, 96, 1), 737280).clone() self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) + def test_h2d_copy_discontiguous(self): + a = 
torch.randn(256, 320) + b = a.transpose(-1, -2) # make b NOT contiguous + self.assertFalse(b.is_contiguous()) + b = b.npu() + self.assertFalse(b.is_contiguous()) # after to npu, b is still NOT contiguous + self.assertEqual(b.stride(), (1, 320)) + if __name__ == "__main__": run_tests() diff --git a/test/trans_contiguous/test_tri_combined_views_copy_to_contiguous.py b/test/trans_contiguous/test_tri_combined_views_copy_to_contiguous.py index d970098906d1ecdc49ad9405141c4b518d082d5c..9496d34e2bb031c76558f785c25a66c446019747 100644 --- a/test/trans_contiguous/test_tri_combined_views_copy_to_contiguous.py +++ b/test/trans_contiguous/test_tri_combined_views_copy_to_contiguous.py @@ -26,8 +26,9 @@ class TestTriCombinedViewsCopyToContiguous(TestCase): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.view(npu_input.size(0) * npu_input.size(1), npu_input.size(2), npu_input.size(3))[:, 1:10].transpose(0, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.view(cpu_input.size(0) * cpu_input.size(1), cpu_input.size(2), cpu_input.size(3))[:, 1:10].transpose(0, 1).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) @@ -37,8 +38,9 @@ class TestTriCombinedViewsCopyToContiguous(TestCase): npu_out2 = npu_input.permute(1, 0, 2, 3). \ view(npu_input.size(1), npu_input.size(0), npu_input.size( 2) * npu_input.size(3))[:, :, 1:10].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input.permute(1, 0, 2, 3). \ view( cpu_input.size(1), @@ -66,8 +68,9 @@ class TestTriCombinedViewsCopyToContiguous(TestCase): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.view(npu_input.size(0) * npu_input.size(1), npu_input.size(2), npu_input.size(3))[:, 1].transpose(0, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.view(cpu_input.size(0) * cpu_input.size(1), cpu_input.size(2), cpu_input.size(3))[:, 1].transpose(0, 1).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) @@ -77,8 +80,9 @@ class TestTriCombinedViewsCopyToContiguous(TestCase): npu_out2 = npu_input.permute(1, 0, 2, 3). 
\ view(npu_input.size(1), npu_input.size(0), npu_input.size( 2) * npu_input.size(3))[:, :, 2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input.permute(1, 0, 2, 3). \ view(cpu_input.size(1), cpu_input.size(0), cpu_input.size(2) * cpu_input.size(3))[:, :, 2].contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) diff --git a/test/unsupported_test_cases/.pytorch-disabled-tests.json b/test/unsupported_test_cases/.pytorch-disabled-tests.json index 3c0d8f9c27ddba0fca442241899f308cdd80dfbf..666a67df22a2533d05c36d5729483f988cd1fd94 100644 --- a/test/unsupported_test_cases/.pytorch-disabled-tests.json +++ b/test/unsupported_test_cases/.pytorch-disabled-tests.json @@ -1,7 +1,6 @@ { "test_export_dynamo_config (__main__.TestExport)": ["", [""]], "test_views_op_having_view_copy (__main__.TestPasses)": ["", [""]], - "test_custom_class (__main__.TestSerializeCustomClass)": ["", [""]], "test_build_tuple_unpack_dynamic_shapes (__main__.DynamicShapesMiscTests)": ["", [""]], "test_module_attribute_mutation_violation_negative_2 (__main__.MutationExportTests)": ["", [""]], "test_module_attribute_mutation_violation_negative_3 (__main__.MutationExportTests)": ["", [""]], @@ -17,7 +16,7 @@ "test_graph_break_inlining_autocast (__main__.CtxManagerTests)": ["", [""]], "test_graph_break_inlining_autocast (__main__.DynamicShapesCtxManagerTests)": ["", [""]], "test_graph_break_inlining_autocast_dynamic_shapes (__main__.DynamicShapesCtxManagerTests)": ["", [""]], - "test_graph_breaks (__main__.LoggingTests)": ["", [""]], + "test_graph_breaks (__main__.LoggingTests)": ["", ["A2"]], "test_schedule (__main__.LoggingTests)": ["", [""]], "test_external_module_and_backend_register (__main__.TestExtensionUtils)": ["", [""]], "test_load_standalone (__main__.TestStandaloneCPPJIT)": ["", [""]], @@ -132,6 +131,21 @@ "test_check_inplace_nn_SELU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", [""]], "test_check_inplace_nn_SiLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", [""]], "test_check_inplace_nn_Threshold_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", [""]], + "test_forward_nn_Conv3d_npu_float32 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_LPPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", [""]], + "test_if_train_and_eval_modes_differ_nn_LazyConvTranspose2d_npu_float32 (__main__.TestModulePRIVATEUSE1)": ["", [""]], + "test_pickle_nn_AdaptiveMaxPool3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", [""]], + "test_pickle_nn_LazyConvTranspose2d_npu_float32 (__main__.TestModulePRIVATEUSE1)": ["", [""]], + "test_forward_nn_GRU_train_mode_npu_float32 (__main__.TestModulePRIVATEUSE1)": ["", [""]], + "test_GroupNorm_empty_npu (__main__.TestNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_sparse_add_out_bfloat16_npu_float32 (__main__.TestSparsePRIVATEUSE1)": ["", [""]], + "test_inplace_sqrt_npu_int16 (__main__.TestSparseUnaryUfuncsPRIVATEUSE1)": ["", [""]], + "test_sparse_add_out_bfloat16_npu_float32 (__main__.TestSparsePRIVATEUSE1)": ["", [""]], + "test_inplace_sqrt_npu_int16 (__main__.TestSparseUnaryUfuncsPRIVATEUSE1)": ["", [""]], + "test_inplace_sqrt_npu_int32 (__main__.TestSparseUnaryUfuncsPRIVATEUSE1)": 
["", [""]], + "test_inplace_sqrt_npu_int64 (__main__.TestSparseUnaryUfuncsPRIVATEUSE1)": ["", [""]], + "test_inplace_sqrt_npu_int8 (__main__.TestSparseUnaryUfuncsPRIVATEUSE1)": ["", [""]], + "test_inplace_sqrt_npu_uint8 (__main__.TestSparseUnaryUfuncsPRIVATEUSE1)": ["", [""]], "test_correctness_CSEPass_MutationTorchTensorCall_npu (__main__.TestCommonPass)": ["", [""]], "test_class_member_back_compat (__main__.TestFXAPIBackwardCompatibility)": ["", [""]], "test_function_back_compat (__main__.TestFXAPIBackwardCompatibility)": ["", [""]], @@ -1941,7 +1955,6 @@ "test_cuda_graph_error_options (__main__.TestNpu)": ["", [""]], "test_batch_norm_gather_stats (__main__.TestNpu)": ["", [""]], "test_matmul_device_mismatch (__main__.TestNpu)": ["", [""]], - "test_lazy_init (__main__.TestNpu)": ["", [""]], "test_npu_kernel_loop_overflow (__main__.TestNpu)": ["", [""]], "test_allocator_settings (__main__.TestNpu)": ["", [""]], "test_raises_oom (__main__.TestNpu)": ["", [""]], @@ -13418,6 +13431,10 @@ "test_inplace_grad_abs_npu_float64 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], "test_inplace_grad_acos_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], "test_inplace_grad_acos_npu_float64 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], + "test_inplace_grad_acosh_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", ["910A"]], + "test_inplace_grad_asinh_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", ["910A"]], + "test_inplace_grad_atanh_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", ["910A"]], + "test_inplace_grad_ldexp_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", ["910A"]], "test_inplace_grad_add_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], "test_inplace_grad_add_npu_float64 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], "test_inplace_grad_addbmm_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], @@ -26876,6 +26893,7 @@ "test_output_match_unflatten_cpu_int8 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], "test_output_match_unflatten_cpu_uint8 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], "test_output_match_arange_cpu_int8 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], + "test_output_match_nn_functional_batch_norm_cpu_float32 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], "test_output_match_nn_functional_embedding_cpu_float16 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], "test_output_match_nn_functional_embedding_cpu_float32 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], "test_output_match_var_mean_cpu_float16 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], @@ -27420,6 +27438,26 @@ "test_nms (__main__.TestONNXRuntime_opset_version_17_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], "test_nms (__main__.TestONNXRuntime_opset_version_17_is_script_True_keep_initializers_as_inputs_False)": ["", [""]], "test_nms (__main__.TestONNXRuntime_opset_version_17_is_script_True_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_10_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_10_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_11_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 
(__main__.TestONNXRuntime_npu_opset_version_11_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_12_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_12_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_13_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_13_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_14_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_14_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_15_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_15_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_16_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_16_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_17_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_17_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_7_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_8_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_9_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_9_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], "test_rnn_name_lstm_nonlinearity_None_trilayer_bidirectional_no_initial_state_with_batch_first_sequence_lengths_with_dropout (__main__.TestONNXRuntime_opset_version_10_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], "test_rnn_name_lstm_nonlinearity_None_trilayer_bidirectional_no_initial_state_with_batch_first_sequence_lengths_with_dropout (__main__.TestONNXRuntime_opset_version_10_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], "test_rnn_name_lstm_nonlinearity_None_trilayer_bidirectional_no_initial_state_with_batch_first_sequence_lengths_with_dropout (__main__.TestONNXRuntime_opset_version_11_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], @@ -29836,6 +29874,7 @@ "test_Conv2d_inconsistent_types_on_GPU_without_cudnn (__main__.TestConvolutionNN)": ["", [""]], "test_fake_tensor_mode_huggingface_databricks_dolly_v2_3b (__main__.TestFxToOnnx)": ["", [""]], "test_fake_tensor_mode_huggingface_mosaicml_mpt_7b (__main__.TestFxToOnnx)": ["", [""]], + "test_dispatch_overload_fall_back_default_raise_diagnostic_warning (__main__.TestFxToOnnx)": ["", [""]], "test_extract_gradients_from_module (__main__.TestIdentifyGradients)": ["", [""]], "test_extract_gradients_from_module_and_optimizer (__main__.TestIdentifyGradients)": ["", [""]], "test_direct_traceback (__main__.TestNpuMallocAsync)": ["", [""]], @@ -29846,6 +29885,10 @@ "test_AdaptiveLogSoftmax 
(__main__.TestNN)": ["", [""]], "test_cudnn_rnn_dropout_states_device (__main__.TestNN)": ["", [""]], "test_generated_rule_set_is_up_to_date (__main__.TestGeneratedTypePromotionRuleSet)": ["", [""]], + "test_fake_autocast_index_add_npu_float32 (__main__.TestFakeTensorPRIVATEUSE1)": ["", [""]], + "test_output_match_cross_cpu_float32 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], + "test_output_match_cross_cpu_int32 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], + "test_output_match_cross_cpu_int64 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], "test_upsamplingNearestExact1d_correctness_isize_10_osize_15_npu (__main__.TestNNDeviceTypePRIVATEUSE1)": ["", ["910A"]], "test_upsamplingNearestExact1d_correctness_isize_20_osize_11_npu (__main__.TestNNDeviceTypePRIVATEUSE1)": ["", ["910A"]], "test_upsamplingNearestExact1d_rescale_npu (__main__.TestNNDeviceTypePRIVATEUSE1)": ["", ["910A"]], @@ -29986,6 +30029,28 @@ "test_npu_flash_attention (__main__.TestNPUFlashAttention)": ["", ["A2"]], "test_npu_scatter (__main__.TestNpuScatter)": ["", ["A2"]], "test_assign_does_not_exist (__main__.OptimizedModuleTest)": ["", ["A2"]], + "test_matmul (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True)": ["", [""]], + "test_mnist (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True)": ["", [""]], + "test_fake_tensor_mode_simple (__main__.TestFxToOnnxFakeTensorWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True_load_checkpoint_during_init_False_export_within_fake_mode_True)": ["", [""]], + "test_matmul (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True)": ["", [""]], + "test_flatten_dynamic_axes (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True)": ["", [""]], + "test_fake_tensor_mode_simple (__main__.TestFxToOnnxFakeTensorWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True_load_checkpoint_during_init_False_export_within_fake_mode_True)": ["", [""]], + "test_flatten_dynamic_axes (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True)": ["", [""]], + "test_mutation (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True)": ["", [""]], + "test_arange (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True)": ["", [""]], + "test_arange (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True)": ["", [""]], + "test_operator_with_data_dependent_output (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True)": ["", [""]], + "test_fake_tensor_mode_simple (__main__.TestFxToOnnxFakeTensorWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True_load_checkpoint_during_init_True_export_within_fake_mode_False)": ["", [""]], + "test_fake_tensor_mode_simple (__main__.TestFxToOnnxFakeTensorWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True_load_checkpoint_during_init_False_export_within_fake_mode_False)": ["", [""]], + "test_fake_tensor_mode_simple (__main__.TestFxToOnnxFakeTensorWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True_load_checkpoint_during_init_False_export_within_fake_mode_False)": ["", [""]], + "test_operator_with_data_dependent_output (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True)": ["", [""]], + "test_fake_tensor_mode_simple (__main__.TestFxToOnnxFakeTensorWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True_load_checkpoint_during_init_True_export_within_fake_mode_True)": ["", [""]], + 
"test_fake_tensor_mode_simple (__main__.TestFxToOnnxFakeTensorWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True_load_checkpoint_during_init_True_export_within_fake_mode_True)": ["", [""]], + "test_fake_tensor_mode_simple (__main__.TestFxToOnnxFakeTensorWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True_load_checkpoint_during_init_True_export_within_fake_mode_False)": ["", [""]], + "test_slice (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True)": ["", [""]], + "test_slice (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_False_dynamic_shapes_True)": ["", [""]], + "test_mutation (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True)": ["", [""]], + "test_mnist (__main__.TestFxToOnnxWithOnnxRuntime_op_level_debug_True_dynamic_shapes_True)": ["", [""]], "test_EmbeddingBag_empty_per_sample_weights_and_offsets_npu_int32_int32_float64 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", ["A2"]], "test_EmbeddingBag_empty_per_sample_weights_and_offsets_npu_int32_int32_float16 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", ["A2"]], "test_EmbeddingBag_empty_per_sample_weights_and_offsets_npu_int32_int32_float32 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", ["A2"]], @@ -30154,6 +30219,43 @@ "test_custom_op_fallthrough (__main__.TestUtilityFuns_opset_16)": ["", ["A2"]], "test_custom_op_fallthrough (__main__.TestUtilityFuns_opset_17)": ["", ["A2"]], "test_custom_op_fallthrough (__main__.TestUtilityFuns_opset_9)": ["", ["A2"]], + "test_check_inplace_nn_ReLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_cpu_gpu_parity_nn_ReLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_cpu_gpu_parity_nn_Tanh_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_cpu_gpu_parity_nn_Tanhshrink_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_AvgPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_AvgPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_FractionalMaxPool3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_LPPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_LPPPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_ReLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_ReplicationPad2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_Tanh_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_Tanhshrink_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_ZeroPad1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_ZeroPad2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_AvgPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_AvgPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_LPPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_LPPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_ReLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_Tanh_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", 
["A2"]], + "test_if_train_and_eval_modes_differ_nn_Tanhshrink_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_AvgPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_AvgPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_CircularPad2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_CircularPad3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_ConstantPad1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_ConstantPad2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_ConstantPad3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_FractionalMaxPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_FractionalMaxPool3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_LPPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_LPPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_ReLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_Tanh_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_Tanhshrink_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_ZeroPad3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], "test_unsupported_dtypes (__main__.TestTEFuserDynamic)": ["", [""]], "test_unsupported_dtypes (__main__.TestTEFuserStatic)": ["", [""]], "test_conj_view__refs_unbind_copy_npu_complex64 (__main__.TestMathBitsPRIVATEUSE1)": ["", [""]], diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index 9780a01f8fbdf7b3e49d7b3dd2e7326c4f7da486..b8ef9dbd34075370416a049efff05be7b4c110df 100755 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -56,6 +56,7 @@ typedef void *aclrtAllocatorDesc; typedef void *aclrtAllocator; typedef void *aclrtAllocatorBlock; typedef void *aclrtAllocatorAddr; +typedef void *aclrtTaskGrp; static const int ACL_ERROR_NONE = 0; static const int ACL_SUCCESS = 0; @@ -135,6 +136,9 @@ static const int ACL_ERROR_DRV_FAILURE = 500004; static const int ACL_ERROR_PROFILING_FAILURE = 500005; static const int ACL_ERROR_RT_DEVICE_MEM_ERROR = 507053; static const int ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR = 507054; +static const int ACL_ERROR_RT_SUSPECT_DEVICE_MEM_ERROR = 507055; +static const int ACL_ERROR_RT_LINK_ERROR = 507056; +static const int ACL_ERROR_RT_COMM_OP_RETRY_FAIL = 507904; #define ACL_TENSOR_SHAPE_RANGE_NUM 2 #define ACL_TENSOR_VALUE_RANGE_NUM 2 diff --git a/third_party/acl/inc/acl/acl_mdl.h b/third_party/acl/inc/acl/acl_mdl.h index 78dcabb8f165ac9aa8b5f37455e2861ba9b5ab2c..f13950ab8504fcb45a80e009ef4d5f9fb6b90679 100755 --- a/third_party/acl/inc/acl/acl_mdl.h +++ b/third_party/acl/inc/acl/acl_mdl.h @@ -50,6 +50,7 @@ typedef struct aclmdlAIPP aclmdlAIPP; typedef struct aclAippExtendInfo aclAippExtendInfo; typedef struct aclmdlConfigHandle aclmdlConfigHandle; typedef struct aclmdlExecConfigHandle aclmdlExecConfigHandle; +typedef void *aclmdlRI; typedef enum { ACL_YUV420SP_U8 = 1, @@ -215,16 +216,16 @@ typedef struct aclmdlExeOMDesc { } aclmdlExeOMDesc; typedef enum { - ACL_MODEL_CAPTURE_MODE_GLOBAL = 0, - ACL_MODEL_CAPTURE_MODE_THREAD_LOCAL, - ACL_MODEL_CAPTURE_MODE_RELAXED, -} aclmdlCaptureMode; + ACL_MODEL_RI_CAPTURE_MODE_GLOBAL = 0, + 
ACL_MODEL_RI_CAPTURE_MODE_THREAD_LOCAL, + ACL_MODEL_RI_CAPTURE_MODE_RELAXED, +} aclmdlRICaptureMode; typedef enum { - ACL_MODEL_CAPTURE_STATUS_NONE = 0, - ACL_MODEL_CAPTURE_STATUS_ACTIVE, - ACL_MODEL_CAPTURE_STATUS_INVALIDATED, -} aclmdlCaptureStatus; + ACL_MODEL_RI_CAPTURE_STATUS_NONE = 0, + ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE, + ACL_MODEL_RI_CAPTURE_STATUS_INVALIDATED, +} aclmdlRICaptureStatus; /** * @ingroup AscendCL @@ -663,45 +664,34 @@ ACL_FUNC_VISIBILITY aclError aclmdlExecuteV2(uint32_t modelId, const aclmdlDatas * @ingroup AscendCL * @brief Execute model asynchronous inference until the inference result is returned * - * @param modelId [IN] ID of the model to perform inference - * @param input [IN] Input data for model inference - * @param output [OUT] Output data for model inference - * @param stream [IN] stream - * @param handle [IN] config of model execute + * @param modelRI [IN] runtime instance of the model to perform inference + * @param stream [IN] stream * * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlExecuteAsyncV2(uint32_t modelId, const aclmdlDataset *input, aclmdlDataset *output, - aclrtStream stream, const aclmdlExecConfigHandle *handle); +ACL_FUNC_VISIBILITY aclError aclmdlRIExecuteAsync(aclmdlRI modelRI, aclrtStream stream); + /** * @ingroup AscendCL - * @brief Execute model asynchronous inference until the inference result is returned - * - * @param modelId [IN] ID of the model to perform inference - * @param input [IN] Input data for model inference - * @param output [OUT] Output data for model inference - * @param stream [IN] stream + * @brief unload model with model id * - * @retval ACL_SUCCESS The function is successfully executed. + * @param modelId [IN] model id to be unloaded + * @retval ACL_ERROR_NONE The function is successfully executed. * @retval OtherValues Failure - * - * @see aclmdlLoadFromFile | aclmdlLoadFromMem | aclmdlLoadFromFileWithMem | - * aclmdlLoadFromMemWithMem */ -ACL_FUNC_VISIBILITY aclError aclmdlExecuteAsync(uint32_t modelId, const aclmdlDataset *input, - aclmdlDataset *output, aclrtStream stream); +ACL_FUNC_VISIBILITY aclError aclmdlUnload(uint32_t modelId); /** * @ingroup AscendCL - * @brief unload model with model id + * @brief destroy the model * - * @param modelId [IN] model id to be unloaded + * @param modelRI [IN] runtime instance of the model to be destroyed * * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlUnload(uint32_t modelId); +ACL_FUNC_VISIBILITY aclError aclmdlRIDestroy(aclmdlRI modelRI); /** * @ingroup AscendCL @@ -1523,37 +1513,75 @@ ACL_FUNC_VISIBILITY const char *aclmdlGetTensorRealName(const aclmdlDesc *modelD * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlCaptureBegin(aclrtStream stream, aclmdlCaptureMode mode); +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureBegin(aclrtStream stream, aclmdlRICaptureMode mode); /** * @ingroup AscendCL * @brief obtain the capture information of a stream * @param stream [IN] stream to be queried * @param status [OUT] return the stream status - * @param modelId [OUT] return the model id + * @param modelRI [OUT] return the model runtime instance * @retval ACL_SUCCESS The function is successfully executed. 
* @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlCaptureGetInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId); +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureGetInfo(aclrtStream stream, aclmdlRICaptureStatus *status, aclmdlRI *modelRI); /** * @ingroup AscendCL * @brief end the stream capture and obtain the corresponding model * @param stream [IN] stream to be ended - * @param modelId [OUT] return the model id + * @param modelRI [OUT] return the model runtime instance * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlCaptureEnd(aclrtStream stream, uint32_t *modelId); +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureEnd(aclrtStream stream, aclmdlRI *modelRI); /** * @ingroup AscendCL * @brief print model information - * @param modelId [IN] model information needs to be printed + * @param modelRI [IN] model runtime instance + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlRIDebugPrint(aclmdlRI modelRI); + +/** + * @ingroup AscendCL + * @brief the start interface of the task group + * @param stream [IN] capture stream + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureTaskGrpBegin(aclrtStream stream); + +/** + * @ingroup AscendCL + * @brief the end interface of the task group + * @param stream [IN] capture stream + * @param handle [OUT] task group handle + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureTaskGrpEnd(aclrtStream stream, aclrtTaskGrp *handle); + +/** + * @ingroup AscendCL + * @brief begin to update the task group specified by the handle + * @param stream [IN] specify the stream used for task update + * @param handle [IN] task group handle + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureTaskUpdateBegin(aclrtStream stream, aclrtTaskGrp handle); + +/** + * @ingroup AscendCL + * @brief end the update of the task + * @param stream [IN] specify the stream used for task update * @retval ACL_SUCCESS The function is successfully executed. 
* @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlDebugPrint(uint32_t modelId); +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureTaskUpdateEnd(aclrtStream stream); #ifdef __cplusplus } diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index be607a828130aa010461aac0e98dcab704b28d32..98b520ba4ac73a4b5072d98fd436edde37b51655 100755 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -22,6 +22,7 @@ extern "C" { #define ACL_EVENT_SYNC 0x00000001u #define ACL_EVENT_CAPTURE_STREAM_PROGRESS 0x00000002u #define ACL_EVENT_TIME_LINE 0x00000008u +#define ACL_EVENT_EXTERNAL 0x00000020u #define ACL_STREAM_FAST_LAUNCH 0x00000001u #define ACL_STREAM_FAST_SYNC 0x00000002u @@ -92,6 +93,10 @@ typedef enum aclrtMemMallocPolicy { ACL_MEM_TYPE_HIGH_BAND_WIDTH = 0x1000, } aclrtMemMallocPolicy; +typedef enum aclrtHostRegisterType { + ACL_HOST_REGISTER_MAPPED = 0U, +} aclrtHostRegisterType; + typedef enum aclrtMemAttr { ACL_DDR_MEM, ACL_HBM_MEM, diff --git a/third_party/acl/libs/hccl.cpp b/third_party/acl/libs/hccl.cpp index 839b5ea3c4c67f54401cdbede7724ad3c2e95051..e956269a1457d563c0e4cbf595ae6fb55369060b 100644 --- a/third_party/acl/libs/hccl.cpp +++ b/third_party/acl/libs/hccl.cpp @@ -34,3 +34,4 @@ hcclResult_t HcclBatchSendRecv(HcclSendRecvItemDef* sendRecvInfo, u32 itemNum, h aclrtStream stream) {return HCCL_SUCCESS;} hcclResult_t HcclCommInitAll(u32 ndev, s32 *devices, hcclComm_t *comms) {return HCCL_SUCCESS;} hcclResult_t HcclCommResume(hcclComm_t comm) {return HCCL_SUCCESS;} +hcclResult_t HcclCommWorkingDevNicSet(HcclComm comm, u32 *ranks, bool *useBackup, u32 nRanks){return HCCL_SUCCESS;} diff --git a/third_party/acl/libs/hccl.h b/third_party/acl/libs/hccl.h index 41874cd808f409b3926450ac7abf035bd105d882..439be6f63e8691b656c4228ba5ff6f6dfd9a5da2 100644 --- a/third_party/acl/libs/hccl.h +++ b/third_party/acl/libs/hccl.h @@ -108,4 +108,5 @@ hcclResult_t HcclScatter(void *sendBuf, void *recvBuf, u64 count, HcclDataType d hcclResult_t HcclBatchSendRecv(HcclSendRecvItemDef* sendRecvInfo, u32 itemNum, hcclComm_t comm, aclrtStream stream); hcclResult_t HcclCommInitAll(u32 ndev, s32 *devices, hcclComm_t *comms); hcclResult_t HcclCommResume(hcclComm_t comm); +hcclResult_t HcclCommWorkingDevNicSet(HcclComm comm, u32 *ranks, bool *useBackup, u32 nRanks); } diff --git a/third_party/dcmi/CMakeLists.txt b/third_party/dcmi/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a5b6ee02b090b6703d3f3a887f057194c37f3ac --- /dev/null +++ b/third_party/dcmi/CMakeLists.txt @@ -0,0 +1 @@ +INSTALL(DIRECTORY inc/ DESTINATION include/third_party/dcmi/inc FILES_MATCHING PATTERN "*.h") \ No newline at end of file diff --git a/third_party/dcmi/inc/dcmi_interface_api.h b/third_party/dcmi/inc/dcmi_interface_api.h new file mode 100644 index 0000000000000000000000000000000000000000..a55402f30ddf31e1aed96ce11102292abbde313e --- /dev/null +++ b/third_party/dcmi/inc/dcmi_interface_api.h @@ -0,0 +1,26 @@ +/* + * Copyright: Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. 
+ * Author: huawei + * Date: 2021-03-17 17:46:08 + * @LastEditors: huawei + * @LastEditTime: 2022-11-03 11:17:04 + * Description: DCMI API Reference + */ + +/***************************************************************************************/ + +#ifdef __linux +#define DCMIDLLEXPORT +#else +#define DCMIDLLEXPORT _declspec(dllexport) +#endif + +#define TOPO_INFO_MAX_LENTH 32 // topo info max length + +DCMIDLLEXPORT int dcmi_init(void); + +DCMIDLLEXPORT int dcmi_get_card_num_list(int *card_num, int *card_list, int list_len); // card_num is the number of device. + +DCMIDLLEXPORT int dcmi_get_affinity_cpu_info_by_device_id(int card_id, int device_id, char *affinity_cpu, int *length); // card_id is the ID of NPU card. + +DCMIDLLEXPORT int dcmi_get_device_id_in_card(int card_id, int *device_id_max, int *mcu_id, int *cpu_id); diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index 4ccda684b3657c8f7715adc6028d8220a3d982b0..023914a348285ad17c459b077cdd03c4593637ea 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -183,6 +183,8 @@ extern HcclResult HcclCommInitAll(uint32_t ndev, int32_t *devices, HcclComm *com extern HcclResult HcclCommResume(HcclComm comm); +extern HcclResult HcclCommWorkingDevNicSet(HcclComm comm, uint32_t *ranks, bool *useBackup, uint32_t nRanks); + /** * @brief Initialize the comm configuration. * @param config Pointer to the comm configuration that needs to be initialized. diff --git a/third_party/nlohmann b/third_party/nlohmann new file mode 160000 index 0000000000000000000000000000000000000000..87cda1d6646592ac5866dc703c8e1839046a6806 --- /dev/null +++ b/third_party/nlohmann @@ -0,0 +1 @@ +Subproject commit 87cda1d6646592ac5866dc703c8e1839046a6806 diff --git a/third_party/op-plugin b/third_party/op-plugin index dcb36be0441bc60f03b027edb6c80365e4f2c184..a8133207c46379c55e6f9dc5d89ec94fd2bb040d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit dcb36be0441bc60f03b027edb6c80365e4f2c184 +Subproject commit a8133207c46379c55e6f9dc5d89ec94fd2bb040d diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index a013adee642fa13d97607f0323bb59f828f29649..b81514e88e3fe56330d08920e90f9c5df8cf0bb4 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit a013adee642fa13d97607f0323bb59f828f29649 +Subproject commit b81514e88e3fe56330d08920e90f9c5df8cf0bb4 diff --git a/tools/flight_recorder/components/builder.py b/tools/flight_recorder/components/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..eb13bdc9ae9c819a33c3de4392cf6ba7423083db --- /dev/null +++ b/tools/flight_recorder/components/builder.py @@ -0,0 +1,307 @@ +import argparse +import ast +import os +from typing import Any + +from tools.flight_recorder.components.fr_logger import FlightRecorderLogger +from tools.flight_recorder.components.types import ( + Collective, + Database, + EntryState, + Group, + MatchStateRecord, + Membership, + HCCLCall, + Op, + Traceback, +) +from tools.flight_recorder.components.utils import ( + ProcessGroupData, + align_trace_from_beginning, + check_current_entry_match, + check_no_missing_dump_files, + check_version, + EntryContext, + error_analysis, + get_version_detail, + just_print_entries, +) + + +# Set up logging +logger: FlightRecorderLogger = FlightRecorderLogger() + + +try: + from tabulate import tabulate +except ModuleNotFoundError: + logger.warning("tabulate is not installed. 
Proceeding without it.") + + # Define a no-op tabulate function + def tabulate(data: Any, headers: Any = None) -> Any: # type: ignore[misc] + return data + + +""" +Flat DB builder +""" + + +def build_groups_memberships( + pg_config: Any, +) -> tuple[ + list[Group], + dict[Any, Group], + list[Membership], + dict[str, set[Any]], + dict[tuple[str, int], str], +]: + """ + pg_config: { + global_rank: { + (pg_guid, desc, ranks) + } + } + + `pg_guid` is a system generated id, but depending on the mode of PG creation it could be a globally incrementing int + or a hash of the ranks. See `_process_group_name` in distributed_c10d.py. + `desc` is provided by the user (optionally) and should be 'meaningful' (e.g. TP/PP/DP group) + `ranks` is a list of the 'global ranks' that are members of the PG. + + (pg_guid, desc, ranks) tuples are appended lazily to the flight buffer when `getHCCLComm` is called on a PG and + the `enabled_` flag is true for that PG. + - the order of calling (init_process_group, new_group, etc) does not affect the order of the tuples in the list + + Returns: + `groups`: a groups table where each row is a Group namedtuple. + `_groups`: a dict that is indexed by pg_guid with Group namedtuple as value. + `memberships`: a membership table where each row is a Membership namedtuple. + `_memberships`: a dict that is indexed by pg_guid with set of ranks (int) as value. + `_pg_guids`: a dict that is indexed by (pg_uid, global_rank) with pg_guid as value. + """ + # flat lists for return + groups = [] + memberships = [] + + # dicts for faster cross-rank validation + _groups = {} + _memberships = {} + _pg_guids = {} + for global_rank in pg_config: + for pg_uid in pg_config[global_rank]: + desc = pg_config[global_rank][pg_uid]["desc"] + ranks = ast.literal_eval(pg_config[global_rank][pg_uid]["ranks"]) + # With the adoption of the split_group API, we can have multiple PGs with the same pg_guid (PG Name) + # So we need to add the hash of all its ranks within the PG as well. + # Also guid must be a string because `_process_group_name` returns a string. 
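+ # Illustrative example (values invented for this comment, not taken from any dump): a pg_uid of "2"
+ # whose ranks are [0, 1] becomes "2" + str(hash(frozenset([0, 1]))), so two split_group results that
+ # reuse the name "2" for different rank sets still map to distinct pg_guid keys in _groups/_memberships.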
+ pg_guid = pg_uid + str(hash(frozenset(ranks))) + _pg_guids[(pg_uid, global_rank)] = pg_guid + if isinstance(ranks, str): + ranks = ast.literal_eval(ranks) + if pg_guid not in _groups: + groups.append(Group(id=pg_guid, desc=desc, size=len(ranks))) + for rank in ranks: + memberships.append(Membership(group_id=pg_guid, global_rank=rank)) + _groups[pg_guid] = groups[-1] + _memberships[pg_guid] = set(ranks) + else: + # validation across ranks + if _groups[pg_guid].desc != desc: + raise ValueError( + f"Description mismatch for group {pg_guid}: " + f"expected '{desc}', got '{_groups[pg_guid].desc}'" + ) + + if _memberships[pg_guid] != set(ranks): + raise ValueError( + f"Membership mismatch for group {pg_guid}: " + f"expected {set(ranks)}, got {_memberships[pg_guid]}" + ) + + return groups, _groups, memberships, _memberships, _pg_guids + + +def build_collectives( + all_entries: dict[int, list[dict[str, Any]]], + _groups: dict[str, Group], + _memberships: dict[str, set[Any]], + _pg_guids: dict[tuple[str, int], str], + version: str, +) -> tuple[list[Traceback], list[Collective], list[HCCLCall]]: + """ + groups, memberships are the non-flat dicts that are indexable + all_entries is a raw dict from the original dumps: + + all_entries: { + global_rank: [ + { + record_id: ordered id of the event in the trace buffer + pg_id: ProcessGroupHCCL::uid_ + *note: `pg_id` corresponds to nothing in groups table + process_group: (pg_name, desc) + *note: `pg_name`, `desc` corresponds to `pg_id`, `desc` in groups table + collective_seq_id: ordered id for collective operations and coalesced group operations + p2p_seq_id: ordered id for point-to-point operations + op_id: ordered id including individual ops inside coalescing group + profiling_name: descriptive name of the operation + 'time_created_ns', + 'input_sizes', + 'output_sizes', + 'state', + 'time_discovered_started_ns', + 'time_discovered_completed_ns', + 'retired', + 'frames', + } + ] + } + """ + tracebacks: list[Traceback] = [] + + collectives: list[Collective] = [] + hccl_calls: list[HCCLCall] = [] + + # once we find one mismatch, we stop pairing up collectives since the pairing is possibly incorrect + # instead, just record the remaining ops as HCCLCalls + mismatch = {_groups[g].id: 0 for g in _groups} + MISMATCH_TAIL = 10 + + # For best effort partial analysis. + dumps_ranks = set() + for key in all_entries.keys(): + try: + dumps_ranks.add(int(key)) + except ValueError as e: + raise ValueError(f"Cannot extract rank from '{key}") from e + """ + - it doesn't matter what order I put collectives/hcclops into their table. we can later on re-sort it by start time + - there could be multiple options for the "first" collective to pair up (rank 0,1 might do a bcast while rank 2,3 do a bcast) + - within a group, the first collective must be the same on all ranks in the group, then it can be marked as a + collective and removed + """ + while all_entries: + # we greedily match collectives, starting arbitrarily with the trace from the first rank + # later, if we exhaust the first rank, we continue with the next 'first rank' + rank_iter = iter(all_entries) + first_rank = next(rank_iter) + other_ranks = list(rank_iter) + + if len(all_entries[first_rank]) == 0: + all_entries.pop(first_rank) + continue + + # lets match the first collective! 
we need to know which ranks are involved, and ensure that this same + # collective is also the first one on those ranks within that group + entries = all_entries[first_rank] + current_entry = entries[0] + + desc = current_entry["process_group"][1] if current_entry["process_group"][1] else "default_pg" + # For db build and logs printing, we want to use the original pg_name, not the hash one. + original_pg_name = current_entry["process_group"][0] + pg_name = _pg_guids[(original_pg_name, first_rank)] + expected_ranks = set(_memberships[pg_name]) + entry_state = EntryState(current_entry, expected_ranks) + match_record = MatchStateRecord( + expected_ranks=expected_ranks, + other_ranks=other_ranks, + entry_state=entry_state, + candidate_ranks={first_rank}, + candidate_idx={}, + found_ranks=set(), + found_idx={}, + errors=set(), + ) + + check_current_entry_match( + all_entries=all_entries, + current_entry=current_entry, + _memberships=_memberships, + pg_data=ProcessGroupData(pg_guids=_pg_guids, pg_name=pg_name, desc=desc, mismatch=mismatch), + match_record=match_record, + ) + + # Use heuristics to decide what type of errors and error messages we should print. + error_analysis( + entry_context=EntryContext(all_entries, current_entry, dumps_ranks, first_rank), + match_record=match_record, + mismatch=mismatch, + version=get_version_detail(version), + pg_name=pg_name, + ) + # at this point there are 3 possibilities + # 1. we found a match on all the ranks that are members of the group + # -> we create a Collective and remove the individual entries from their original lists + if match_record.found_ranks == expected_ranks and mismatch[pg_name] == 0: + collectives.append(match_record.entry_state.to_collective(len(collectives))) + idx_map = {r: match_record.found_idx[r] if r != first_rank else 0 for r in match_record.found_ranks} + hccl_calls.extend( + match_record.entry_state.to_hccl_call(all_entries, idx_map, len(hccl_calls), collectives[-1].id) + ) + + # 2. we found a partial match but some ranks are missing + # 3. we found no match + else: + logger.debug("appending a non-matching collective") + idx_map = {r: match_record.candidate_idx[r] if r != first_rank else 0 for r in match_record.candidate_ranks} + collectives.append( + match_record.entry_state.to_collective( + len(collectives), + errors=match_record.errors, + idx_map=idx_map, + all_entries=all_entries, + ) + ) + hccl_calls.extend(match_record.entry_state.to_hccl_call(all_entries, idx_map, len(hccl_calls), None)) + + if mismatch[pg_name] > MISMATCH_TAIL: + logger.error("Too many mismatches for process_group %s: %s aborting", pg_name, desc) + break + return tracebacks, collectives, hccl_calls + + +def build_db(details: dict[str, dict[str, Any]], args: argparse.Namespace, version: str) -> Database: + if args.verbose: + os.environ["FR_TRACE_VERBOSE_OUTPUT"] = "1" + # temporary state used for building database + entries = {} + pg_config = {} + version_by_ranks = {} + for rank, dump in details.items(): + entries[rank] = dump["entries"] + version_by_ranks[rank] = dump["version"] + pg_config[rank] = dump["pg_config"] + + # Ensure version is consistent across all ranks. 
+ check_version(version_by_ranks, version) + entries = align_trace_from_beginning(entries) + + # flattened database + groups, _groups, memberships, _memberships, _pg_guids = build_groups_memberships(pg_config) + logger.debug("built groups, memberships") + + if not args.allow_incomplete_ranks: + check_no_missing_dump_files(entries, memberships) + + if args.just_print_entries: + just_print_entries(entries, _groups, _memberships, _pg_guids, args) + return None + + tracebacks, collectives, hccl_calls = build_collectives(entries, _groups, _memberships, _pg_guids, version) + logger.debug("built collectives, hccl_calls") + if args.verbose: + logger.debug("Groups") + logger.debug(tabulate(groups, headers=Group._fields)) + logger.debug("Memberships") + logger.debug(tabulate(memberships, headers=Membership._fields)) + logger.debug("Collectives") + logger.debug(tabulate(collectives, headers=Collective._fields)) + logger.debug("HCCLCalls") + logger.debug(tabulate(hccl_calls, headers=HCCLCall._fields)) + db = Database( + tracebacks=tracebacks, + collectives=collectives, + hcclcalls=hccl_calls, + groups=groups, + memberships=memberships, + ) + return db diff --git a/tools/flight_recorder/components/config_manager.py b/tools/flight_recorder/components/config_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..cc8df58fd7bf5d122fa016d3fb2cb2aeb39f96c3 --- /dev/null +++ b/tools/flight_recorder/components/config_manager.py @@ -0,0 +1,71 @@ +import argparse +import logging +from collections.abc import Sequence +from typing import Optional + +from tools.flight_recorder.components.fr_logger import FlightRecorderLogger + + +logger: FlightRecorderLogger = FlightRecorderLogger() + + +class JobConfig: + """ + A helper class to manage the script configuration. + """ + + def __init__(self: "JobConfig"): + self.parser = argparse.ArgumentParser(description="PyTorch Flight recorder analyzing script.") + self.parser.add_argument( + "trace_dir", + nargs="?", + help="Directory containing one trace file per rank, named with _.", + ) + self.parser.add_argument( + "--selected-ranks", + default=None, + nargs="+", + type=int, + help="List of ranks we want to show traces for.", + ) + self.parser.add_argument( + "--allow-incomplete-ranks", + action="store_true", + help=( + "FR trace require all ranks to have dumps for analysis. " + "This flag allows best-effort partial analysis of results " + "and printing of collected data." + ), + ) + self.parser.add_argument( + "--pg-filters", + default=None, + nargs="+", + type=str, + help=( + "List of filter strings, it could be pg name or pg desc. " + "If specified, only show traces for the given pg." + ), + ) + self.parser.add_argument("-o", "--output", default=None) + self.parser.add_argument( + "-p", + "--prefix", + help=( + "Common filename prefix to strip such that rank can be extracted. " + "If not specified, will attempt to infer a common prefix." 
+ ), + default=None, + ) + self.parser.add_argument("-j", "--just_print_entries", action="store_true") + self.parser.add_argument("-v", "--verbose", action="store_true") + + def parse_args(self: "JobConfig", args: Optional[Sequence[str]]) -> argparse.Namespace: + args = self.parser.parse_args(args) + if args.selected_ranks is not None and not args.just_print_entries: + raise ValueError("Cannot use --selected-ranks without --just-print-entries") + if args.pg_filters is not None and not args.just_print_entries: + raise ValueError("Cannot use --pg-filters without --just-print-entries") + if args.verbose: + logger.set_log_level(logging.DEBUG) + return args diff --git a/tools/flight_recorder/components/fr_logger.py b/tools/flight_recorder/components/fr_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..62cc2f7b88ebd032ac66ba5aebd54b13360c4c40 --- /dev/null +++ b/tools/flight_recorder/components/fr_logger.py @@ -0,0 +1,42 @@ +import logging +from typing import Any, Callable, Optional + + +class FlightRecorderLogger: + _instance: Optional[Any] = None + logger: logging.Logger + + def __init__(self) -> None: + self.logger: logging.Logger = logging.getLogger("Flight Recorder") + + def __new__(cls) -> Any: + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance.logger = logging.getLogger("Flight Recorder") + cls._instance.logger.setLevel(logging.INFO) + ch = logging.StreamHandler() + cls._instance.logger.addHandler(ch) + return cls._instance + + def set_log_level(self, level: int) -> None: + self.logger.setLevel(level) + + @property + def debug(self) -> Callable[..., None]: + return self.logger.debug + + @property + def info(self) -> Callable[..., None]: + return self.logger.info + + @property + def warning(self) -> Callable[..., None]: + return self.logger.warning + + @property + def error(self) -> Callable[..., None]: + return self.logger.error + + @property + def critical(self) -> Callable[..., None]: + return self.logger.critical diff --git a/tools/flight_recorder/components/loader.py b/tools/flight_recorder/components/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..b3c03c49bdc86e18f598865c3afe0fcad3257f09 --- /dev/null +++ b/tools/flight_recorder/components/loader.py @@ -0,0 +1,83 @@ +import os +import pickle +import re +from collections import defaultdict + +from tools.flight_recorder.components.fr_logger import FlightRecorderLogger +from tools.flight_recorder.components.utils import get_valid_read_path + +MAX_DEPTH = 3 + +logger: FlightRecorderLogger = FlightRecorderLogger() + +SAFE_CLASSES = { + # Built-in security type + "builtins": {"str", "int", "float", "list", "dict", "tuple"}, +} + +exp = re.compile(r"^([a-zA-Z0-9_]{0,100}?)(\d+)$") + + +class SafeUnpickler(pickle.Unpickler): + def find_class(self, module, name): + # Check if the module and class are in the whitelist + if module in SAFE_CLASSES and name in SAFE_CLASSES[module]: + return super().find_class(module, name) + raise pickle.UnpicklingError(f"Forbidden class: {module}.{name}") + + +def read_dump(prefix, filename): + basename = os.path.basename(filename) + try: + rank = int(basename[len(prefix):]) + except ValueError as e: + raise ValueError(f"Cannot extract rank from '{basename}' with prefix '{prefix}'.") from e + filename = get_valid_read_path(filename) + try: + with open(filename, "rb") as infile: + dump = SafeUnpickler(infile).load() + except Exception as e: + logger.error(f"Failed to load data from {filename}: {e}") + return rank, dump + + 
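+# Usage sketch with hypothetical file names: for trace files "fr_dump_0" and "fr_dump_1",
+# determine_prefix below matches each name against `exp` and infers the shared prefix "fr_dump_",
+# after which read_dump("fr_dump_", "<trace_dir>/fr_dump_1") returns (1, <dict loaded via
+# SafeUnpickler>); any pickled class outside SAFE_CLASSES is rejected as a forbidden class.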
+def determine_prefix(files): + possible_prefixes: defaultdict[str, set[int]] = defaultdict(set) + for f in files: + m = exp.search(f) + if m: + p, r = m.groups() + possible_prefixes[p].add(int(r)) + if len(possible_prefixes) == 1: + prefix = next(iter(possible_prefixes)) + return prefix + else: + raise ValueError( + "Unable to automatically determine the common prefix for the trace file names. " + "Please specify --prefix argument manually" + ) + + +def read_dir(args): + """Load recorder data for all ranks""" + prefix = args.prefix + path = args.trace_dir + details = {} + version = "" + for root, _, files in os.walk(path): + current_depth = root.count(os.sep) - path.count(os.sep) + if current_depth > MAX_DEPTH: + logger.error("The current file depth has exceeded the maximum depth limit, which is set to {MAX_DEPTH}.") + break + if prefix is None: + prefix = determine_prefix(files) + for f in files: + if "py_traceback" in f: + continue + if f.find(prefix) != 0: + continue + rank, dump = read_dump(prefix, os.path.join(root, f)) + details[rank] = dump + if not version: + version = str(details[rank]["version"]) + return details, version diff --git a/tools/flight_recorder/components/types.py b/tools/flight_recorder/components/types.py new file mode 100644 index 0000000000000000000000000000000000000000..40cb08a1a40522a917dffc1d6dc11ccc7c44a5f4 --- /dev/null +++ b/tools/flight_recorder/components/types.py @@ -0,0 +1,550 @@ +import math +import os +from enum import auto, Enum +from typing import ( + _eval_type, + Any, + Generic, + NamedTuple, + Optional, + TypeVar, +) + +from tools.flight_recorder.components.fr_logger import FlightRecorderLogger + + +T = TypeVar("T", bound=NamedTuple) + + +class Ref(Generic[T]): + pass + + +class TypeInfo(NamedTuple): + name: str + fields: list[tuple[str, type]] # type: ignore[type-arg] + + @classmethod + def from_type(cls, c: T) -> "TypeInfo": + if hasattr(c, "__name__"): + name = c.__name__ + else: + name = str(c) + return cls( + name, + [(f, _eval_type(c.__annotations__[f], globals(), {})) for f in c._fields], + ) + + +class MatchState(Enum): + """ + Enum representing the possible states of matching for collective operations. + + - FULLY_MATCHED: Indicates that all aspects of the collective operations match. + - COLLECTIVE_TYPE_MISMATCH: The types of the collective operations differ. + - SIZE_OR_SYNTAX_MISMATCH: There is a mismatch in input/output sizes or violation of collective syntax. + - COLLECTIVE_STATE_MISMATCH: + The states of the collective not same, such as one finished while another just started or scheduled. + - COLLECTIVE_DTYPE_MISMATCH: The data types of the collective input/output differ. + - UNDECIDED: + The match status is ambiguous or cannot be determined, e.g., we might need to check all ranks for alltoall_base. + """ + + FULLY_MATCHED = auto() + COLLECTIVE_TYPE_MISMATCH = auto() + SIZE_OR_SYNTAX_MISMATCH = auto() + COLLECTIVE_STATE_MISMATCH = auto() + COLLECTIVE_DTYPE_MISMATCH = auto() + UNDECIDED = auto() + + +class MatchInfo: + """ + Aside from the match state, we also store some dynamic info for the match such as the culprit rank + or collective state that caused the mismatch. 
+ """ + + def __init__(self, state: MatchState, culprit: Optional[str] = None) -> None: + self._state = state + self.culprit = culprit + + def __str__(self) -> str: + details = f", {self.culprit}" if getattr(self, "culprit", None) else "" + return f"Error type: {self._state.name}{details}" + + @property + def state(self) -> MatchState: + return self._state + + +class Group(NamedTuple): + id: str + desc: str + size: int + + +class Membership(NamedTuple): + group_id: str + global_rank: int + + +class Traceback(NamedTuple): + id: int + frames: str + + +class Collective(NamedTuple): + id: int + group_id: str + pass_check: bool + collective_seq_id: int + p2p_seq_id: int + record_id: int + pg_desc: str + collective_name: str + input_sizes: list[list[int]] + output_sizes: list[list[int]] + expected_ranks: set[int] + collective_state: str + collective_frames: list[dict[str, str]] + input_numel: Optional[int] = None + output_numel: Optional[int] = None + missing_ranks: Optional[set[int]] = None + mismatch_collectives: Optional[dict[int, "Collective"]] = None + type_of_mismatch: Optional[MatchInfo] = None + + +class HCCLCall(NamedTuple): + id: int + collective_id: Ref[Collective] + group_id: str + global_rank: int # technically Ref[Process] once we have it + traceback_id: Ref[Traceback] + collective_type: str + sizes: list[list[int]] + + +class Database(NamedTuple): + groups: list[Group] + memberships: list[Membership] + tracebacks: list[Traceback] + collectives: list[Collective] + hcclcalls: list[HCCLCall] + + +types = [ + TypeInfo.from_type(t) # type: ignore[type-var] + for t in globals().values() + if (isinstance(t, type) and issubclass(t, tuple) and hasattr(t, "_fields") and t is not TypeInfo) +] + + +COLLECTIVES = { + "broadcast", + "_broadcast_oop", + "reduce", + "_reduce_oop", + "all_gather", + "all_reduce", + "_all_gather_base", + "all_gather_into_tensor_coalesced", + "reduce_scatter", + "reduce_scatter_tensor_coalesced", + "_reduce_scatter_base", + "gather", + "scatter", + "all_to_all", + "all_reduce_barrier", + "allreduce_coalesced", + "ALLGATHER_coalesced", + "REDUCE_SCATTER_coalesced", +} + +P2P = { + "send", + "recv", +} + + +class EntryState: + """ + Util class to keep track of the state of an entry and standardize the way we + log the error info during analysis. 
+ """ + + def __init__(self, entry: dict[str, Any], expected_ranks: set[int]) -> None: + self.pg_name = entry["process_group"][0] + self.desc = entry["process_group"][1] + self.pg_desc = f"{self.pg_name}:{self.desc}" if self.desc != "undefined" else self.pg_name + self.profiling_name = entry["profiling_name"] + self.collective_seq_id = entry["collective_seq_id"] + self.p2p_seq_id = entry["p2p_seq_id"] + self.record_id = entry["record_id"] + self.input_sizes = entry["input_sizes"] + self.output_sizes = entry["output_sizes"] + self.collective_state = entry["state"] + self.collective_frames = entry.get("frames", []) + self.expected_ranks = expected_ranks + self.missing_ranks: set[int] + self.input_numel: int + self.output_numel: int + self.errors: set[tuple[int, MatchInfo]] + + + def log( + self, + logger: FlightRecorderLogger, + logger_msg: str, + frame_formatter: Any, + additional_info: dict = None, + ) -> None: + logger.info( + logger_msg, + self.collective_seq_id, + ) + logger.info("internal record id: %s", self.record_id) + logger.info("group info: %s", self.pg_desc) + logger.info("collective: %s", self.profiling_name) + if additional_info and "missing_ranks" in additional_info: + missing_ranks = additional_info["missing_ranks"] + self.missing_ranks = missing_ranks + logger.info("missing ranks: %s", missing_ranks) + if additional_info and "total_numel" in additional_info: + total_numel = additional_info["total_numel"] + self.input_numel = total_numel[0] + self.output_numel = total_numel[1] + logger.info("total input numel: %d", total_numel[0]) + logger.info("total output numel: %d", total_numel[1]) + logger.info("input sizes: %s", self.input_sizes) + logger.info("output sizes: %s", self.output_sizes) + logger.info("world size: %d", len(self.expected_ranks)) + logger.info("expected ranks: %s", str(self.expected_ranks)) + logger.info("collective state: %s", self.collective_state) + if additional_info and "errors" in additional_info: + errors = additional_info["errors"] + self.errors = errors + error_msg = ", ".join(f"Culprit rank {error[0]}; {str(error[1])}" for error in errors) + logger.info("error msg: %s", error_msg) + logger.info("collective stack trace: \n %s", frame_formatter(self.collective_frames)) + + def to_collective( + self, + collective_id: int, + errors: Optional[set[tuple[int, MatchInfo]]] = None, + idx_map: Optional[dict[int, int]] = None, + all_entries: Optional[dict[int, list[dict[str, Any]]]] = None, + ) -> Collective: + if not errors: + return Collective( + id=collective_id, + group_id=self.pg_name, + record_id=self.record_id, + pg_desc=self.pg_desc, + pass_check=True, + collective_seq_id=self.collective_seq_id, + p2p_seq_id=self.p2p_seq_id, + collective_name=self.profiling_name, + input_sizes=self.input_sizes, + output_sizes=self.output_sizes, + expected_ranks=self.expected_ranks, + collective_state=self.collective_state, + collective_frames=self.collective_frames, + missing_ranks=getattr(self, "missing_ranks", None), + ) + else: + if idx_map is None: + raise ValueError("idx_map cannot be None") + if all_entries is None: + raise ValueError("all_entries cannot be None") + mismatch_collectives = {} + for rank, error in errors: + idx = idx_map[rank] + entry = all_entries[rank][idx] + desc = entry["process_group"][1] + pg_name = entry["process_group"][0] + mismatch_collectives[rank] = Collective( + id=collective_id, + group_id=entry["process_group"][0], + record_id=entry["record_id"], + pg_desc=f"{pg_name}:{desc}" if desc != "undefined" else pg_name, + pass_check=False, + 
collective_seq_id=entry["collective_seq_id"], + p2p_seq_id=entry["p2p_seq_id"], + collective_name=entry["profiling_name"], + input_sizes=entry["input_sizes"], + output_sizes=entry["output_sizes"], + expected_ranks=self.expected_ranks, + collective_state=entry["state"], + collective_frames=entry.get("frames", []), + type_of_mismatch=error, + ) + return Collective( + id=collective_id, + group_id=self.pg_name, + record_id=self.record_id, + pg_desc=self.pg_desc, + pass_check=False, + collective_seq_id=self.collective_seq_id, + p2p_seq_id=self.p2p_seq_id, + collective_name=self.profiling_name, + input_sizes=self.input_sizes, + output_sizes=self.output_sizes, + expected_ranks=self.expected_ranks, + collective_state=self.collective_state, + collective_frames=self.collective_frames, + input_numel=self.input_numel if hasattr(self, "input_numel") else None, + output_numel=self.output_numel if hasattr(self, "output_numel") else None, + missing_ranks=self.missing_ranks if hasattr(self, "missing_ranks") else None, + mismatch_collectives=mismatch_collectives, + ) + + def to_hccl_call( + self, + all_entries: dict[int, list[dict[str, Any]]], + idx_map: dict[int, int], + hccl_call_id: int, + collective_id: Any, + ) -> list[HCCLCall]: + result = [] + for i, k in idx_map.items(): + all_entries[i].pop(k) + result.append( + HCCLCall( + id=hccl_call_id, + collective_id=collective_id, + group_id=self.pg_name, # type: ignore[arg-type] + global_rank=i, + traceback_id=0, # type: ignore[arg-type] + collective_type=self.profiling_name, + sizes=self.input_sizes, + ) + ) + hccl_call_id += 1 + return result + + +class Op: + """Parses relevant info about operation out of 'event' dict + + examples of supported `profiling_name`s: + hccl:broadcast + hccl:send 1->2 + hccl:recv 3<-0 + """ + MISSING_FRAMES_ERR = "Event missing 'frames' field or empty frames array" + INVALID_FRAME_ERR = "Frame[0] missing 'name' field" + + + def __init__(self, event: dict[Any, Any], memberships: dict[str, set[Any]], pg_name: str): + + frames = event.get("frames") + if not frames: + raise ValueError(self.MISSING_FRAMES_ERR) + first_frame = frames[0] if len(frames) > 0 else None + if not first_frame: + raise ValueError(self.MISSING_FRAMES_ERR) + self.profiling_name = first_frame.get("name") + if self.profiling_name is None: + raise ValueError(self.INVALID_FRAME_ERR) + parts = self.profiling_name.split(":") + self.type = parts[0] + meta = parts[1] if len(parts) == 2 else None + self.state = event.get("state") + self.pg_name, self.pg_desc = event.get("process_group") + if type == "send": + s, d = meta.split("->") + self._src, self._dst = int(s), int(d) + elif type == "recv": + d, s = meta.split("<-") + self._dst, self._src = int(d), int(s) + else: + self._src, self._dst = -1, -1 + self._init_global_src_dst(memberships[pg_name]) + self.pg_size = len(memberships[pg_name]) + if type in P2P | COLLECTIVES: + self.input_sizes = event.get("input_sizes") + self.output_sizes = event.get("output_sizes") + else: + self.input_sizes, self.output_sizes = None, None + self.collective_seq_id = event.get("collective_seq_id") + self.p2p_seq_id = event.get("p2p_seq_id") + self.input_dtypes = event.get("input_dtypes") + self.output_dtypes = event.get("output_dtypes") + self.time_created_ns = event.get("time_created_ns") + self.collective_frames = event.get("frames", []) + self.is_verbose = os.getenv("FR_TRACE_VERBOSE_OUTPUT", "0") == "1" + + def _init_global_src_dst(self, pg_ranks: set[Any]) -> None: + pg_ranks = sorted(pg_ranks) + self._src_g = pg_ranks[self._src] if 
self._src is not None else None + self._dst_g = pg_ranks[self._dst] if self._dst is not None else None + + @property + def src(self) -> int: + if self.type not in P2P: + raise ValueError(f"Can't get src of non-p2p op (type: {self.type})") + return self._src + + @property + def dst(self) -> int: + if self.type not in P2P: + raise ValueError(f"Can't get dst of non-p2p op (type: {self.type})") + return self._dst + + def __repr__(self) -> str: + p2p_info = "" + if self.type in P2P: + p2p_info = f"s={self._src_g} d={self._dst_g}" + if self.is_verbose: + verbose_info = ( + f"timestamp_created={self.time_created_ns}", + p2p_info, + f"input_sizes={self.input_sizes}", + f"output_sizes={self.output_sizes}", + f"input_dtypes={self.input_dtypes}", + f"output_dtypes={self.output_dtypes}", + "collective_seq_id | p2p_seq_id=" f"{self.p2p_seq_id if self.type in P2P else self.collective_seq_id}", + f"pg_name={self.pg_name}", + f"pg_description={self.pg_desc}", + f"pg_size={self.pg_size}", + f"state={self.state}", + ) + return f"{self.type}({', '.join(s for s in verbose_info if s)})" + return f"{self.type}(%sinput_sizes={self.input_sizes}, state={self.state})" % ( + f"{p2p_info}, " if p2p_info else "" + ) + + def has_different_dtypes_and_non_empty_sizes(self, other): + """ + Check if the input/output dtypes are different and the sizes are non-empty. + """ + # Check if input/output dtypes are different and sizes are non-empty + condition1 = set(self.input_dtypes) != set(self.output_dtypes) and self.input_sizes[0] and self.output_sizes[0] + condition2 = set(self.input_dtypes) != set(other.input_dtypes) and self.input_sizes[0] and other.input_sizes[0] + condition3 = ( + set(self.input_dtypes) != set(other.output_dtypes) and self.input_sizes[0] and other.output_sizes[0] + ) + return condition1 or condition2 or condition3 + + def match(self, other: "Op") -> MatchInfo: + if self.type == "send": + return ( + MatchInfo(MatchState.FULLY_MATCHED) + if ( + other.type == "recv" + and self.src == other.src + and self.dst == other.dst + and self.input_sizes == other.output_sizes + ) + else MatchInfo(MatchState.SIZE_OR_SYNTAX_MISMATCH) + ) + elif self.type == "recv": + return ( + MatchInfo(MatchState.FULLY_MATCHED) + if ( + other.type == "send" + and self.src == other.src + and self.dst == other.dst + and self.output_sizes == other.input_sizes + ) + else MatchInfo(MatchState.SIZE_OR_SYNTAX_MISMATCH) + ) + elif self.type in COLLECTIVES: + if self.type != other.type: + return MatchInfo( + MatchState.COLLECTIVE_TYPE_MISMATCH, + f"Expected collective type: '{self.type}' does not match found collective type: '{other.type}'", + ) + if self.state != other.state: + return MatchInfo( + MatchState.COLLECTIVE_STATE_MISMATCH, + f"Expected state: '{self.state}' does not match found state: '{other.state}'", + ) + if self.has_different_dtypes_and_non_empty_sizes(self): + return MatchInfo( + MatchState.COLLECTIVE_DTYPE_MISMATCH, + f"Expected dtypes: '{set(self.input_dtypes)}' does not " + f"match found dtype: '{set(self.output_dtypes)}/" + f"{set(other.input_dtypes)}/{set(other.output_dtypes)}'", + ) + if self.type == "all_to_all": + return MatchInfo(MatchState.UNDECIDED) + if self.type != "scatter" and self.input_sizes != other.input_sizes: + return MatchInfo( + MatchState.SIZE_OR_SYNTAX_MISMATCH, + f"Expected input sizes: '{self.input_sizes}' does not match found input sizes: " + f"'{other.input_sizes}'", + ) + if self.type != "gather" and self.output_sizes != other.output_sizes: + return MatchInfo( + 
MatchState.SIZE_OR_SYNTAX_MISMATCH, + f"Expected output sizes: '{self.output_sizes}' does not match found output sizes: " + f"'{other.output_sizes}'", + ) + if self.type in ["all_reduce", "allreduce_coalesced"] and self.input_sizes != other.output_sizes: + return MatchInfo( + MatchState.SIZE_OR_SYNTAX_MISMATCH, + f"Expected input sizes: '{self.input_sizes}' does not match found output sizes: '{other.output_sizes}'", + ) + if self.type in [ + "all_gather", + "all_gather_base", + "all_gather_into_tensor_coalesced", + ] and not (math.prod(other.output_sizes[0]) == math.prod(self.input_sizes[0]) * self.pg_size): + return MatchInfo( + MatchState.SIZE_OR_SYNTAX_MISMATCH, + f"Found input numel '{math.prod(other.input_sizes[0])} * pg size {self.pg_size}' " + f"does not match output numel '{math.prod(other.output_sizes[0])}'", + ) + if self.type in [ + "reduce_scatter", + "_reduce_scatter_base", + "reduce_scatter_tensor_coalesced", + ] and not (math.prod(other.input_sizes[0]) == math.prod(self.output_sizes[0]) * self.pg_size): + return MatchInfo( + MatchState.SIZE_OR_SYNTAX_MISMATCH, + f"Found input numel '{math.prod(other.input_sizes[0])}' does not match output numel " + f"'{math.prod(other.output_sizes[0])} * pg size {self.pg_size}'", + ) + elif self.type in [ + "coalesced", + "ALLGATHER_coalesced", + "REDUCE_SCATTER_coalesced", + ]: + return ( + MatchInfo(MatchState.FULLY_MATCHED) + if (other.type == self.type) + else MatchInfo(MatchState.SIZE_OR_SYNTAX_MISMATCH) + ) + return MatchInfo(MatchState.FULLY_MATCHED) + + +class MatchStateRecord: + def __init__( + self, + expected_ranks: set[int], + other_ranks: list[int], + entry_state: EntryState, + candidate_ranks: set[int], + candidate_idx: dict[int, int], + found_ranks: set[int], + found_idx: dict[int, int], + errors: set[tuple[int, MatchInfo]], + ) -> None: + self.expected_ranks = expected_ranks + self.other_ranks = other_ranks + self.entry_state = entry_state + self.candidate_ranks = candidate_ranks + self.candidate_idx = candidate_idx + self.found_ranks = found_ranks + self.found_idx = found_idx + self.errors = errors + self.has_undecided_case = False + + def reset_for_coalesced(self, entry_state: EntryState, candidate_ranks: set[int]) -> None: + self.entry_state = entry_state + self.candidate_ranks = candidate_ranks + self.candidate_idx = {} + self.found_ranks = set() + self.found_idx = {} + self.errors = set() diff --git a/tools/flight_recorder/components/utils.py b/tools/flight_recorder/components/utils.py index 297334ba3851b74c0cf92661220b409b4a92f019..c2f503ccb04a25c991b1eaf5c53fcc3bac6722f2 100644 --- a/tools/flight_recorder/components/utils.py +++ b/tools/flight_recorder/components/utils.py @@ -1,9 +1,28 @@ +import argparse +import math +from typing import Any import os import re import sys import stat +from tools.flight_recorder.components.fr_logger import FlightRecorderLogger +from tools.flight_recorder.components.types import ( + Group, + MatchInfo, + MatchState, + MatchStateRecord, + Membership, + Op, +) + __all__ = [] +logger: FlightRecorderLogger = FlightRecorderLogger() + +try: + from tabulate import tabulate +except ModuleNotFoundError: + logger.debug("tabulate is not installed. 
Proceeding without it.") PATH_WHITE_LIST_REGEX = re.compile(r"[^_A-Za-z0-9/.-]") MAX_READ_FILE_SIZE_4G = 4294967296 # 4G, 4 * 1024 * 1024 * 1024 @@ -67,3 +86,316 @@ def get_valid_read_path(path, size_max=MAX_READ_FILE_SIZE_4G, check_user_stat=Tr if not is_dir and size_max > 0 and file_stat.st_size > size_max: raise ValueError("The file {} exceeds size limitation of {}.".format(path, size_max)) return real_path + + +def check_write_directory(dir_name, check_user_stat=True): + real_dir_name = get_valid_path(dir_name) + if not os.path.isdir(real_dir_name): + raise ValueError("The file writen directory {} doesn't exists.".format(dir_name)) + + file_stat = os.stat(real_dir_name) + if check_user_stat and not sys.platform.startswith("win") and not is_belong_to_user_or_group(file_stat): + raise ValueError("The file writen directory {} doesn't belong to the current user or group.".format(dir_name)) + if not os.access(real_dir_name, os.W_OK): + raise ValueError("Current user doesn't have writen permission to file writen directory {}.".format(dir_name)) + + +def get_valid_write_path(path, check_user_stat=True, is_dir=False, warn_exists=True): + real_path = get_valid_path(path) + real_path_dir = real_path if is_dir else os.path.dirname(real_path) + check_write_directory(real_path_dir, check_user_stat=check_user_stat) + + if not is_dir and os.path.exists(real_path): + if os.path.isdir(real_path): + raise ValueError("The file {} exist and is a directory.".format(path)) + if check_user_stat and os.stat(real_path).st_uid != os.getuid(): # Has to be exactly belonging to current user + raise ValueError("The file {} doesn't belong to the current user.".format(path)) + if check_user_stat and os.stat(real_path).st_mode & WRITE_FILE_NOT_PERMITTED_STAT > 0: + raise ValueError("The file {} permission for others is not 0, or is group writable.".format(path)) + if not os.access(real_path, os.W_OK): + raise ValueError("The file {} exist and not writable.".format(path)) + if warn_exists: + logger.warning("%s already exist. 
The original file will be overwritten.", path) + return real_path + + +def format_frame(frame: dict[str, str]) -> str: + name = frame["name"] + filename = frame["filename"] + line = frame["line"] + return f"{name} at {filename}:{line}" + + +def format_frames(frames: list[dict[str, str]]) -> str: + formatted_frames = [] + for frame in frames: + formatted_frames.append(format_frame(frame)) + return "\n".join(formatted_frames) + + +def match_one_event( + event_a: dict[Any, Any], + event_b: dict[Any, Any], + memberships: dict[str, set[Any]], + pg_name: str, +) -> MatchInfo: + op_a = Op(event_a, memberships, pg_name) + op_b = Op(event_b, memberships, pg_name) + return op_a.match(op_b) + + +def check_size_alltoall(alltoall_cases: list[dict[str, Any]]) -> tuple[bool, int, int]: + input_numel = 0 + output_numel = 0 + for e in alltoall_cases: + input_numel += math.prod(e["input_sizes"][0]) + output_numel += math.prod(e["output_sizes"][0]) + return input_numel != output_numel, input_numel, output_numel + + +class ProcessGroupData: + def __init__(self, pg_guids: dict[tuple[str, int], str], pg_name: str, desc: str, mismatch: dict[str, int]): + self.pg_guids, self.pg_name, self.desc, self.mismatch = pg_guids, pg_name, desc, mismatch + + +def check_current_entry_match( + all_entries: dict[int, list[dict[str, Any]]], + current_entry: dict[str, Any], + _memberships: dict[str, set[Any]], + pg_data: ProcessGroupData, + match_record: MatchStateRecord, +) -> None: + pg_guids, pg_name, mismatch, desc = pg_data.pg_guids, pg_data.pg_name, pg_data.mismatch, pg_data.desc + for rank in match_record.expected_ranks.intersection(set(match_record.other_ranks)): + for entry_idx, entry in enumerate(all_entries[rank]): + # step over ops from other PGs + # only check match state when seq_id matches + if ( + pg_guids[(entry["process_group"][0], rank)] == pg_name + and entry["collective_seq_id"] == match_record.entry_state.collective_seq_id + ): + match_info = match_one_event(current_entry, entry, _memberships, pg_name) + if match_info.state in [MatchState.FULLY_MATCHED, MatchState.UNDECIDED] and mismatch[pg_name] == 0: + match_record.found_ranks.add(rank) + match_record.found_idx[rank] = entry_idx + match_record.has_undecided_case = match_info.state == MatchState.UNDECIDED + else: + match_record.candidate_ranks.add(rank) + match_record.candidate_idx[rank] = entry_idx + if match_info.state not in [ + MatchState.FULLY_MATCHED, + MatchState.UNDECIDED, + ]: + match_record.errors.add((rank, match_info)) + break + + +class EntryContext: + def __init__(self, all_entries, current_entry, dumps_ranks, first_rank): + self.all_entries = all_entries + self.current_entry = current_entry + self.dumps_ranks = dumps_ranks + self.first_rank = first_rank + + +def error_analysis( + entry_context: EntryContext, + match_record: MatchStateRecord, # all + mismatch: dict[str, int], # all + version: tuple[int, int], # 2 + pg_name: str, # all, mismatch +) -> None: + all_entries = entry_context.all_entries + current_entry = entry_context.current_entry + dumps_ranks = entry_context.dumps_ranks + first_rank = entry_context.first_rank + major_v, minor_v = version[0], version[1] + # case one: not every rank join the collective or in the flight recorder. 
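The `error_analysis` helper above classifies each collective entry into one of four cases using plain set arithmetic over rank IDs. As a reading aid, here is a minimal, self-contained sketch of the "case one" test that follows; all rank sets below are hypothetical values, not taken from a real dump:

```python
# Hypothetical rank sets illustrating the "case one" test in error_analysis:
# rank 3 was expected but never produced a matching flight-recorder entry.
expected_ranks = {0, 1, 2, 3}   # ranks that belong to the process group
found_ranks = {0, 1}            # ranks whose entries fully matched
candidate_ranks = {2}           # ranks seen but not (yet) matched
dumps_ranks = {0, 1, 2, 3}      # ranks for which a dump file exists

joined = candidate_ranks | found_ranks
missing = expected_ranks - joined
# Flag the entry only when every missing rank actually produced a dump;
# otherwise we cannot tell a hung rank apart from a missing dump file.
if joined != expected_ranks and missing <= dumps_ranks:
    print(f"Not all ranks joined the collective, missing ranks: {missing}")
```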
+ if ( + match_record.candidate_ranks | match_record.found_ranks + ) != match_record.expected_ranks and match_record.expected_ranks - ( + match_record.candidate_ranks | match_record.found_ranks + ) <= dumps_ranks: + mismatch[pg_name] += 1 + logger_msg = "Not all ranks joining collective, sequence number: %s" + missing_ranks = match_record.expected_ranks - (match_record.candidate_ranks | match_record.found_ranks) + match_record.entry_state.log( + logger, logger_msg, format_frames, additional_info={"missing_ranks": missing_ranks} + ) + match_record.candidate_ranks.update(match_record.found_ranks) + match_record.candidate_idx.update(match_record.found_idx) + match_record.found_idx.clear() + match_record.found_ranks.clear() + elif len(match_record.candidate_ranks) == 1 and dumps_ranks == match_record.expected_ranks: + # case two: alltoall or alltoall_base case. + if match_record.has_undecided_case: + alltoall_cases = [current_entry] + [ + all_entries[rank][match_record.found_idx[rank]] for rank in match_record.found_ranks + ] + fail_check, total_input_numel, total_output_numel = check_size_alltoall(alltoall_cases) + if major_v <= 2 and minor_v <= 3: + # We don't log the input/output sizes for alltoall before v2.4, + # so we don't consider the size mismatch as an error for now. + fail_check = False + if fail_check: + # When we see errors in all_to_all, it's hard to tell which rank is the source of the error. + mismatch[pg_name] += 1 + logger_msg = "Input/output mismatch in the collective sequence number: %s" + match_record.entry_state.log( + logger, + logger_msg, + format_frames, + additional_info={"total_numel": (total_input_numel, total_output_numel)}, + ) + match_record.candidate_ranks.update(match_record.found_ranks) + match_record.candidate_idx.update(match_record.found_idx) + match_record.found_idx.clear() + match_record.found_ranks.clear() + match_record.errors.add((first_rank, MatchInfo(MatchState.SIZE_OR_SYNTAX_MISMATCH))) + else: + match_record.found_ranks.update(match_record.candidate_ranks) + match_record.found_idx.update(match_record.candidate_idx) + match_record.candidate_idx.clear() + match_record.candidate_ranks.clear() + # case three: all joined and everything matches on all ranks. + else: + match_record.found_ranks.update(match_record.candidate_ranks) + match_record.found_idx.update(match_record.candidate_idx) + match_record.candidate_idx.clear() + match_record.candidate_ranks.clear() + # case four: mismatch cases due to not same type, size mismatch or state mismatch. + elif len(match_record.errors) > 0: + mismatch[pg_name] += 1 + logger_msg = "Collective sequence number: %s has errors" + match_record.entry_state.log(logger, logger_msg, format_frames, errors=match_record.errors) + match_record.candidate_ranks.update(match_record.found_ranks) + match_record.candidate_idx.update(match_record.found_idx) + match_record.found_idx.clear() + match_record.found_ranks.clear() + # partial analysis case when we cannot decide what's wrong with this collective entry. + else: + match_record.candidate_ranks.update(match_record.found_ranks) + match_record.candidate_idx.update(match_record.found_idx) + match_record.found_idx.clear() + match_record.found_ranks.clear() + if match_record.expected_ranks - dumps_ranks: + mismatch[pg_name] += 1 + logger.info( + "We cannot decide what's wrong with this collective entry " + "because we missed FR dumps from ranks (%s) so we don't have enough " + "information. 
If you want to debug further use -j to dump all raw trace", + str(match_record.expected_ranks - dumps_ranks), + ) + else: + logger.info( + "No errors found for this collective entry, There could be some " + "other reasons why we see collective timeout." + ) + + +def just_print_entries( + all_entries: dict[int, list[dict[str, Any]]], + _groups: dict[str, Group], + _memberships: dict[str, set[Any]], + _pg_guids: dict[tuple[str, int], str], + args: argparse.Namespace, +) -> None: + rows = [] + ranks = sorted(all_entries.keys()) + headers = [f"Rank {rank}" for rank in ranks if args.selected_ranks is None or rank in args.selected_ranks] + progress = True + while progress: + progress = False + row = [] + for rank in ranks: + if args.selected_ranks is not None and rank not in args.selected_ranks: + continue + if len(all_entries[rank]) == 0: + row.append("") + else: + entry = all_entries[rank].pop(0) + pg_name = _pg_guids[(entry["process_group"][0], rank)] + if ( + args.pg_filters is None + or entry["process_group"][1] in args.pg_filters + or entry["process_group"][0] in args.pg_filters + ): + row.append(str(Op(entry, _memberships, pg_name))) + else: + row.append("") + progress = True + if progress: + rows.append(row) + + logger.info(tabulate(rows, headers=headers)) + + +def check_no_missing_dump_files(entries: dict[int, Any], memberships: list[Membership]) -> None: + all_ranks = {int(m.global_rank) for m in memberships} + + dumps_ranks = {int(key) for key in entries.keys()} + missing_ranks = all_ranks - dumps_ranks + if missing_ranks: + raise ValueError( + f"Missing dump files for {len(missing_ranks)} ranks: {sorted(missing_ranks)}\n" + f"Expected ranks: {sorted(all_ranks)}\n" + f"Found dumps for: {sorted(dumps_ranks)}" + ) + + +def check_version(version_by_ranks: dict[str, str], expected_version: str) -> None: + for rank, actual_version in version_by_ranks.items(): + if actual_version != expected_version: + raise ValueError(f"Version mismatch at rank {rank}: " f"expected {expected_version}, got {actual_version}") + + +def get_version_detail(version_str: str) -> tuple[int, int]: + parts = version_str.split(".") + if len(parts) != 2: + raise ValueError(f"Invalid version format: expected 'X.Y', got '{version_str}'") + + try: + major, minor = int(parts[0]), int(parts[1]) + except ValueError as e: + raise ValueError(f"Version components must be integers: '{version_str}'") from e + + return major, minor + + +def align_trace_from_beginning( + entries: dict[int, list[dict[str, Any]]], +) -> dict[int, list[dict[str, Any]]]: + """ + Align the trace entries by record ID for entries. + This function takes a dictionary of rank names to lists of trace entries as input. + Each trace entry is a dictionary containing information about a collective operation, + including its unique identifier (`record_id` is monotonically increasing as we write into the ring buffer). + The function finds the largest starting point across all ranks by taking the maximum + `record_id` value of the first entry in each rank. Finally, it filters out any + entries with `record_id` values less than the maximum starting point. + The function returns the updated dictionary of sorted and filtered trace entries. + + Args: + entries (Dict[str, List[Dict[str, Any]]]): A dictionary of rank names to lists of trace entries. + + Returns: + entries (Dict[str, List[Dict[str, Any]]]): Entries sorted by record ID and filtered by the maximum starting point. 
+ """ + + maximum_starting_record_id = 0 + for rank in entries: + # Although this is a ring buffer, we already sort the entries by `record_id` when dumping, we just + # need to find the largest starting point. For example, if the buffer has the following entries: + # Rank 0: [0, 1, 2, 3, 4, 5, 6] + # Rank 1: [1, 2, 3, 4, 5, 6, 7] + # Rank 2: [2, 3, 4, 5, 6, 7, 8] + # Rank 3: [0, 1, 2, 3, 4, 5, None] + # Then we should start from collective 2 not 0 because any collective before, + # we don't have complete records from all ranks so we need to ignore them. + first_record_id = entries[rank][0]["record_id"] + maximum_starting_record_id = max(maximum_starting_record_id, first_record_id) + + for rank in entries: + entries[rank] = [entry for entry in entries[rank] if entry["record_id"] >= maximum_starting_record_id] + + return entries diff --git a/tools/flight_recorder/flight_recorder.md b/tools/flight_recorder/flight_recorder.md index dd78e0e2c6f95877e93bac8ee83bbf8a5b0226ea..d3bd819e24871e307c26d639f70d5dff41ef2bd2 100644 --- a/tools/flight_recorder/flight_recorder.md +++ b/tools/flight_recorder/flight_recorder.md @@ -1,18 +1,11 @@ -# 飞行记录器超时类问题分析 +# 飞行记录器分析 训练任务卡住是阻塞AI大规模分布式集群训练任务的主要和关键问题,当前需要等待集合通信超时才能感知,影响集群可用性。框架需要支持检测训练任务卡住问题,做到提前识别并保存必要的诊断信息,提高问题定位效率和集群设备可用性。当HeartbeatMonitor长时间未检测到心跳时,即可认为训练任务已经卡住,需要触发诊断信息保存。 +主要功能包括:解析多个 rank 的跟踪数据文件、重建进程组和成员关系、检测通信不匹配(类型、大小、状态等、提供详细的错误报告和调试信息 -本工具提供torch npu上飞行记录器flight recorder记录日志的读取解析能力,并根据解析后的日志提供超时类问题的初步分析能力,主要支持以下三种情况的超时类问题的识别和分析 +# 使用方法 -|问题| 具体内容 | -| --- | --- | -|类型一 | 同通信域内的某张卡计算超时,导致其他卡等待触发飞行记录器和hccl time out | -|类型二 | 同通信域内的通信算子之后的非通信任务耗时过长| -|类型三 | 同通信域内的某个通信算子进行通信时执行超时 | - -## 使用方法 - -### 1 飞行记录器开启方法 +## 1 飞行记录器开启方法 按照如下方法设置环境变量开启飞行记录器 @@ -24,24 +17,27 @@ export TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC=20 # 用于控制心跳超时时间, export TORCH_HCCL_DEBUG_INFO_TEMP_FILE=/tmp/ #保存诊断信息的文件路径 ``` -### 2 工具使用方法 +## 2 工具使用方法 ``` -python fr_trace.py <--prefix prefix for files> +python fr_trace.py [options] ``` -脚本从命令行参数传入 `dump dir` 的值,为必选值。 - -* `dump dir`:从命令行第一个参数获取 +## 3 入参说明 -| 参数名| 含义 | 使用限制 | -| --- | --- | --- | -| dump dir | 飞行记录器的分析路径 | 必选。数据类型:string 待分析的飞行记录器的日志路径 | -| -p, --prefix | 文件路径前缀 | 可选。数据类型:str 日志文件共同前缀,如果不指定将自动判断生成 | +| 参数名 | 含义 | 使用限制 | +| --- | --- | --- | +| `trace_dir` | 包含跟踪文件的目录,每个 rank 一个文件,命名格式为 `_`。 | 可选。数据类型:string | +| `--selected-ranks` | 指定需要展示跟踪的 rank 列表。 | 可选。数据类型:int 列表。必须与 `--just-print-entries` 同时使用。 | +| `--allow-incomplete-ranks` | 允许对不完整的 rank 数据进行尽力分析并输出结果。 | 可选。无参数值。默认关闭。 | +| `--pg-filters` | 指定需要展示的进程组(PG)的过滤条件列表,可以是 PG 名称或描述。 | 可选。数据类型:string 列表。必须与 `--just-print-entries` 同时使用。 | +| `-o`, `--output` | 指定输出文件路径。 | 可选。数据类型:string。默认无输出文件。 | +| `-p`, `--prefix` | 指定文件名前缀,用于提取 rank。如果不指定,将尝试推断一个公共前缀。 | 可选。数据类型:string。默认无前缀。 | +| `-j`, `--just-print-entries` | 仅打印跟踪条目,不进行完整分析。 | 可选。无参数值。默认关闭。 | +| `-v`, `--verbose` | 启用详细日志输出。 | 可选。无参数值。默认关闭。 | +## 使用示例 -### 3 输出示例 - -``` -2025-02-19 08:10:07,162 - INFO - The pg_id 0's rank 0's Computational task took too long, causing the other ranks' HCCL task to time out. 
-``` +```bash +python script.py trace_dir --selected-ranks 0 1 2 --allow-incomplete-ranks --pg-filters pg1 pg2 -o output.pkl -p prefix_ -j -v +``` \ No newline at end of file diff --git a/tools/flight_recorder/fr_trace.py b/tools/flight_recorder/fr_trace.py index 7999d947bceb1f41f0d0789502dc11919805dc8e..aa20e48e88be71418954ac6057f61efb0d0ad4a8 100644 --- a/tools/flight_recorder/fr_trace.py +++ b/tools/flight_recorder/fr_trace.py @@ -1,186 +1,25 @@ -import os -import re +from collections.abc import Sequence +from typing import Optional import pickle -import logging -from collections import defaultdict -import argparse -from components.utils import get_valid_read_path +from tools.flight_recorder.components.builder import build_db +from tools.flight_recorder.components.config_manager import JobConfig +from tools.flight_recorder.components.loader import read_dir +from tools.flight_recorder.components.types import types +from tools.flight_recorder.components.utils import get_valid_read_path, get_valid_write_path -__all__ = [] -exp = re.compile(r"^([a-zA-Z0-9_]{0,100}?)(\d+)$") -MAX_DEPTH = 3 +def main(args: Optional[Sequence[str]] = None) -> None: + config = JobConfig() + args = config.parse_args(args) + get_valid_read_path(args.trace_dir, is_dir=True) -logging.basicConfig( - level=logging.INFO, # Set the log level to INFO - format="%(asctime)s - %(levelname)s - %(message)s", # Set format - handlers=[logging.StreamHandler()], # Output to console -) - - -SAFE_CLASSES = { - # Built-in security type - "builtins": {"str", "int", "float", "list", "dict", "tuple"}, -} - - -class SafeUnpickler(pickle.Unpickler): - def find_class(self, module, name): - # Check if the module and class are in the whitelist - if module in SAFE_CLASSES and name in SAFE_CLASSES[module]: - return super().find_class(module, name) - raise pickle.UnpicklingError(f"Forbidden class: {module}.{name}") - - -def determine_prefix(files): - possible_prefixes: defaultdict[str, set[int]] = defaultdict(set) - for f in files: - m = exp.search(f) - if m: - p, r = m.groups() - possible_prefixes[p].add(int(r)) - if len(possible_prefixes) == 1: - prefix = next(iter(possible_prefixes)) - return prefix - else: - raise ValueError( - "Unable to automatically determine the common prefix for the trace file names. 
" - "Please specify --prefix argument manually" - ) - - -def load_recorder_data(path, prefix): - """Load recorder data for all ranks""" - details = {} - for root, _, files in os.walk(path): - current_depth = root.count(os.sep) - path.count(os.sep) - if current_depth > MAX_DEPTH: - logging.error("The current file depth has exceeded the maximum depth limit, which is set to {MAX_DEPTH}.") - break - if prefix is None: - prefix = determine_prefix(files) - for f in files: - if "py_traceback" in f: - continue - if f.find(prefix) != 0: - continue - rank, dump = read_dump(prefix, os.path.join(root, f)) - details[rank] = dump - return details - - -def read_dump(prefix, filename): - basename = os.path.basename(filename) - try: - rank = int(basename[len(prefix):]) - except ValueError as e: - raise ValueError(f"Cannot extract rank from '{basename}' with prefix '{prefix}'.") from e - filename = get_valid_read_path(filename) - try: - with open(filename, "rb") as infile: - dump = SafeUnpickler(infile).load() - except Exception as e: - logging.error(f"Failed to load data from {filename}: {e}") - return rank, dump - - -def extract_hccl_info(recorder_dict): - """Extract HCCL related information from recorder data""" - hccl_dict = {} - for rank, recorder in recorder_dict.items(): - entries = recorder.get("entries", []) - if not entries: - continue - last_entry = entries[-1] - hccl_dict[rank] = { - "state": last_entry.get("state", None), - "record_id": last_entry.get("record_id", None), - "pg_id": last_entry.get("pg_id", None), - "time_discovered_completed_ns": last_entry.get("time_discovered_completed_ns", None), - "name": last_entry.get("frames", [{}])[0].get("name", None), - } - return hccl_dict - - -def analyze_pg_groups(hccl_dict): - """Analyze HCCL data, group by pg_id and check for problems""" - pg_groups = defaultdict(list) - for _, op in hccl_dict.items(): - pg_groups[op["pg_id"]].append(op) - - for pg_id, group in pg_groups.items(): - scheduled_ops = [op for op in group if op["state"] == "scheduled"] - completed_ops = [op for op in group if op["state"] == "completed"] - # Case 1: All NPUs are scheduled and have the same record_id and name - if len(scheduled_ops) == len(group): - record_id = scheduled_ops[0]["record_id"] - name = scheduled_ops[0]["name"] - if all(op["record_id"] == record_id and op["name"] == name for op in scheduled_ops): - logging.info( - f"The pg_id {pg_id}'s Communication Operator {name} " - "executed too slowly, causing the HCCL to time out." - ) - continue - - # Case 2: There is a completed operator and its record_id is 1 less than other scheduled operators - if completed_ops and scheduled_ops: - completed_op = completed_ops[0] - scheduled_record_id = scheduled_ops[0]["record_id"] - if completed_op["record_id"] == scheduled_record_id - 1: - logging.info( - f"The pg_id {pg_id}'s rank {completed_op['pg_id']}'s " - "Computational task took too long, causing the other ranks' " - "HCCL task to time out." - ) - continue - - # Case 3: All operators are completed - if not scheduled_ops and completed_ops: - latest_op = max(completed_ops, key=lambda x: x["time_discovered_completed_ns"] or 0) - logging.info( - f"The computational task of the pg_id {pg_id} " - f"after the communication operator {latest_op['name']} " - "took too long." 
- ) - continue - - # Unrecognized cases - logging.info(f"The situation for pg_id {pg_id} cannot be recognized!") - - -def main(): - # Parsing command line arguments with argparse - parser = argparse.ArgumentParser(description="PyTorch Flight recorder analyzing script.") - parser.add_argument( - "trace_dir", - type=str, - help="Directory containing one trace file per rank, named with _.", - ) - parser.add_argument( - "-p", - "--prefix", - type=str, - help=( - "Common filename prefix to strip such that rank can be extracted. " - "If not specified, will attempt to infer a common prefix." - ), - default=None, - ) - args = parser.parse_args() - - path = get_valid_read_path(args.trace_dir, is_dir=True) - - recorder_dict = load_recorder_data(path, args.prefix) - if not recorder_dict: - logging.error("No valid recorder data found.") - return - - # Extract HCCL information - hccl_dict = extract_hccl_info(recorder_dict) - - # Analyzing HCCL data - analyze_pg_groups(hccl_dict) + details, version = read_dir(args) + db = build_db(details, args, version) + if args.output: + args.output = get_valid_write_path(args.output) + with open(args.output, "wb") as f: + pickle.dump((types, db), f) if __name__ == "__main__": diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 13ebc5e244615e133e6c0970b63d57a602143a48..9b1d869d8584aecacf5129bf74c67d17a2df5c1c 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -1,3 +1,5 @@ +__all__ = ["erase_stream", "matmul_checksum"] + import os import sys import types @@ -58,8 +60,10 @@ from torch_npu.utils import _register_ops_under_dtensor_rules from torch_npu.utils.exposed_api import public_npu_functions from torch_npu.distributed.checkpoint.checkpoint import _apply_dcp_patch from torch_npu.npu._stream_check import apply_sanitizer_patch +from torch_npu.npu.utils import _erase_stream as erase_stream from torch_npu.utils._error_code import ErrCode, pta_error, _except_handler from torch_npu.asd.asd import _asd_patch +from torch_npu.asd.checksum import _matmul_checksum as matmul_checksum from torch_npu._C._distributed_c10d import ParallelStore from torch_npu.op_plugin.meta import _meta_registrations from torch_npu.version import __version__ as __version__ @@ -70,9 +74,6 @@ del _op_plugin_docs _cann_package_check() -__all__ = [] - - def _wrap_torch_error_func(func): @wraps(func) def wrapper(*args, **kwargs): @@ -162,6 +163,8 @@ def _apply_distributed_methods_patch(): torch.distributed.distributed_c10d.gather_object = torch_npu.distributed.distributed_c10d._gather_object torch.distributed.is_hccl_available = torch_npu.distributed.is_hccl_available torch.distributed.reinit_process_group = torch_npu.distributed.reinit_process_group + torch.distributed.distributed_c10d.rendezvous = torch_npu.distributed.distributed_c10d._trigger_rendezvous_decorator(torch.distributed.distributed_c10d.rendezvous) + torch.distributed.launcher.api._get_addr_and_port = torch_npu.distributed.distributed_c10d._trigger__get_addr_and_port_decorator(torch.distributed.launcher.api._get_addr_and_port) torch.utils.rename_privateuse1_backend("npu") @@ -243,6 +246,7 @@ def _npu_shutdown(): torch_npu.distributed.distributed_c10d._destructor_process_group() torch_npu._C._npu_shutdown(success) _except_handler.handle_exception() + torch_npu.asd.asd.matmul_check._cleanup() # register npu shutdown hook on exit diff --git a/torch_npu/_logging/_internal.py b/torch_npu/_logging/_internal.py index c27a2aa3c68e9aaf87d4ba3396f7f265044ad24b..59d0fd9f14acd11fbdc57d946770e3835cba5e2c 100644 --- 
a/torch_npu/_logging/_internal.py +++ b/torch_npu/_logging/_internal.py @@ -36,3 +36,4 @@ def _add_logging_module(): torch._logging._internal.register_log("memory", "torch_npu.memory") torch._logging._internal.register_log("dispatch", "torch_npu.dispatch") torch._logging._internal.register_log("silent", "torch_npu.silent_check") + torch._logging._internal.register_log("op_plugin", "torch_npu.op_plugin") diff --git a/torch_npu/_op_plugin_docs.py b/torch_npu/_op_plugin_docs.py deleted file mode 100644 index 6eab7dcd1d3d59e181289fd0fa3d2aaf337eadbe..0000000000000000000000000000000000000000 --- a/torch_npu/_op_plugin_docs.py +++ /dev/null @@ -1 +0,0 @@ -# The file will be forcibly overwritten. Please do not add any content here diff --git a/torch_npu/asd/_silent_fault_data.py b/torch_npu/asd/_silent_fault_data.py index b6428b6f5ed8c228b861ebe528af55fd0585966e..543f58f20fbc354a503ae5ee9770761f0d32ef05 100644 --- a/torch_npu/asd/_silent_fault_data.py +++ b/torch_npu/asd/_silent_fault_data.py @@ -34,10 +34,3 @@ class SilentFaultDataV2: self.step_tensor = torch.zeros(1, dtype=torch.int64).npu() self.check_tensor = torch.zeros(3, dtype=torch.float).npu() self.upper_thresh, self.sigma_thresh = get_thresh() - - -class SilentFaultDataV3: - def __init__(self): - self.step_tensor = torch.zeros(1, dtype=torch.int64, device="npu") - self.avg_tensor = None - self.upper_thresh, self.sigma_thresh = get_thresh() diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index bae88bf7089dbed2f66dbbff09c4a37751a7c71a..47486142336c06040d66792eb2f399285f85635b 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -1,15 +1,24 @@ import os +from functools import wraps, partial +import logging +import time +import warnings +import threading +import math import torch from torch.nn.functional import layer_norm as origin_layernorm from torch.nn.functional import embedding as origin_embedding import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error -from ._silent_fault_data import SilentFaultData, SilentFaultDataV2, SilentFaultDataV3 +from ._silent_fault_data import SilentFaultData, SilentFaultDataV2 __all__ = [] +loggerSilent = logging.getLogger("torch_npu.silent_check") + + def _Singleton(cls): _instances = {} @@ -113,7 +122,9 @@ class _SilentFaultDetectorV2: self.silent_data_dict = dict() self.min_step = 100 - def silent_fault_check(self, idx, asd_enable, grad): + def silent_fault_check(self, idx, asd_flag, grad): + if grad is None: + return if grad.dtype != torch.bfloat16 and grad.dtype != torch.float32: return @@ -125,35 +136,688 @@ class _SilentFaultDetectorV2: sfda = self.silent_data_dict[idx] torch_npu._npu_silent_check_v2(val, grad, sfda.check_tensor, sfda.step_tensor, self.min_step, sfda.upper_thresh[0], - sfda.sigma_thresh[0], sfda.upper_thresh[1], sfda.sigma_thresh[1], asd_enable) + sfda.sigma_thresh[0], sfda.upper_thresh[1], sfda.sigma_thresh[1], asd_flag) _silent_fault_detector_v2 = _SilentFaultDetectorV2() +IS_IN_BACKWARD = False -@_Singleton -class _SilentFaultDetectorV3: - def __init__(self): - self.silent_data_dict = dict() - self.beta1 = 0.99 +def _input_hook(idx, asd_flag): + def hook(grad): + global IS_IN_BACKWARD + loggerSilent.debug(f"input_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to False. 
idx is {idx}, flag is {asd_flag}") + IS_IN_BACKWARD = False + torch_npu._C._npu_set_call_state("forward") + _silent_fault_detector_v2.silent_fault_check(idx, asd_flag, grad) + return + return hook - def silent_fault_check(self, idx, asd_enable, grad): - if grad.dtype != torch.bfloat16 and grad.dtype != torch.float32: - return - val = torch.norm(grad, float('inf')).pow(2).view(-1) +def _output_hook(grad): + global IS_IN_BACKWARD + loggerSilent.debug(f"output_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to True.") + IS_IN_BACKWARD = True + torch_npu._C._npu_set_call_state("backward") + return grad - if idx not in self.silent_data_dict: - self.silent_data_dict[idx] = SilentFaultDataV3() - self.silent_data_dict[idx].avg_tensor = grad.pow(2).max().view(-1) - grad_max = self.silent_data_dict[idx].avg_tensor - else: - grad_max = val - sfda = self.silent_data_dict[idx] +def _is_inner_module(module): + return len(module._modules) == 0 + + +class _SilentCheckState: + def __init__(self): + self.init_param() + self.init_marks = {} + self.weight_hook_handles = {} + self.last_weight_hook_handles = {} + self.dtype_support = True + self.check_enable = 0 + + def set_check_enable(self, enable): + self.check_enable = enable + + def get_check_enable(self): + return self.check_enable + + def init_param(self): + self.first_forward = True + self.input_hook_flag = False + self.is_training = False + self.first_module_id = "" + self.first_weight = None + self.first_weight_id = None + self.last_weight = None + self.last_weight_id = None + + def init_module_info(self, module_id, training): + self.first_module_id = module_id + self.first_forward = False + self.is_training = training + if self.is_training: + torch_npu._C._npu_set_module_train_state("train") + else: + torch_npu._C._npu_set_module_train_state("infer") - torch_npu._npu_silent_check_v3(val, grad, sfda.step_tensor, grad_max, sfda.avg_tensor, - sfda.upper_thresh[0], sfda.upper_thresh[1], self.beta1, asd_enable) + def check_tensor_dtype(self, tensor): + if not self.dtype_support: + return + if isinstance(tensor, torch.Tensor) and tensor.requires_grad and tensor.dtype == torch.float16: + self.dtype_support = False + + def check_dtype(self, module, *args): + for x in args: + self.check_tensor_dtype(x) + for _, param in module._parameters.items(): + self.check_tensor_dtype(param) + + def search_first_weight(self, module): + # Search the first weight + if not self.init_marks.get(self.first_module_id, False) and self.first_weight is None: + for _, param in module._parameters.items(): + if isinstance(param, torch.Tensor) and param.requires_grad: + self.first_weight = param + self.first_weight_id = id(param) + break + + def search_last_weight(self, module): + # Search the last weight (only in inner module) + if not self.init_marks.get(self.first_module_id, False) and _is_inner_module(module): + for _, param in module._parameters.items(): + if isinstance(param, torch.Tensor) and param.requires_grad: + self.last_weight = param + self.last_weight_id = id(param) + + def init_all_hook(self): + if self.is_training: + if self.last_weight is not None and self.first_weight is not None: + # Otherwise, there is only one weight in the outer module + if self.first_weight_id != self.last_weight_id: + loggerSilent.debug(f"init_all_hook: module init, first_module_id is {self.first_module_id}.") + if self.last_weight_hook_handles.get(self.first_module_id, None) is None: + last_weight_handle = self.last_weight.register_hook(_output_hook) + 
self.last_weight_hook_handles[self.first_module_id] = last_weight_handle + if self.weight_hook_handles.get(self.first_module_id, None) is None: + first_weight_handle = self.first_weight.register_hook(_input_hook(self.first_module_id, self.check_enable)) + self.weight_hook_handles[self.first_module_id] = first_weight_handle + else: + loggerSilent.debug(f"init_all_hook: module only have one weight, first_module_id is {self.first_module_id}.") + self.init_marks[self.first_module_id] = True + + +silent_check = _SilentCheckState() + + +def _silent_check_decorator(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + global silent_check + global IS_IN_BACKWARD + + if not torch.npu.is_initialized(): + return func(self, *args, **kwargs) + + if silent_check.get_check_enable() and not IS_IN_BACKWARD: + if silent_check.first_forward: + silent_check.init_module_info(id(self), self.training) + self.outer = True + + if silent_check.is_training and not silent_check.init_marks.get(silent_check.first_module_id, False): + silent_check.check_dtype(self, *args) + if not silent_check.dtype_support: + for value in silent_check.weight_hook_handles.values(): + if value is not None: + value.remove() + for value in silent_check.last_weight_hook_handles.values(): + if value is not None: + value.remove() + silent_check.set_check_enable(0) + warnings.warn(f"Warning: Module has unsupported dtype tensor, silent check will be closed.") + + tmp = func(self, *args, **kwargs) + + if silent_check.get_check_enable() and silent_check.is_training and not IS_IN_BACKWARD: + # Search the first weight + silent_check.search_first_weight(self) + + # Search the last weight (only in inner module) + silent_check.search_last_weight(self) + + if silent_check.get_check_enable() and not IS_IN_BACKWARD: + if hasattr(self, "outer") and self.outer: + silent_check.init_all_hook() + silent_check.init_param() + self.outer = False + + return tmp + return wrapper + + +class _MatmulSilentCheck: + def __init__(self): + self.init_param() + self.init_marks = {} + self.check_stat = {} + self.hook_dict = {} + self.registered_modules = [] + self.visited_modules_id = [] + self.matmul_hook_enable = 0 + self.matmul_with_bf16 = False + self.statistic_value = None + self.is_outer_call = True + # link to checksum + self.matmul_trigger = False + self.checksum_enable = False + self.checksum_result = None + self.checksum_state = None + self.checksum_state_thread_running = False + self.checksum_state_thread = None + # Use another thread to receive the statistic value and detect SDC + self.check_thread_running = False + self.check_thread = None + self._lock = None + self.queue_len = 1024 + self.statistic_cpu_value = None + self.name_list = ["" for _ in range(self.queue_len)] + self.head_index = 0 + self.tail_index = 0 + self.history_abnormal_list = [] + # Parameter filtering + self.filter_index = -1 + self.filter_interval = 3 + self.invalid_grad_sum = 0 + # Threshold + self.with_checksum = False + self.cooldown = 5 # default 5 min cooldown + self.strikes_num = 3 # default 3 times + self.strikes_window = 480 # default 480 min + self.checksum_cooldown = 180 # default 180 min + self.upper_thresh1 = 1000000 # default 1000000 + self.upper_thresh2 = 100 # default 100 + self.store = None + self.rank = None + + def init_param(self): + self.first_forward = True + self.is_training = False + self.first_module_id = "" + + def init_module_info(self, module_id, training): + self.first_module_id = module_id + self.first_forward = False + self.is_training = training + + def 
set_matmul_hook_enable(self, enable): + self.matmul_hook_enable = enable + + def get_matmul_hook_enable(self): + return self.matmul_hook_enable + + def set_with_checksum(self, enable): + self.with_checksum = enable + + def get_with_checksum(self): + return self.with_checksum + + def set_cooldown(self, cooldown): + self.cooldown = cooldown + + def get_cooldown(self): + return self.cooldown + + def set_strikes_num(self, strikes_num): + self.strikes_num = strikes_num + + def get_strikes_num(self): + return self.strikes_num + + def set_strikes_window(self, strikes_window): + self.strikes_window = strikes_window + + def get_strikes_window(self): + return self.strikes_window + + def set_checksum_cooldown(self, checksum_cooldown): + self.checksum_cooldown = checksum_cooldown + + def get_checksum_cooldown(self): + return self.checksum_cooldown + + def set_upper_thresh1(self, upper_thresh1): + self.upper_thresh1 = upper_thresh1 + + def get_upper_thresh1(self): + return self.upper_thresh1 + + def set_upper_thresh2(self, upper_thresh2): + self.upper_thresh2 = upper_thresh2 + + def get_upper_thresh2(self): + return self.upper_thresh2 + + def set_grad_sample_interval(self, grad_sample_interval): + self.filter_interval = grad_sample_interval + + def get_grad_sample_interval(self): + return self.filter_interval + + @property + def lock(self): + if self._lock is None: + self._lock = threading.Lock() + return self._lock + + def init_stream(self): + if self.statistic_cpu_value is None: + self.statistic_value = torch.tensor(0., device=f"npu:{torch_npu.npu.current_device()}") + self.checksum_state = 0 + self.statistic_cpu_value = torch.zeros((self.queue_len,), device='cpu', dtype=torch.float32).pin_memory() + self.statistic_cpu_value.fill_(-1) + if self.store is None: + if torch.distributed.is_initialized(): + self.store = torch.distributed.distributed_c10d._get_default_store() + self.rank = torch.distributed.get_rank() + if self.rank == 0: + for i in range(1, torch.distributed.get_world_size()): + self.store.set(f"rank_{i}_info_log", "") + self.store.set(f"rank_{i}_warn_log", "") + + def parameter_filtering(self): + self.filter_index = (self.filter_index + 1) % self.filter_interval + return self.filter_index == 0 + + def register_module_hook(self, module, name): + self.check_stat[name + "_backward"] = {'avg': 0, 'pre_val': 0, 'step': 0, 'none_zero_step': 0} + hook = partial(self.module_hook, name=name + "_backward") + self.hook_dict[name + "_backward"] = module.register_full_backward_hook(hook) + self.registered_modules.append(name) + + def module_hook(self, module, grad_input, grad_output, name): + for _, param in module.named_parameters(): + if param.dim() >= 2: + if param.grad is not None: + self._detect_grad(param.grad.detach(), name) + self.invalid_grad_sum = 0 + elif hasattr(param, 'main_grad') and param.main_grad is not None: + self._detect_grad(param.main_grad.detach(), name) + self.invalid_grad_sum = 0 + else: + self.invalid_grad_sum += 1 + if self.invalid_grad_sum > max(10, len(self.registered_modules)): + warnings.warn(f"There is no available grad for detection, and the silent check feature may not take effect.") + self.invalid_grad_sum = 0 + + def _detect_grad(self, grad, name): + if grad.dtype != torch.bfloat16 and grad.dtype != torch.float32: + return + if self.matmul_hook_enable >= 1: + with torch.no_grad(): + self.statistic_value.fill_(torch.pow(torch.norm(grad, float('inf')), 2).detach().float()) + + #Asynchronously copy the value to host + self.lock.acquire() + 
self.statistic_cpu_value[self.tail_index].copy_(self.statistic_value.data, non_blocking=True) + self.name_list[self.tail_index] = name + self.tail_index = (self.tail_index + 1) % self.queue_len + self.lock.release() + if self.tail_index == self.head_index: + # The queue is full, synchronize to empty the queue + torch_npu.npu.synchronize() + + def _async_detect(self): + while self.check_thread_running: + if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized(): + break + time.sleep(10) + if not self.check_thread_running: + return + local_rank = os.getenv("LOCAL_RANK", "-1") + if local_rank.isdigit(): + torch.npu.set_device(int(local_rank)) + + while self.check_thread_running: + self.lock.acquire() + val = self.statistic_cpu_value[self.head_index].item() + name = self.name_list[self.head_index] + while val != -1 and name != "": + loggerSilent.debug(f"[silent data] name:{name}, val: {val}, pre_val: {self.check_stat[name]['pre_val']}, avg: {self.check_stat[name]['avg']}, bp time: {self.check_stat[name]['step']}, none_zero_step: {self.check_stat[name]['none_zero_step']}") + result, self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'] = self._silent_check( + val, self.check_stat[name]['pre_val'], self.check_stat[name]['avg'], self.check_stat[name]['none_zero_step'], + self.upper_thresh1, self.upper_thresh2 + ) + + if result: + current_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + new_abnormal = {'time_str': current_time, + 'time': time.time(), + 'name': name, + 'rank': self.rank, + 'val': val, + 'pre_val': self.check_stat[name]['pre_val'], + 'avg': self.check_stat[name]['avg'], + 'step': self.check_stat[name]['step'], + 'none_zero_step': self.check_stat[name]['none_zero_step'], + 'counted': True, + 'striked': False} + self._abnormal_process(new_abnormal) + self.check_stat[name]['step'] += 1 + self.check_stat[name]['pre_val'] = val + + self.statistic_cpu_value[self.head_index].fill_(-1) + self.name_list[self.head_index] = "" + self.head_index = (self.head_index + 1) % self.queue_len + val = self.statistic_cpu_value[self.head_index].item() + name = self.name_list[self.head_index] + + self.lock.release() + time.sleep(0.1) + + def _silent_check(self, val, pre_val, avg, none_zero_step, alpha1=1e6, alpha2=1e2): + if val == 0: + return False, avg, none_zero_step + elif math.isnan(val) or math.isinf(val): + return True, avg, none_zero_step + else: + if none_zero_step >= 10 and avg != 0: + thres = avg * alpha1 / (1 - 0.99 ** none_zero_step) + thres2 = avg * alpha2 / (1 - 0.99 ** none_zero_step) + else: + thres = val + thres2 = val + if val > thres and abs(val - pre_val) > thres: + return True, avg, none_zero_step + else: + if val <= thres2: + none_zero_step += 1 + avg = avg * 0.99 + val * 0.01 + return False, avg, none_zero_step + + def _abnormal_process(self, new_abnormal): + counting_abnormal_pos = [] + i = len(self.history_abnormal_list) - 1 + if i < 0: + self._generate_event_log(new_abnormal) + self.history_abnormal_list.append(new_abnormal) + if self.strikes_num == 1: + self._generate_warning_log(counting_abnormal_pos, new_abnormal) + new_abnormal['striked'] = True + if self.with_checksum: + self.checksum_state = 1 + if not self.matmul_with_bf16: + warnings.warn(f"Warning: Module has no supported dtype grad, checksum will not to be linked.") + return + while i >= 0: + old_abnormal = self.history_abnormal_list[i] + old_time = old_abnormal['time'] + new_time = new_abnormal['time'] + if old_abnormal['counted'] and abs(new_time - 
old_time) >= self.cooldown * 60: + # A new counted abnormal + self._generate_event_log(new_abnormal) + if self.strikes_num == 1: + self._generate_warning_log(counting_abnormal_pos, new_abnormal) + new_abnormal['striked'] = True + if self.with_checksum: + self.checksum_state = 1 + if not self.matmul_with_bf16: + warnings.warn(f"Warning: Module has no supported dtype grad, checksum will not to be linked.") + break + counting_abnormal_pos.append(i) + i -= 1 + while i >= 0: + old_abnormal = self.history_abnormal_list[i] + if old_abnormal['counted'] and not old_abnormal['striked']: + counting_abnormal_pos.append(i) + if len(counting_abnormal_pos) == self.strikes_num - 1: + break + i -= 1 + if len(counting_abnormal_pos) == self.strikes_num - 1 and abs(new_abnormal['time'] - old_abnormal['time']) <= self.strikes_window * 60: + # Three strikes + self._generate_warning_log(counting_abnormal_pos, new_abnormal) + for index in counting_abnormal_pos: + self.history_abnormal_list[index]['striked'] = True + new_abnormal['striked'] = True + + if self.with_checksum: + self.checksum_state = 1 + if not self.matmul_with_bf16: + warnings.warn(f"Warning: Module has no supported dtype grad, checksum will not to be linked.") + break + elif not old_abnormal['counted']: + # Keep tracing the last counted abnormal + i -= 1 + else: + # A new not-counted abnormal + new_abnormal['counted'] = False + break + self.history_abnormal_list.append(new_abnormal) + # remove expired exception + current_time = time.time() + first_expired_index = 0 + for abnormal in self.history_abnormal_list: + if abs(current_time - abnormal['time']) <= self.strikes_window * 60: + break + first_expired_index += 1 + if first_expired_index > 0: + del self.history_abnormal_list[:first_expired_index] + + def _generate_event_log(self, new_abnormal): + info_str = f"[Event][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: A grad-norm spike may happen, " + info_str = info_str + f"param name {new_abnormal['name']}, abnormal value {new_abnormal['val']}, previous value {new_abnormal['pre_val']}, " + info_str = info_str + f"history avg {new_abnormal['avg']}, bp time {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." + loggerSilent.info(info_str) + if self.store is not None and self.rank is not None and self.rank != 0: + current_log = self.store.get(f"rank_{self.rank}_info_log").decode() + self.store.set(f"rank_{self.rank}_info_log", current_log + "\n" + info_str if current_log != "" else info_str) + + def _generate_warning_log(self, counting_abnormal_pos, new_abnormal): + warning_str = f"[Warning][{new_abnormal['time_str']}] [Rank {new_abnormal['rank']}]: feature detection detects abnormal results!" + index = 0 + for pos in reversed(counting_abnormal_pos): + warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {self.history_abnormal_list[pos]['time_str']}, param name {self.history_abnormal_list[pos]['name']}, abnormal value {self.history_abnormal_list[pos]['val']}, previous value {self.history_abnormal_list[pos]['pre_val']}, " + warning_str = warning_str + f"history avg {self.history_abnormal_list[pos]['avg']}, bp time {self.history_abnormal_list[pos]['step']}, normal count {self.history_abnormal_list[pos]['none_zero_step']}." 
+ index += 1 + warning_str = warning_str + "\n" + f"Grad-norm spike: index {index}, time {new_abnormal['time_str']}, param name {new_abnormal['name']}, abnormal value {new_abnormal['val']}, previous value {new_abnormal['pre_val']}, " + warning_str = warning_str + f"history avg {new_abnormal['avg']}, bp time {new_abnormal['step']}, normal count {new_abnormal['none_zero_step']}." + loggerSilent.warning(warning_str) + if self.store is not None and self.rank is not None and self.rank != 0: + current_log = self.store.get(f"rank_{self.rank}_warn_log").decode() + self.store.set(f"rank_{self.rank}_warn_log", current_log + "\n" + warning_str if current_log != "" else warning_str) + + def _generate_silent_log(self): + warning_str = f"[Warning][Rank {self.rank}]: The result of Matmul checksum is abnormal!" + loggerSilent.warning(warning_str) + if self.store is not None and self.rank is not None and self.rank != 0: + current_log = self.store.get(f"rank_{self.rank}_warn_log").decode() + self.store.set(f"rank_{self.rank}_warn_log", current_log + "\n" + warning_str if current_log != "" else warning_str) + + def _tcp_comm_checksum_state(self): + while self.checksum_state_thread_running: + if hasattr(torch, "npu") and torch.npu.is_initialized() and torch.distributed.is_initialized() and self.store is not None: + break + time.sleep(10) + if not self.checksum_state_thread_running: + return + local_rank = os.getenv("LOCAL_RANK", "-1") + self.rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + if local_rank.isdigit(): + torch.npu.set_device(int(local_rank)) + + last_checksum_time = None + if self.rank == 0: + self.store.add('counter2', world_size) + while self.checksum_state_thread_running: + if self.rank == 0: + for i in range(1, world_size): + msg = self.store.get(f"rank_{i}_warn_log").decode() + if msg != "": + loggerSilent.warning(msg) + self.store.set(f"rank_{i}_warn_log", "") + msg = self.store.get(f"rank_{i}_info_log").decode() + if msg != "": + loggerSilent.info(msg) + self.store.set(f"rank_{i}_info_log", "") + + if not self.with_checksum or not self.matmul_with_bf16: + time.sleep(10) + continue + + self.store.add('checksum_state', self.checksum_state) + if self.rank == 0: + self.store.add('counter2', 0 - world_size) + self.store.add('counter', 1) + + while int(self.store.get('counter').decode()) < world_size and self.checksum_state_thread_running: + time.sleep(0.1) + + global_state = int(self.store.get('checksum_state').decode()) + if global_state: + now_time = time.time() + if last_checksum_time is None or abs(now_time - last_checksum_time) > self.checksum_cooldown * 60: + loggerSilent.info(f'[Info] Rank {self.rank}: feature detection detects abnormal results, checksum is on.') + last_checksum_time = now_time + if self.checksum_result is None: + self.checksum_result = torch.tensor(False, dtype=torch.bool, device='npu') + else: + self.checksum_result.fill_(False) + self.checksum_enable = True + time.sleep(self.cooldown * 60) + if self.checksum_result: + self._generate_silent_log() + self.checksum_enable = False + loggerSilent.info(f'[Info] Rank {self.rank}: checksum is off') + self.checksum_state = 0 + self.store.add('counter2', 1) + + while int(self.store.get('counter2').decode()) < world_size and self.checksum_state_thread_running: + time.sleep(0.1) + + if self.rank == 0: + self.store.add('checksum_state', 0 - global_state) + self.store.add('counter', 0 - world_size) + + time.sleep(10) + + def __getstate__(self): + self._cleanup() + state = self.__dict__.copy() + 
state['_lock'] = None + state['store'] = None + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.store = None + + def _startup(self): + if not self.check_thread_running: + self.check_thread_running = True + self.check_thread = threading.Thread( + target=self._async_detect, + daemon=True + ) + self.check_thread.start() + + if not self.checksum_state_thread_running: + self.checksum_state_thread_running = True + self.checksum_state_thread = threading.Thread( + target=self._tcp_comm_checksum_state, + daemon=True + ) + self.checksum_state_thread.start() + + def _cleanup(self): + if self.check_thread_running: + self.check_thread_running = False + self.check_thread.join() + self.check_thread = None + + if self.checksum_state_thread_running: + self.checksum_state_thread_running = False + self.checksum_state_thread.join() + self.checksum_state_thread = None + + +matmul_check = _MatmulSilentCheck() + + +def _trigger_matmul_decorator(func): + @wraps(func) + def wrapper(a, b, *args, **kwargs): + global matmul_check + result = func(a, b, *args, **kwargs) + if matmul_check.checksum_enable and a.dtype == torch.bfloat16 and b.dtype == torch.bfloat16: + checksum = torch_npu.matmul_checksum(a, b, result) + matmul_check.checksum_result.logical_or_(checksum) + return result + return wrapper + + +def _trigger_tensor_matmul_decorator(func): + @wraps(func) + def wrapper(self, other): + global matmul_check + result = func(self, other) + if matmul_check.checksum_enable and other.dtype == torch.bfloat16 and self.dtype == torch.bfloat16: + checksum = torch_npu.matmul_checksum(self, other, result) + matmul_check.checksum_result.logical_or_(checksum) + return result + return wrapper + + +def _matmul_silent_check_decorator(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + global matmul_check + + if not torch.npu.is_initialized(): + return func(self, *args, **kwargs) + + if matmul_check.get_matmul_hook_enable() and matmul_check.first_forward: + matmul_check.init_stream() + matmul_check.init_module_info(id(self), self.training) + self.matmul_check_outer = True + + matmul_check._startup() + if matmul_check.with_checksum and not matmul_check.matmul_trigger: + original_matmul = torch.matmul + original_tensor_matmul = torch.Tensor.matmul + torch_npu.asd.checksum.matmul = original_matmul + torch.matmul = _trigger_matmul_decorator(original_matmul) + torch.Tensor.matmul = _trigger_tensor_matmul_decorator(original_tensor_matmul) + matmul_check.matmul_trigger = True + + if matmul_check.is_training and not matmul_check.init_marks.get(matmul_check.first_module_id, False): + for name, module in self.named_modules(): + if matmul_check.get_matmul_hook_enable() == 0: + break + if len(module._modules) == 0 and name not in matmul_check.registered_modules and id(module) not in matmul_check.visited_modules_id: + matmul_check.visited_modules_id.append(id(module)) + for _, param in module.named_parameters(): + if not isinstance(param, torch.Tensor) or param.dim() < 2: + continue + if matmul_check.parameter_filtering(): + matmul_check.register_module_hook(module, name) + # check dtype + if param.dtype == torch.float16: + for value in matmul_check.hook_dict.values(): + if value is not None: + value.remove() + matmul_check.set_matmul_hook_enable(0) + break + if param.dtype == torch.bfloat16: + matmul_check.matmul_with_bf16 = True + + matmul_check.init_marks[matmul_check.first_module_id] = True + + tmp = func(self, *args, **kwargs) + + if matmul_check.get_matmul_hook_enable(): + if hasattr(self,
"matmul_check_outer") and self.matmul_check_outer: + matmul_check.init_param() + self.matmul_check_outer = False -_silent_fault_detector_v3 = _SilentFaultDetectorV3() + return tmp + return wrapper diff --git a/torch_npu/asd/checksum.py b/torch_npu/asd/checksum.py new file mode 100644 index 0000000000000000000000000000000000000000..a9576675cb071d5d71e03843773026d62a6a8345 --- /dev/null +++ b/torch_npu/asd/checksum.py @@ -0,0 +1,50 @@ +__all__ = [] + +import math +import torch +from torch import matmul +import torch_npu +from torch_npu.utils._error_code import ErrCode, pta_error + + +def _matmul_checksum(a, b, c): + r""" + Compare whether there are any feature anomalies in the calculation results of matmul. + Args: + a(Tensor): matmul's input parameter a, and the device must be npu. + b(Tensor): matmul's input parameter b, and the device must be npu. + c(Tensor): matmul's output result c, and the device must be npu. + + Returns: The bool scalar tensor, located on the npu side, indicates whether there are any anomalies in the calculation result. + + """ + if not isinstance(a, torch.Tensor) or a.device.type != 'npu': + raise TypeError(f"tensor should be torch.Tensor, and device type should be npu" + pta_error(ErrCode.PARAM)) + if not isinstance(b, torch.Tensor) or b.device.type != 'npu': + raise TypeError(f"tensor should be torch.Tensor, and device type should be npu" + pta_error(ErrCode.PARAM)) + if not isinstance(c, torch.Tensor) or c.device.type != 'npu': + raise TypeError(f"tensor should be torch.Tensor, and device type should be npu" + pta_error(ErrCode.PARAM)) + + c_sum = torch.sum(c, dim=-1, dtype=torch.float32) + b1 = torch.sum(b, dim=-1, keepdim=True, dtype=torch.float32) + c1 = matmul(a.to(torch.float32), b1) + c1_trans = c1.squeeze(-1) + n_b = b.shape[-1] + + c_max, _ = torch.max(torch.abs(c), dim=-1) + c_mean = torch.mean(torch.abs(c), dim=-1) + if torch.min(c_max / c_mean) > 5: + c_ele_round_error_accum = c_max * 2 ** (-8) * math.sqrt(n_b) + else: + c_ele_round_error_accum = c_mean * 2 ** (-8) * n_b + + error_total = (c_ele_round_error_accum).to(torch.float) + + error = torch.abs(c_sum - c1_trans) + flag = (error - 5 * error_total) > 5 * 1e-20 + any_flag = torch.any(flag) + if any_flag: + matmul(a, b, out=c) + c_mean2 = torch.mean(torch.abs(c), dim=-1) + return torch.any(c_mean != c_mean2) + return any_flag diff --git a/torch_npu/contrib/__init__.py b/torch_npu/contrib/__init__.py index 78de1a292bf8713101e1741c837bfde01af08e20..7d736e2a232e397ae6b96e3ad159e9da2687fb6b 100644 --- a/torch_npu/contrib/__init__.py +++ b/torch_npu/contrib/__init__.py @@ -9,11 +9,6 @@ from .module import ChannelShuffle, Prefetcher, LabelSmoothingCrossEntropy, ROIA __all__ = [ # from function - "npu_iou", - "npu_ptiou", - "npu_giou", - "npu_diou", - "npu_ciou", "npu_multiclass_nms", "npu_batched_multiclass_nms", "npu_single_level_responsible_flags", @@ -32,13 +27,8 @@ __all__ = [ "Prefetcher", "LabelSmoothingCrossEntropy", "ROIAlign", - "DCNv2", "ModulatedDeformConv", - "Mish", - "BiLSTM", "PSROIPool", - "SiLU", - "Swish", "NpuFairseqDropout", "NpuCachedDropout", "MultiheadAttention", @@ -46,7 +36,6 @@ __all__ = [ "Focus", "LinearA8W8Quant", "LinearQuant", - "FusedColorJitter", "LinearWeightQuant", "QuantConv2d", ] diff --git a/torch_npu/contrib/function/__init__.py b/torch_npu/contrib/function/__init__.py index 9df0bb7fe41fdfbf959519c5b98783e27799bc05..1b0e61beab08102e1b86673880bd9ca8aa4f0d64 100644 --- a/torch_npu/contrib/function/__init__.py +++ b/torch_npu/contrib/function/__init__.py @@ -10,11 +10,6 
@@ from .fused_attention import npu_fused_attention_with_layernorm, npu_fused_atten from .npu_functional import dropout_with_byte_mask __all__ = [ - "npu_iou", - "npu_ptiou", - "npu_giou", - "npu_diou", - "npu_ciou", "npu_multiclass_nms", "npu_batched_multiclass_nms", "npu_single_level_responsible_flags", diff --git a/torch_npu/contrib/function/iou.py b/torch_npu/contrib/function/iou.py index e79513fa1290ffd8fc8c036995403e1af5b07115..a642a02c736cad6405ffae4e784f5c883413975f 100644 --- a/torch_npu/contrib/function/iou.py +++ b/torch_npu/contrib/function/iou.py @@ -1,9 +1,12 @@ +__all__ = [] + +import warnings + import torch import torch_npu from torch_npu.utils._error_code import ErrCode, ops_error - -__all__ = ['npu_iou', 'npu_ptiou', 'npu_giou', 'npu_diou', 'npu_ciou'] +warnings.filterwarnings(action='once', category=FutureWarning) def _box_dtype_check(box): @@ -52,6 +55,8 @@ def npu_iou(boxes1, Returns: Tensor: IoU, sized [N,M]. """ + warnings.warn("torch_npu.contrib.npu_iou is deprecated. " + "Please use torch_npu.npu_iou or torch_npu.npu_ptiou for replacement.", FutureWarning) if mode not in ["iou", "ptiou"]: raise ValueError("Expected mode in [iou, ptiou]" + ops_error(ErrCode.VALUE)) @@ -114,6 +119,8 @@ def npu_giou(boxes1, Returns: Tensor: IoU, sized [n, 1]. """ + warnings.warn("torch_npu.contrib.npu_giou is deprecated. " + "Please use torch_npu.npu_giou for replacement.", FutureWarning) if boxes1.shape != boxes2.shape: raise ValueError("Expected boxes1.shape == boxes2.shape" + ops_error(ErrCode.VALUE)) @@ -173,6 +180,8 @@ def npu_diou(boxes1, Returns: Tensor: IoU, sized [1, n]. """ + warnings.warn("torch_npu.contrib.function.npu_diou is deprecated. " + "Please use torch_npu.npu_diou for replacement.", FutureWarning) out = torch_npu.npu_diou(boxes1, boxes2, trans, is_cross, mode) @@ -224,6 +233,8 @@ def npu_ciou(boxes1, Tensor: IoU, sized [1, n]. """ + warnings.warn("torch_npu.contrib.function.npu_ciou is deprecated. " + "Please use torch_npu.npu_ciou for replacement.", FutureWarning) out = torch_npu.npu_ciou(boxes1, boxes2, trans, is_cross, mode, True) diff --git a/torch_npu/contrib/module/__init__.py b/torch_npu/contrib/module/__init__.py index 5ed62595f3f2c373830b5d4ab6cc627e4189647c..ead627f4f78aca87924958b6296175f2ef73e918 100644 --- a/torch_npu/contrib/module/__init__.py +++ b/torch_npu/contrib/module/__init__.py @@ -25,17 +25,11 @@ __all__ = [ "Prefetcher", "LabelSmoothingCrossEntropy", "ROIAlign", - "DCNv2", "ModulatedDeformConv", - "Mish", - "BiLSTM", "PSROIPool", - "SiLU", - "Swish", "NpuFairseqDropout", "NpuCachedDropout", "MultiheadAttention", - "FusedColorJitter", "NpuDropPath", "Focus", "LinearA8W8Quant", diff --git a/torch_npu/contrib/module/activations.py b/torch_npu/contrib/module/activations.py index 91faa93944b846dc4fa7fc8a588244f53606f7d6..4167f7d0953d354996840cf388d576dcbc7980fa 100644 --- a/torch_npu/contrib/module/activations.py +++ b/torch_npu/contrib/module/activations.py @@ -1,7 +1,11 @@ +import warnings + import torch import torch.nn as nn import torch_npu +warnings.filterwarnings(action='once', category=FutureWarning) + class Mish(nn.Module): def __init__(self): @@ -21,6 +25,9 @@ class Mish(nn.Module): >>> output = m(input_tensor) """ super(Mish, self).__init__() + + warnings.warn("torch_npu.contrib.module.Mish is deprecated. 
" + "Please use torch.nn.Mish for replacement.", FutureWarning) def forward(self, x): x = torch_npu.npu_mish(x) @@ -41,6 +48,9 @@ class SiLU(nn.Module): >>> output = m(input_tensor) """ super(SiLU, self).__init__() + + warnings.warn("torch_npu.contrib.module.SiLU is deprecated. " + "Please use torch.nn.SiLU for replacement.", FutureWarning) def forward(self, x): x = torch_npu.npu_silu(x) diff --git a/torch_npu/contrib/module/bidirectional_lstm.py b/torch_npu/contrib/module/bidirectional_lstm.py index 9ef9efa774bdc57d2688ea59d47aba59bf20b7be..96af0a43622b0fa34a2ed8449635b621e6bcfab0 100644 --- a/torch_npu/contrib/module/bidirectional_lstm.py +++ b/torch_npu/contrib/module/bidirectional_lstm.py @@ -1,6 +1,10 @@ +import warnings + import torch import torch_npu +warnings.filterwarnings(action='once', category=FutureWarning) + class BiLSTM(torch.nn.Module): r"""Applies an NPU compatible bidirectional LSTM operation to an input @@ -59,6 +63,8 @@ class BiLSTM(torch.nn.Module): def __init__(self, input_size, hidden_size): super(BiLSTM, self).__init__() + warnings.warn("torch_npu.contrib.BiLSTM is deprecated. " + "Please check document for replacement.", FutureWarning) self.fw_rnn = torch.nn.LSTM(input_size, hidden_size, bidirectional=False) self.bw_rnn = torch.nn.LSTM(input_size, hidden_size, bidirectional=False) diff --git a/torch_npu/contrib/module/deform_conv.py b/torch_npu/contrib/module/deform_conv.py index 32628dd84859b823cff30f249577f42178997fed..3a49bedc7d475ca1c64cf7b76d866bdfb6818d39 100644 --- a/torch_npu/contrib/module/deform_conv.py +++ b/torch_npu/contrib/module/deform_conv.py @@ -8,7 +8,6 @@ import torch_npu __all__ = [ "ModulatedDeformConv2dFunction", "ModulatedDeformConv", - "DCNv2" ] diff --git a/torch_npu/contrib/module/fusedcolorjitter.py b/torch_npu/contrib/module/fusedcolorjitter.py index 2e73c5335594386675ab93205866561dce744ca3..3f9a5e2487568f65faf7832cbe9333d8a6c8c228 100644 --- a/torch_npu/contrib/module/fusedcolorjitter.py +++ b/torch_npu/contrib/module/fusedcolorjitter.py @@ -1,3 +1,6 @@ +__all__ = [] + +import warnings import random from math import sin, cos, pi import numbers @@ -6,9 +9,7 @@ import torch from torch_npu.utils._error_code import ErrCode, ops_error -__all__ = [ - "FusedColorJitter" -] +warnings.filterwarnings(action='once', category=FutureWarning) class _FusedColorJitterApply(object): @@ -122,6 +123,8 @@ class FusedColorJitter(torch.nn.Module): def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): super().__init__() + warnings.warn("torch_npu.contrib.module.FusedColorJitter is deprecated. 
" + "Please use torchvision.transforms.ColorJitter for replacement.", FutureWarning) self.brightness = self._check_input(brightness, 'brightness') self.contrast = self._check_input(contrast, 'contrast') self.saturation = self._check_input(saturation, 'saturation') diff --git a/torch_npu/contrib/transfer_to_npu.py b/torch_npu/contrib/transfer_to_npu.py index 04564771fda60f96ce10877bf08d6c99702b8129..7236ee04abf19e0db35945e1ff69753fae3683c9 100644 --- a/torch_npu/contrib/transfer_to_npu.py +++ b/torch_npu/contrib/transfer_to_npu.py @@ -346,6 +346,10 @@ def _init(): torch.distributed.ProcessGroup._get_backend = _wrapper_cuda(torch.distributed.ProcessGroup._get_backend) torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.__init__ = \ _wrapper_cuda(torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.__init__) + torch.distributed.new_group = _wrapper_hccl(torch.distributed.new_group) + + # CUDAGraph + torch.cuda.CUDAGraph = torch.npu.NPUGraph # torch.nn.parallel.DistributedDataParallel _device_wrapper(torch.nn.parallel.DistributedDataParallel, torch_distributed_fn_white_list) diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index ea53558bebaf1d33120ab4e39c36551a4862b91e..05ef7980b701e5b54e4a2be9a74d34dc6a4ed7b7 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -6,7 +6,7 @@ #include "torch_npu/csrc/npu/Event.h" #include "torch_npu/csrc/npu/DataParallelComm.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" -#include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" +#include "torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/core/npu/CachingHostAllocator.h" @@ -67,9 +67,15 @@ PyObject* THPModule_npu_shutdown(PyObject* self, PyObject* arg) at_npu::native::CachingHostAllocator_emptyCache(); try { ASCEND_LOGI("NPU shutdown NPUCachingAllocator emptyCache."); - c10_npu::NPUCachingAllocator::emptyCache(check_error); - } catch (std::exception& e) { - ASCEND_LOGE("NPUCachingAllocator::emptyCache failed err=:%s", e.what()); + c10_npu::NPUCachingAllocator::emptyCache(false); + } catch (...) { + ASCEND_LOGE("NPUCachingAllocator::emptyCache failed"); + } + try { + ASCEND_LOGI("NPU shutdown NPUSwappedMemoryAllocator emptyCache."); + c10_npu::NPUSwappedMemoryAllocator::emptyCache(); + } catch (...) 
{ + ASCEND_LOGE("NPUSwappedMemoryAllocator::emptyCache failed"); } ASCEND_LOGI("NPU shutdown NpuSysCtrl Finalize."); diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index 2af7c3be82f3b6559e68bb71cecccd0aa17d8512..99e1746c88423222ad2f989a9014cd4912e6acd0 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -16,6 +16,7 @@ #include #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" #include "torch_npu/csrc/aten/common/ResizeNpu.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" @@ -29,6 +30,7 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" +#include "torch_npu/csrc/core/npu/NPUFunctions.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/profiler/utils.h" #endif @@ -264,7 +266,8 @@ at::Tensor NPUNativeFunctions::empty_with_format( c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, - int64_t dst_format) + int64_t dst_format, + c10::optional base_addr_aligned_kb) { #ifndef BUILD_LIBTORCH torch_npu::profiler::NPURecordFunction profiler_guard; @@ -286,11 +289,21 @@ at::Tensor NPUNativeFunctions::empty_with_format( auto dtype = c10::scalarTypeToTypeMeta(dtype_or_default(dtype_opt)); int64_t nelements = StorageDescHelper::GetMemorySize(size, format, dtype); int64_t size_bytes = nelements * dtype.itemsize(); - c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( - c10::StorageImpl::use_byte_size_t(), - c10::SymInt(size_bytes), - allocator, - true); + c10::intrusive_ptr storage_impl; + if (!base_addr_aligned_kb.has_value()) { + storage_impl = torch_npu::make_npu_storage_impl( + c10::StorageImpl::use_byte_size_t(), + c10::SymInt(size_bytes), + allocator, + true); + } else { + storage_impl = c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + static_cast(size_bytes), + c10_npu::NPUCachingAllocator::allocate_with_aligned(size_bytes, base_addr_aligned_kb.value()), + allocator, + true); + } auto tensor = at::detail::make_tensor(storage_impl, dtype); // Default NPUTensorImpl has size [0] @@ -318,9 +331,11 @@ at::Tensor NPUNativeFunctions::unsafe_empty_with_format( // the specified internal format is preserved. 
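Two allocation paths are being added around here: `empty_with_format` above now accepts an optional `base_addr_aligned_kb` and, when it is given, takes its memory from `NPUCachingAllocator::allocate_with_aligned`; a few hunks further down, `empty_with_swapped_memory` builds tensors on `NPUSwappedMemoryAllocator` and is registered as an exposed custom op in `npu_native_functions.yaml`. A hypothetical Python-side call for the latter, assuming the exposed op surfaces as `torch_npu.empty_with_swapped_memory` (only the schema is shown in this diff):

```python
import torch
import torch_npu

# Hypothetical usage on an NPU host; the declared schema is
# empty_with_swapped_memory(int[] size, *, ScalarType? dtype=None, Device? device=None).
buf = torch_npu.empty_with_swapped_memory([4, 1024], dtype=torch.float32, device="npu")
print(buf.shape, buf.dtype)   # storage comes from the swapped-memory allocator
```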
if ((!keep_format) && at_npu::native::env::CheckForbidInternalFormat()) { dst_format = static_cast(FormatHelper::GetBaseFormat(static_cast(dst_format))); + TORCH_WARN_ONCE("Cannot create tensor with interal format while allow_internel_format=False, " + "tensor will be created with base format."); } - return NPUNativeFunctions::empty_with_format(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, dst_format); + return NPUNativeFunctions::empty_with_format(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, dst_format, c10::nullopt); } at::Tensor NPUNativeFunctions::empty_with_format( @@ -408,6 +423,45 @@ at::Tensor &empty_out_npu( return result; } +at::Tensor NPUNativeFunctions::empty_with_swapped_memory( + c10::IntArrayRef size, + c10::optional dtype_opt, + c10::optional device_opt) +{ +#ifndef BUILD_LIBTORCH + torch_npu::profiler::NPURecordFunction profiler_guard; +#endif + RECORD_FUNCTION("empty_with_swapped_memory", std::vector({})); + auto device_ = device_opt.value_or(at::Device(c10::DeviceType::PrivateUse1, c10_npu::current_device())); + torch_npu::utils::torch_check_npu(device_); + torch_npu::utils::maybe_initialize_npu(device_); + TORCH_CHECK(!(at::isComplexType(dtype_or_default(dtype_opt)) && !at_npu::native::env::CheckJitDisable()), + "Current settings do not support Complex dtype. Please try again with jit_compile=False.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + check_size_nonnegative(size); + c10_npu::NPUGuard guard_(device_); + c10::Allocator *allocator = c10_npu::NPUSwappedMemoryAllocator::get(); + int64_t nelements = c10::multiply_integers(size); + auto dtype = c10::scalarTypeToTypeMeta(dtype_or_default(dtype_opt)); + int64_t size_bytes = nelements * dtype.itemsize(); + c10::intrusive_ptr storage_impl = torch_npu::make_npu_storage_impl( + c10::StorageImpl::use_byte_size_t(), + c10::SymInt(size_bytes), + allocator, + true); + + auto tensor = at::detail::make_tensor(storage_impl, dtype); + + // Default at::TensorImpl has size [0] + if (size.size() != 1 || size[0] != 0) { + tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size); + } + tensor.unsafeGetTensorImpl()->empty_tensor_restride(c10::MemoryFormat::Contiguous); + StorageDescHelper::SetDesc(tensor, size, tensor.strides()); + + return tensor; +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ blackman_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ at::Tensor NPUNativeFunctions::blackman_window( int64_t window_length, diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp index e807c226403cfd533a3150a6e7441fb82b667906..909d876109948e469ac836d09a5be16e5c9b203e 100644 --- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp @@ -60,50 +60,77 @@ static inline at::Tensor to_impl_npu( return r; } -at::Tensor NPUNativeFunctions::to( - const at::Tensor &self, +at::Tensor NPUNativeFunctions::_to_copy( + const at::Tensor& self, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, bool non_blocking, - bool copy, c10::optional optional_memory_format) { + c10::TensorOptions options_ = c10::TensorOptions() + .dtype(dtype) + .layout(layout) + .device(device); + + auto options = self.options().merge_in(options_); + + if (layout.has_value()) { + TORCH_CHECK( + self.layout() == layout.value(), + "to(options) doesn't support converting to a different layout, " + "but got self.layout being ", + self.layout(), + " and options.layout set as ", + layout.value(), OPS_ERROR(ErrCode::NOT_SUPPORT)); + } + + if (device.has_value()) { + 
options = options.device(ensure_has_index(device.value())); + } + if (optional_memory_format.has_value()) { TORCH_CHECK( optional_memory_format.value() == c10::MemoryFormat::Preserve || optional_memory_format.value() == c10::MemoryFormat::Contiguous, "Only contiguous_format or preserve_format is supported.", OPS_ERROR(ErrCode::NOT_SUPPORT)); + options = options.memory_format(optional_memory_format.value()); + } else { + if (torch_npu::utils::is_npu(self)) { + options = options.memory_format(c10::MemoryFormat::Contiguous); + } else { + // keep the same as cpu default memory format: Preserve + options = options.memory_format(c10::MemoryFormat::Preserve); + } } - - c10::TensorOptions options_ = c10::TensorOptions().dtype(dtype).layout(layout).device(device); - TORCH_CHECK( - !(options_.has_memory_format() && optional_memory_format.has_value()), - "Cannot set memory_format both in c10::TensorOptions and explicit argument; please delete " - "the redundant setter.", OPS_ERROR(ErrCode::PARAM)); - auto options = - options_.merge_in(c10::TensorOptions().memory_format(optional_memory_format)); - TORCH_CHECK( options.requires_grad_opt() == c10::nullopt, "to(options) expects unset requires_grad flag, but got " "options.requires_grad set as ", options.requires_grad(), OPS_ERROR(ErrCode::PARAM)); - TORCH_CHECK( - !options.has_layout() || self.layout() == options.layout(), - "to(options) doesn't support converting to a different layout, " - "but got self.layout being ", - self.layout(), - " and options.layout set as ", - options.layout(), OPS_ERROR(ErrCode::TYPE)); - - if (options.has_device()) { - options = options.device(ensure_has_index(options.device())); + bool pin_out = non_blocking && torch_npu::utils::is_npu(self) && options.device().is_cpu() && + (options.layout() == c10::kStrided); + + c10::MemoryFormat memory_format = options.memory_format_opt().value_or(c10::MemoryFormat::Contiguous); + if (memory_format == c10::MemoryFormat::Preserve) { + if (self.is_non_overlapping_and_dense()) { + // Copy all strides + auto r = at::empty_strided( + self.sizes(), self.strides(), options.memory_format(c10::nullopt).pinned_memory(pin_out)); + r.copy_(self, non_blocking); + return r; + } else { + memory_format = self.suggest_memory_format(); + } } - auto specified_options = self.options().merge_in(options); - return to_impl_npu(self, specified_options, non_blocking, copy); + + // See Note [Explicit nullopt c10::MemoryFormat argument] + auto r = at::empty( + self.sizes(), options.memory_format(memory_format).pinned_memory(pin_out), c10::nullopt); + r.copy_(self, non_blocking); + return r; } at::Tensor NPUNativeFunctions::to( diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index 913b93b73e2782dee62239965e3bb7f6ed7d6560..95bb740db159bef654fb063934f68344c1bf257e 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -47,7 +47,7 @@ supported: - squeeze.dim - to.device - to.dtype - - to.dtype_layout + - _to_copy - to.other - tril_indices - triu_indices @@ -70,24 +70,25 @@ custom: - func: npu_format_cast_(Tensor(a!) self, Tensor src) -> Tensor(a!) device_check: NoCheck exposed: True - - func: empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor + - func: empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, int? 
base_addr_aligned_kb=None) -> Tensor dispatch: CompositeExplicitAutograd: empty_with_format - exposed: True - func: unsafe_empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, bool keep_format=False) -> Tensor dispatch: CompositeExplicitAutograd: empty_with_format - func: empty_with_format.names(int[] size, Dimname[]? names, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor dispatch: CompositeExplicitAutograd: empty_with_format - exposed: True - func: copy_memory_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) device_check: NoCheck - exposed: True - func: get_storage_size(Tensor self) -> int - func: npu_format_cast(Tensor self, int acl_format) -> Tensor exposed: True - func: _npu_format_cast(Tensor self, int acl_format) -> Tensor + - func: empty_with_swapped_memory(int[] size, *, ScalarType? dtype=None, Device? device=None) -> Tensor + dispatch: + CompositeExplicitAutograd: empty_with_swapped_memory + exposed: True symint: - as_strided_ diff --git a/torch_npu/csrc/core/NPUBridge.cpp b/torch_npu/csrc/core/NPUBridge.cpp index af03768ab9d6aefa3b3892526b771d06cfdf1f63..5a7a60ebe780d82cea69da949ce9006651e8ebc9 100644 --- a/torch_npu/csrc/core/NPUBridge.cpp +++ b/torch_npu/csrc/core/NPUBridge.cpp @@ -2,27 +2,29 @@ namespace torch_npu { - -NPUStorageImpl* NPUBridge::GetNpuStorageImpl(c10::StorageImpl* storageImpl) { - return static_cast(storageImpl); +NPUStorageImpl *NPUBridge::GetNpuStorageImpl(c10::StorageImpl *storageImpl) +{ + return static_cast(storageImpl); } -NPUStorageImpl* NPUBridge::GetNpuStorageImpl(c10::Storage&& storage) { - return static_cast(storage.unsafeGetStorageImpl()); +NPUStorageImpl *NPUBridge::GetNpuStorageImpl(c10::Storage &&storage) +{ + return static_cast(storage.unsafeGetStorageImpl()); } -NPUStorageImpl* NPUBridge::GetNpuStorageImpl(const at::Tensor &tensor) { - return static_cast(tensor.storage().unsafeGetStorageImpl()); +NPUStorageImpl *NPUBridge::GetNpuStorageImpl(const at::Tensor &tensor) +{ + return static_cast(tensor.storage().unsafeGetStorageImpl()); } -NPUStorageDesc& NPUBridge::GetNpuStorageImplDesc(const at::Tensor &tensor) { - return static_cast(tensor.storage().unsafeGetStorageImpl())->npu_desc_; +NPUStorageDesc &NPUBridge::GetNpuStorageImplDesc(const at::Tensor &tensor) +{ + return static_cast(tensor.storage().unsafeGetStorageImpl())->npu_desc_; } NPUTensorImpl *NPUBridge::GetNpuTensorImpl(const at::Tensor &tensor) { - return static_cast(tensor.unsafeGetTensorImpl()); + return static_cast(tensor.unsafeGetTensorImpl()); } - } \ No newline at end of file diff --git a/torch_npu/csrc/core/NPUSerialization.cpp b/torch_npu/csrc/core/NPUSerialization.cpp index af80b2b6555c6195c8a7e156a4b5ef20557546cf..1ae122f3424db8e8fb79b4932930f61f4783b79a 100644 --- a/torch_npu/csrc/core/NPUSerialization.cpp +++ b/torch_npu/csrc/core/NPUSerialization.cpp @@ -6,51 +6,53 @@ #include "torch_npu/csrc/framework/StorageDescHelper.h" namespace torch_npu { - std::unordered_map FORMAT_INFO = { - {"NC1HWC0", ACL_FORMAT_NC1HWC0}, - {"ND", ACL_FORMAT_ND}, - {"NCHW", ACL_FORMAT_NCHW}, - {"NHWC", ACL_FORMAT_NHWC}, - {"FRACTAL_NZ", ACL_FORMAT_FRACTAL_NZ}, - {"FRACTAL_Z", ACL_FORMAT_FRACTAL_Z}, - {"NDHWC", ACL_FORMAT_NDHWC}, - {"NCDHW", ACL_FORMAT_NCDHW}, - {"NDC1HWC0", ACL_FORMAT_NDC1HWC0}, - {"FRACTAL_Z_3D", ACL_FRACTAL_Z_3D}, + { "NC1HWC0", ACL_FORMAT_NC1HWC0 }, + { "ND", ACL_FORMAT_ND }, + { "NCHW", 
ACL_FORMAT_NCHW }, + { "NHWC", ACL_FORMAT_NHWC }, + { "FRACTAL_NZ", ACL_FORMAT_FRACTAL_NZ }, + { "FRACTAL_Z", ACL_FORMAT_FRACTAL_Z }, + { "NDHWC", ACL_FORMAT_NDHWC }, + { "NCDHW", ACL_FORMAT_NCDHW }, + { "NDC1HWC0", ACL_FORMAT_NDC1HWC0 }, + { "FRACTAL_Z_3D", ACL_FRACTAL_Z_3D }, }; -void npu_info_serialization(const at::Tensor& t, std::unordered_map& map) { - at_npu::native::StorageDescHelper::GetDescForSerialization(t, map); +void npu_info_serialization(const at::Tensor &t, std::unordered_map &map) +{ + at_npu::native::StorageDescHelper::GetDescForSerialization(t, map); } -void npu_info_deserialization(const at::Tensor& t, std::unordered_map& map) { - // Set the true stroage description - at_npu::native::StorageDescHelper::SetDescForSerialization(t, map); +void npu_info_deserialization(const at::Tensor &t, std::unordered_map &map) +{ + // Set the true stroage description + at_npu::native::StorageDescHelper::SetDescForSerialization(t, map); - auto str_to_aclFormat = [](std::string str) -> aclFormat { - int start = 0; - while (str[start++] != '/'); - return FORMAT_INFO[str.substr(start, str.size() - start)]; - }; + auto str_to_aclFormat = [](std::string str) -> aclFormat { + int start = 0; + while (str[start++] != '/') { + ; + } + return FORMAT_INFO[str.substr(start, str.size() - start)]; + }; - for (auto &m : map) { - if (m.first.find("npu_format_") != std::string::npos) { - aclFormat format = str_to_aclFormat(m.first); - // The format cast is an operator, - // so special handling is required for scenarios - // where the leaf node tensor requires grad at the same time - bool revert_flag = false; - if (t.is_leaf() && t.requires_grad()) { - revert_flag = true; - t.set_requires_grad(false); - } - at_npu::native::NPUNativeFunctions::npu_format_cast_(const_cast(t), format); - if (revert_flag) { - t.set_requires_grad(true); - } + for (auto &m : map) { + if (m.first.find("npu_format_") != std::string::npos) { + aclFormat format = str_to_aclFormat(m.first); + // The format cast is an operator, + // so special handling is required for scenarios + // where the leaf node tensor requires grad at the same time + bool revert_flag = false; + if (t.is_leaf() && t.requires_grad()) { + revert_flag = true; + t.set_requires_grad(false); + } + at_npu::native::NPUNativeFunctions::npu_format_cast_(const_cast(t), format); + if (revert_flag) { + t.set_requires_grad(true); + } + } } - } } - } diff --git a/torch_npu/csrc/core/NPUTensorImpl.cpp b/torch_npu/csrc/core/NPUTensorImpl.cpp index a2e57e7700534e6127fa00801064af30eece8b35..78b15a3978a004e50535cd3438815b2e7d5e9eb7 100644 --- a/torch_npu/csrc/core/NPUTensorImpl.cpp +++ b/torch_npu/csrc/core/NPUTensorImpl.cpp @@ -8,56 +8,39 @@ #include "third_party/acl/inc/acl/acl_rt.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" -namespace torch_npu +namespace torch_npu { +NPUTensorImpl::NPUTensorImpl(c10::Storage &&storage, const caffe2::TypeMeta &data_type) + : c10::TensorImpl(std::move(storage), + c10::DispatchKeySet{ c10::DispatchKey::PrivateUse1, c10::DispatchKey::AutogradPrivateUse1 }, data_type) { - NPUTensorImpl::NPUTensorImpl(c10::Storage &&storage, const caffe2::TypeMeta &data_type) - : c10::TensorImpl(std::move(storage), - c10::DispatchKeySet{c10::DispatchKey::PrivateUse1, - c10::DispatchKey::AutogradPrivateUse1}, - data_type) - { is_non_overlapping_and_dense_ = false; - } +} - void NPUTensorImpl::shallow_copy_from(const c10::intrusive_ptr &impl) - { - copy_tensor_metadata( - impl.get(), - this, - version_counter(), - allow_tensor_metadata_change()); +void 
NPUTensorImpl::shallow_copy_from(const c10::intrusive_ptr &impl) +{ + copy_tensor_metadata(impl.get(), this, version_counter(), allow_tensor_metadata_change()); refresh_numel(); refresh_contiguous(); - } +} - c10::intrusive_ptr NPUTensorImpl::shallow_copy_and_detach( - const c10::VariableVersion &version_counter, - bool allow_tensor_metadata_change) const - { +c10::intrusive_ptr NPUTensorImpl::shallow_copy_and_detach(const c10::VariableVersion &version_counter, + bool allow_tensor_metadata_change) const +{ auto impl = c10::make_intrusive(c10::Storage(this->storage()), this->data_type_); - copy_tensor_metadata( - this, - impl.get(), - version_counter, - allow_tensor_metadata_change); + copy_tensor_metadata(this, impl.get(), version_counter, allow_tensor_metadata_change); impl->refresh_numel(); impl->refresh_contiguous(); return impl; - } +} - c10::intrusive_ptr NPUTensorImpl::shallow_copy_and_detach( - c10::VariableVersion &&version_counter, - bool allow_tensor_metadata_change) const - { +c10::intrusive_ptr NPUTensorImpl::shallow_copy_and_detach(c10::VariableVersion &&version_counter, + bool allow_tensor_metadata_change) const +{ auto impl = c10::make_intrusive(c10::Storage(this->storage()), this->data_type_); - copy_tensor_metadata( - this, - impl.get(), - std::move(version_counter), - allow_tensor_metadata_change); + copy_tensor_metadata(this, impl.get(), std::move(version_counter), allow_tensor_metadata_change); impl->refresh_numel(); impl->refresh_contiguous(); return impl; - } - NPUTensorImpl::~NPUTensorImpl() {} +} +NPUTensorImpl::~NPUTensorImpl() {} } diff --git a/torch_npu/csrc/core/OverflowUtils.cpp b/torch_npu/csrc/core/OverflowUtils.cpp index f77b3db4e73ba7aa83893b0a5d7f112746f8b3d7..42994b83878a771cc4e737b959225996f7b392ac 100644 --- a/torch_npu/csrc/core/OverflowUtils.cpp +++ b/torch_npu/csrc/core/OverflowUtils.cpp @@ -6,34 +6,35 @@ namespace torch_npu { namespace utils { - OverflowUtil::OverflowUtil() {} OverflowUtil::~OverflowUtil() {} -void OverflowUtil::EnableOverflowNpu() { - auto result = c10_npu::NpuSysCtrl::GetInstance().OverflowSwitchEnable(); - return; +void OverflowUtil::EnableOverflowNpu() +{ + auto result = c10_npu::NpuSysCtrl::GetInstance().OverflowSwitchEnable(); + return; } -bool OverflowUtil::CheckOverflowNpu() { - auto options = at::TensorOptions(c10::DeviceType::PrivateUse1).dtype(at::kFloat); - at::Tensor tmp = at::empty({8}, options); - auto floatStatus = op_plugin::npu_alloc_float_status(tmp); - auto result = op_plugin::npu_get_float_status(floatStatus); - if (result.cpu()[0].item().toInt() != 0) { - return true; - } - return false; +bool OverflowUtil::CheckOverflowNpu() +{ + auto options = at::TensorOptions(c10::DeviceType::PrivateUse1).dtype(at::kFloat); + at::Tensor tmp = at::empty({ 8 }, options); + auto floatStatus = op_plugin::npu_alloc_float_status(tmp); + auto result = op_plugin::npu_get_float_status(floatStatus); + if (result.cpu()[0].item().toInt() != 0) { + return true; + } + return false; } -void OverflowUtil::ClearOverflowNpu() { - auto options = at::TensorOptions(c10::DeviceType::PrivateUse1).dtype(at::kFloat); - at::Tensor tmp = at::empty({8}, options); - auto floatStatus = op_plugin::npu_alloc_float_status(tmp); - auto result = op_plugin::npu_clear_float_status(floatStatus); - return; +void OverflowUtil::ClearOverflowNpu() +{ + auto options = at::TensorOptions(c10::DeviceType::PrivateUse1).dtype(at::kFloat); + at::Tensor tmp = at::empty({ 8 }, options); + auto floatStatus = op_plugin::npu_alloc_float_status(tmp); + auto result = 
op_plugin::npu_clear_float_status(floatStatus); + return; } - } } diff --git a/torch_npu/csrc/core/OverflowUtils.h b/torch_npu/csrc/core/OverflowUtils.h index 8c4c9607c556d6f6ae104b96d414a68db21b962a..7267ecbf1615613a9bfc784e07a6e6475ba316d7 100644 --- a/torch_npu/csrc/core/OverflowUtils.h +++ b/torch_npu/csrc/core/OverflowUtils.h @@ -4,24 +4,23 @@ namespace torch_npu { namespace utils { - class OverflowUtil { public: - ~OverflowUtil(); + ~OverflowUtil(); - static OverflowUtil *GetInstance() { - static OverflowUtil instance; - return &instance; - } + static OverflowUtil *GetInstance() + { + static OverflowUtil instance; + return &instance; + } - void EnableOverflowNpu(); - bool CheckOverflowNpu(); - void ClearOverflowNpu(); + void EnableOverflowNpu(); + bool CheckOverflowNpu(); + void ClearOverflowNpu(); private: - OverflowUtil(); - bool hasOverflow = false; + OverflowUtil(); + bool hasOverflow = false; }; - } } diff --git a/torch_npu/csrc/core/npu/CachingHostAllocator.cpp b/torch_npu/csrc/core/npu/CachingHostAllocator.cpp index cfc63b6610801fb9f2ed23936ac32d91656a4fe5..999ca976071dc4b503fb0d32d1b5c6ce6551405c 100644 --- a/torch_npu/csrc/core/npu/CachingHostAllocator.cpp +++ b/torch_npu/csrc/core/npu/CachingHostAllocator.cpp @@ -306,23 +306,28 @@ private: }; } // namespace -static HostAllocator allocator; +static HostAllocator& getHostAllocator() +{ + // Construct allocator inside a function to prevent initialization when import + static HostAllocator allocator; + return allocator; +} aclError CachingHostAllocator_recordEvent( void *ptr, c10_npu::NPUStream stream) { - return allocator.recordEvent(ptr, stream); + return getHostAllocator().recordEvent(ptr, stream); } bool CachingHostAllocator_isPinned(void *ptr) { - return allocator.isPinndPtr(ptr); + return getHostAllocator().isPinndPtr(ptr); } void CachingHostAllocator_emptyCache() { - allocator.emptyCache(); + getHostAllocator().emptyCache(); } static void CachingHostDeleter(void *ptr) @@ -332,13 +337,13 @@ static void CachingHostDeleter(void *ptr) if (PyGILState_Check()) { // the current thread should not hold GIL. 
Py_BEGIN_ALLOW_THREADS - allocator.free(ptr); + getHostAllocator().free(ptr); Py_END_ALLOW_THREADS } else { - allocator.free(ptr); + getHostAllocator().free(ptr); } #else - allocator.free(ptr); + getHostAllocator().free(ptr); #endif } @@ -348,7 +353,7 @@ struct CachingHostAllocator final : public at::Allocator { AT_ASSERT(size >= 0, PTA_ERROR(ErrCode::VALUE)); void *ptr = nullptr; if (size > 0) { - if (allocator.malloc(&ptr, size) != ACL_ERROR_NONE) { + if (getHostAllocator().malloc(&ptr, size) != ACL_ERROR_NONE) { ASCEND_LOGE("allocate host pinned memory fail"); } } diff --git a/torch_npu/csrc/core/npu/DeviceUtils.h b/torch_npu/csrc/core/npu/DeviceUtils.h index dbe6b0eb91bf6a206dc005f08d29663aabe87d99..9cd0bdc9d21e92ea136b786290cb57ceb43f7ecb 100644 --- a/torch_npu/csrc/core/npu/DeviceUtils.h +++ b/torch_npu/csrc/core/npu/DeviceUtils.h @@ -11,70 +11,80 @@ namespace torch_npu { namespace utils { -inline bool is_npu(const at::Tensor& tensor) { - if (!tensor.defined()) { - return false; - } - return tensor.device().is_privateuseone(); +inline bool is_npu(const at::Tensor& tensor) +{ + if (!tensor.defined()) { + return false; + } + return tensor.device().is_privateuseone(); } -inline bool is_npu(const at::TensorOptions& options) { - return options.device().is_privateuseone(); +inline bool is_npu(const at::TensorOptions& options) +{ + return options.device().is_privateuseone(); } -inline bool is_npu(const at::Device& device) { - return device.is_privateuseone(); +inline bool is_npu(const at::Device& device) +{ + return device.is_privateuseone(); } -inline void torch_check_npu(const at::Tensor& tensor) { - TORCH_CHECK(is_npu(tensor), - "Expected NPU tensor, please check whether the input tensor device is correct.", PTA_ERROR(ErrCode::PARAM)); +inline void torch_check_npu(const at::Tensor& tensor) +{ + TORCH_CHECK(is_npu(tensor), + "Expected NPU tensor, please check whether the input tensor device is correct.", PTA_ERROR(ErrCode::PARAM)); } -inline void torch_check_npu(const at::TensorOptions& options) { - TORCH_CHECK(is_npu(options), - "Expected NPU tensor, please check whether the input tensor device is correct.", PTA_ERROR(ErrCode::PARAM)); +inline void torch_check_npu(const at::TensorOptions& options) +{ + TORCH_CHECK(is_npu(options), + "Expected NPU tensor, please check whether the input tensor device is correct.", PTA_ERROR(ErrCode::PARAM)); } -inline void torch_check_npu(const at::Device& device) { - TORCH_CHECK(is_npu(device), - "Expected NPU tensor, please check whether the input tensor device is correct.", PTA_ERROR(ErrCode::PARAM)); +inline void torch_check_npu(const at::Device& device) +{ + TORCH_CHECK(is_npu(device), + "Expected NPU tensor, please check whether the input tensor device is correct.", PTA_ERROR(ErrCode::PARAM)); } -inline c10::DeviceType get_npu_device_type() { - return c10::DeviceType::PrivateUse1; +inline c10::DeviceType get_npu_device_type() +{ + return c10::DeviceType::PrivateUse1; } -inline void maybe_initialize_npu(const at::TensorOptions& options) { - if (torch_npu::utils::is_npu(options)) { - c10_npu::NpuSysCtrl::SysStatus status = - c10_npu::NpuSysCtrl::GetInstance().Initialize(options.device().index()); - if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { - TORCH_CHECK(false, "npu device ", options.device().index(), " init failed.", PTA_ERROR(ErrCode::ACL)); - } +inline void maybe_initialize_npu(const at::TensorOptions& options) +{ + if (torch_npu::utils::is_npu(options)) { + c10_npu::NpuSysCtrl::SysStatus status = + 
c10_npu::NpuSysCtrl::GetInstance().Initialize(options.device().index()); + if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + TORCH_CHECK(false, "npu device ", options.device().index(), " init failed.", PTA_ERROR(ErrCode::ACL)); + } #ifndef BUILD_LIBTORCH - torch_npu::utils::npu_lazy_init(); + torch_npu::utils::npu_lazy_init(); #endif - } + } } -inline void maybe_initialize_npu(const at::Device& device) { - if (torch_npu::utils::is_npu(device)) { - c10_npu::NpuSysCtrl::SysStatus status = - c10_npu::NpuSysCtrl::GetInstance().Initialize(device.index()); - if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { - TORCH_CHECK(false, "npu device ", device.index(), " init failed.", PTA_ERROR(ErrCode::ACL)); - } +inline void maybe_initialize_npu(const at::Device& device) +{ + if (torch_npu::utils::is_npu(device)) { + c10_npu::NpuSysCtrl::SysStatus status = + c10_npu::NpuSysCtrl::GetInstance().Initialize(device.index()); + if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + TORCH_CHECK(false, "npu device ", device.index(), " init failed.", PTA_ERROR(ErrCode::ACL)); + } #ifndef BUILD_LIBTORCH - torch_npu::utils::npu_lazy_init(); + torch_npu::utils::npu_lazy_init(); #endif - } + } } -inline void maybe_initialize_npu(const c10::optional& device) { - if (device) { - maybe_initialize_npu(*device); - } +inline void maybe_initialize_npu(const c10::optional& device) +{ + if (device) { + maybe_initialize_npu(*device); + } } } diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp new file mode 100644 index 0000000000000000000000000000000000000000..09b2171af3ff6830761c988dc0a3f3bfbd2d6300 --- /dev/null +++ b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.cpp @@ -0,0 +1,113 @@ +#include +#include "torch_npu/csrc/core/npu/interface/DcmiInterface.h" +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" + +constexpr int NPU_OK = 0; + +static int DcmiInit() +{ + int ret = c10_npu::dcmi::DcmiInit(); + if (ret != NPU_OK) { + TORCH_CHECK(false, "Failed to init dcmi. ", PTA_ERROR(ErrCode::INTERNAL)); + } + return ret; +} + +std::string GetAffinityCPUBaseInfo(int card_id) +{ + int ret = DcmiInit(); + int device_id = 0; + int device_id_max = 0; + int mcu_id = 0; + int cpu_id = 0; + ret = c10_npu::dcmi::DcmiGetDeviceIdInCard(card_id, &device_id_max, &mcu_id, &cpu_id); + if (ret != NPU_OK) { + TORCH_NPU_WARN_ONCE("dcmi_get_device_id_in_card is not supported. " + "The npu_affine configuration of CPU_AFFINITY_CONF will be disabled."); + return ""; + } + device_id = std::max(0, device_id_max - 1); + char affinity_cpu[TOPO_INFO_MAX_LENTH] = {0}; + int length = 0; + ret = c10_npu::dcmi::DcmiGetAffinityCpuInfoByDeviceId(card_id, device_id, affinity_cpu, &length); + if (ret == NPU_OK) { + return affinity_cpu; + } + TORCH_NPU_WARN_ONCE("dcmi_get_affinity_cpu_info_by_device_id is not supported. 
" + "The npu_affine configuration of CPU_AFFINITY_CONF will be disabled."); + return ""; +} + +std::unordered_map CardIdAffinityCPU; + +c10_npu::CoreIdRange parseAffinityCPU(const std::string cpuString) +{ + size_t pos = cpuString.find("-"); + if (pos != std::string::npos) { + std::string start = cpuString.substr(0, pos); + std::string end = cpuString.substr(pos + 1); + int startNum = stoi(start); + int endNum = stoi(end); + if (startNum < endNum) { + return c10_npu::CoreIdRange{startNum, endNum}; + } + } + TORCH_CHECK(false, "affinity cpu " + cpuString + " is error ", PTA_ERROR(ErrCode::VALUE)); +} + +void GetExclusiveAffinityCPU() +{ + int ret = DcmiInit(); + int device_count = 0; + int card_id_list[16]; + int list_len = 16; + ret = c10_npu::dcmi::DcmiGetCardNumList(&device_count, card_id_list, list_len); + std::unordered_map SameAffinityCpuNum; + std::map CardIdAffinityCpuDefault; + for (int i = 0; i < device_count; i++) { + std::string affinity_cpu = GetAffinityCPUBaseInfo(i); + if (affinity_cpu.empty()) { + return; + } + CardIdAffinityCpuDefault[i] = affinity_cpu; + auto it = SameAffinityCpuNum.find(affinity_cpu); + if (it != SameAffinityCpuNum.end()) { + SameAffinityCpuNum[affinity_cpu] = it->second + 1; + } else { + SameAffinityCpuNum[affinity_cpu] = 1; + } + } + std::unordered_map offsetMap; + for (const auto& it : CardIdAffinityCpuDefault) { + int card_id = it.first; + std::string affinity_cpu = it.second; + int same_num = 1; + auto find_same_affinity_cpu = SameAffinityCpuNum.find(affinity_cpu); + if (find_same_affinity_cpu != SameAffinityCpuNum.end()) { + same_num = find_same_affinity_cpu->second; + } + int offset = 0; + auto find_offset = offsetMap.find(affinity_cpu); + if (find_offset != offsetMap.end()) { + offset = find_offset->second; + } + c10_npu::CoreIdRange cpu_range = parseAffinityCPU(affinity_cpu); + int length = (cpu_range.end - cpu_range.start + 1) / same_num; + c10_npu::CoreIdRange exclusiveAffinityCpu = {cpu_range.start + offset * length, (cpu_range.start + length - 1) + offset * length}; + offsetMap[affinity_cpu] = offset + 1; + CardIdAffinityCPU[card_id] = exclusiveAffinityCpu; + } +} + +c10_npu::CoreIdRange GetAssignAffinityCPU(int card_id) +{ + GetExclusiveAffinityCPU(); + if (CardIdAffinityCPU.empty()) { + return {0, 0}; + } + auto it = CardIdAffinityCPU.find(card_id); + if (it != CardIdAffinityCPU.end()) { + return it->second; + } +} diff --git a/torch_npu/csrc/core/npu/GetAffinityCPUInfo.h b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..a0ce50a201f9a9801a0f72e0a3e2ba639ae5c587 --- /dev/null +++ b/torch_npu/csrc/core/npu/GetAffinityCPUInfo.h @@ -0,0 +1,10 @@ +#ifndef THNP_GETAFFINITY_INC +#define THNP_GETAFFINITY_INC +#include + +std::string GetAffinityCPUBaseInfo(int card_id); +c10_npu::CoreIdRange parseAffinityCPU(const std::string cpuString); +void GetExclusiveAffinityCPU(); +c10_npu::CoreIdRange GetAssignAffinityCPU(int card_id); + +#endif \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp index 84597850ed77bb59ca9003f4022485857fc73c84..f834d9357a61d518367cff4a2624f6df7532f21b 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -5,6 +5,10 @@ #include "torch_npu/csrc/core/npu/interface/AclInterface.h" #include "third_party/acl/inc/acl/acl.h" +constexpr size_t kVersionIndex1 = 1; +constexpr size_t kVersionIndex2 = 2; +constexpr size_t kVersionIndex3 = 3; +constexpr 
size_t kVersionIndex4 = 4; std::unordered_map packageNameMap = { {"CANN", ACL_PKG_NAME_CANN}, @@ -17,38 +21,109 @@ std::unordered_map packageNameMap = { {"DRIVER", ACL_PKG_NAME_DRIVER} }; -double VersionToNum(std::string versionStr) +int64_t VersionToNum(std::string versionStr) { std::smatch results; - int major = -1; - int minor = -1; - int release = -1; - int RCVersion = -51; - int TVersion = -1; - int alphaVersion = 0; + int64_t major = -1; + int64_t minor = -1; + int64_t release = -1; + int64_t RCVersion = -51; + int64_t TVersion = -1; + int64_t alphaVersion = 0; if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+)"))) { - major = stoi(results[1]); - minor = stoi(results[2]); - RCVersion = stoi(results[3]); + major = stoll(results[kVersionIndex1]); + minor = stoll(results[kVersionIndex2]); + RCVersion = stoll(results[kVersionIndex3]); } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).([0-9]+)"))) { - major = stoi(results[1]); - minor = stoi(results[2]); - release = stoi(results[3]); + major = stoll(results[kVersionIndex1]); + minor = stoll(results[kVersionIndex2]); + release = stoll(results[kVersionIndex3]); } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).T([0-9]+)"))) { - major = stoi(results[1]); - minor = stoi(results[2]); - TVersion = stoi(results[3]); + major = stoll(results[kVersionIndex1]); + minor = stoll(results[kVersionIndex2]); + TVersion = stoll(results[kVersionIndex3]); } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+).alpha([0-9]+)"))) { - major = stoi(results[1]); - minor = stoi(results[2]); - RCVersion = stoi(results[3]); - alphaVersion = stoi(results[4]); + major = stoll(results[kVersionIndex1]); + minor = stoll(results[kVersionIndex2]); + RCVersion = stoll(results[kVersionIndex3]); + alphaVersion = stoll(results[kVersionIndex4]); } else { TORCH_NPU_WARN_ONCE("Version: " + versionStr + " is invalid."); - return 0.0; + return 0; } - double num = ((major + 1) * 100000000) + ((minor + 1) * 1000000) + ((release + 1) * 10000) + ((RCVersion + 1) * 100 + 5000) + ((TVersion + 1) * 100) - (100 - alphaVersion); + int64_t num = ((major + 1) * 100000000) + + ((minor + 1) * 1000000) + + ((release + 1) * 10000) + + ((RCVersion + 1) * 100 + 5000) + + ((TVersion + 1) * 100) - (100 - alphaVersion); + return num; +} + +int64_t DriverVersionToNum(std::string versionStr) +{ + std::smatch results; + int64_t major = -1; + int64_t minor = -1; + int64_t release = -1; + int64_t TVersion = -1; + int64_t RCVersion = -51; + int64_t patch = 0; + int64_t bVersion = 1; + int64_t alphaVersion = 0; + // driver version check only supports pattern listed here: + // pattern is major.minor.release.patch. release:num or RC+num or T+num, patch: num or alpha+num or beta+num. 
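`VersionToNum` above (and the formula quoted in `GetCANNInfo.h`) folds a version string into one comparable integer as ((major+1) * 100000000) + ((minor+1) * 1000000) + ((release+1) * 10000) + ((RC+1) * 100 + 5000) + ((T+1) * 100) - (100 - alpha), with defaults of -1 (and -51 for the RC field) chosen so that absent components contribute zero. For instance, the "8.1.RC1" baseline maps to 902005100. A small illustrative re-implementation of just the x.y.RCz branch:

```python
import re


def cann_rc_version_to_num(version: str) -> int:
    """Illustrative re-implementation of VersionToNum for the 'x.y.RCz' pattern only;
    the C++ above also handles x.y.z, x.y.Tz and x.y.RCz.alphaN with the same weights."""
    m = re.fullmatch(r"(\d+)\.(\d+)\.RC(\d+)", version)
    if m is None:
        return 0
    major, minor, rc = (int(g) for g in m.groups())
    release, t_version, alpha = -1, -1, 0    # defaults used when a field is absent
    return ((major + 1) * 100_000_000 + (minor + 1) * 1_000_000
            + (release + 1) * 10_000 + ((rc + 1) * 100 + 5000)
            + (t_version + 1) * 100 - (100 - alpha))


assert cann_rc_version_to_num("8.1.RC1") == 902005100     # baseline used by IsGteCANNVersion
assert cann_rc_version_to_num("8.2.RC1") > cann_rc_version_to_num("8.1.RC1")
```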
+ std::regex re_rc("([0-9]+).([0-9]+).RC([0-9]+)", std::regex::icase); + std::regex re_num("([0-9]+).([0-9]+).([0-9]+)"); + std::regex re_rc_num("([0-9]+).([0-9]+).RC([0-9]+).([0-9]+)", std::regex::icase); + std::regex re_num_num("([0-9]+).([0-9]+).([0-9]+).([0-9]+)"); + std::regex re_t("([0-9]+).([0-9]+).T([0-9]+)", std::regex::icase); + std::regex re_rc_beta("([0-9]+).([0-9]+).RC([0-9]+).beta([0-9]+)", std::regex::icase); + std::regex re_rc_alpha("([0-9]+).([0-9]+).RC([0-9]+).alpha([0-9]+)", std::regex::icase); + if (std::regex_match(versionStr, results, re_rc)) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + RCVersion = stoi(results[kVersionIndex3]); + } else if (std::regex_match(versionStr, results, re_rc_num)) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + RCVersion = stoi(results[kVersionIndex3]); + patch = stoi(results[kVersionIndex4]); + } else if (std::regex_match(versionStr, results, re_num)) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + release = stoi(results[kVersionIndex3]); + } else if (std::regex_match(versionStr, results, re_num_num)) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + release = stoi(results[kVersionIndex3]); + patch = stoi(results[kVersionIndex4]); + } else if (std::regex_match(versionStr, results, re_t)) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + TVersion = stoi(results[kVersionIndex3]); + } else if (std::regex_match(versionStr, results, re_rc_beta)) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + RCVersion = stoi(results[kVersionIndex3]); + bVersion = stoi(results[kVersionIndex4]); + } else if (std::regex_match(versionStr, results, re_rc_alpha)) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + RCVersion = stoi(results[kVersionIndex3]); + alphaVersion = stoi(results[kVersionIndex4]); + } else { + TORCH_NPU_WARN_ONCE("Driver Version: " + versionStr + " is invalid or not supported yet."); + return 0; + } + + int64_t num = ((major + 1) * 100000000) + + ((minor + 1) * 1000000) + + ((release + 1) * 10000) + + ((RCVersion + 1) * 100 + 5000) + + ((TVersion + 1) * 100) - + (alphaVersion ? 
1 : 0) * (100 - alphaVersion) + + (bVersion - 1) + patch; return num; } @@ -82,15 +157,38 @@ std::string GetCANNVersion(const std::string& module) bool IsGteCANNVersion(const std::string version, const std::string module) { static std::string baseVersion = "8.1.RC1"; + static std::string unsupportedModule = "DRIVER"; + if (module.compare(unsupportedModule) == 0) { + TORCH_CHECK(false, "When the module is DRIVER, this function is not supported.", PTA_ERROR(ErrCode::VALUE)); + } if (version.compare(baseVersion) < 0) { - TORCH_CHECK(false, "When the version is less than \"8.1.RC1\", this function is not supported.", PTA_ERROR(ErrCode::VALUE)); + TORCH_CHECK(false, "When the version " + version + " is less than \"8.1.RC1\", this function is not supported.", PTA_ERROR(ErrCode::VALUE)); } std::string currentVersion = GetCANNVersion(module); - double current_num = VersionToNum(currentVersion); - double boundary_num = VersionToNum(version); + int64_t current_num = VersionToNum(currentVersion); + int64_t boundary_num = VersionToNum(version); if (current_num >= boundary_num) { return true; } else { return false; } +} + +bool IsGteDriverVersion(const std::string driverVersion) +{ + // if cann does not support AclsysGetCANNVersion,GetCANNVersion("DRIVER") will return "". + // The result of this function will be false, even if current driver version meets the requirement. + const static std::string baseCANNVersion = "8.1.RC1"; + std::string currentCANNVersion = GetCANNVersion("CANN"); + int64_t currentCannNum = VersionToNum(currentCANNVersion); + int64_t boundaryCannNum = VersionToNum(baseCANNVersion); + if (currentCannNum < boundaryCannNum) { + TORCH_CHECK(false, "When the cann version is less than \"8.1.RC1\", this function is not supported.", + PTA_ERROR(ErrCode::VALUE)); + } + // check driver version + std::string currentDriverVersion = GetCANNVersion("DRIVER"); + double currentDriverNum = DriverVersionToNum(currentDriverVersion); + double boundaryDriverNum = DriverVersionToNum(driverVersion); + return currentDriverNum >= boundaryDriverNum; } \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.h b/torch_npu/csrc/core/npu/GetCANNInfo.h index 8c3aa86c6b1badeb3436c673d23b85e0209c8b51..917b7edde45f0416a066c55c21e404ce4ed50c81 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.h +++ b/torch_npu/csrc/core/npu/GetCANNInfo.h @@ -11,4 +11,6 @@ formula: ((a+1) * 100000000) + ((b+1) * 1000000) + ((c+1) * 10000) + ((d+1) * 10 */ bool IsGteCANNVersion(const std::string version, const std::string module = "CANN"); +bool IsGteDriverVersion(const std::string driverVersion); + #endif \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 2ffe862c52f5fed432bdf583b347f595785d0d36..c628f04490d02f1156444b7024fd07f26163fec1 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -1,293 +1,319 @@ - #include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" +#include "torch_npu/csrc/core/npu/GetAffinityCPUInfo.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" #include #include #include -#include #include #include #include -#include -#include namespace c10_npu { - static pthread_t mainthread_tid; - static bool has_set_affinity = false; - - const std::unordered_map threadTypeToNameMap = { - {releaseThread, "release_thread"}, - {aclThread, "acl_thread"}, - {mainThread, "main_thread"}, - 
{hcclCommWatchdogThread, "hcclComm_watchd"}, // thread name no more than 15 chars - {backwardThread, "backward_thread"}}; - - const std::unordered_map threadNameToTypeMap = { - {"release_thread", releaseThread}, - {"acl_thread", aclThread}, - {"main_thread", mainThread}, - {"hcclComm_watchd", hcclCommWatchdogThread}, - {"backward_thread", backwardThread}}; - - inline bool has_set_pthread_affinity() - { - unsigned int core_nums = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); - - cpu_set_t mask; - pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); - for (unsigned int i = 0; i < core_nums; i++) { - if (!CPU_ISSET(i, &mask)) { - return true; - } - } - return false; - } +static thread_local ThreadType local_thread = ThreadType::MAIN_THREAD; - void GetAffinityInfo() - { - mainthread_tid = pthread_self(); - has_set_affinity = has_set_pthread_affinity(); - } +using ThreadCoreMap = std::unordered_map; - ThreadType getCurrentThreadType() - { - char thread_name[16]; +static uint32_t cpu_affinity_mode; +static std::vector device_ranges; +static std::unordered_map device_thread_core_maps; - if (prctl(PR_GET_NAME, thread_name, 0, 0, 0) == 0) { - std::string name(thread_name); +const std::initializer_list threadTypeList = { + MAIN_THREAD, ACL_THREAD, RELEASE_THREAD, WATCHDOG_THREAD, OTHER_THREAD}; - auto it = threadNameToTypeMap.find(name); - if (it != threadNameToTypeMap.end()) { - return it->second; - } - } - return ThreadType::unknownThread; - } +const std::unordered_map threadTypeToNameMap = { + {MAIN_THREAD, "main_thread"}, + {ACL_THREAD, "acl_thread"}, + {RELEASE_THREAD, "release_thread"}, + {WATCHDOG_THREAD, "hccl_watchdog_t"}, + {OTHER_THREAD, "other_thread"}}; - aclError SetThreadAffinity(coreIdRange core_range, pthread_t thread) - { - cpu_set_t mask; - CPU_ZERO(&mask); +CoreIdRange getCPUDefaultRange(c10::DeviceIndex device_id) +{ + static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); + int device_nums = device_count_ensure_non_zero(); + int block_size = (core_nums > 0 && device_nums > 0) ? core_nums / device_nums : 0; + return CoreIdRange{static_cast(device_id * block_size), + static_cast((device_id + 1) * block_size - 1)}; +} - for (auto i = core_range.start; i <= core_range.end; i++) { - CPU_SET(i, &mask); - } - if (!pthread_setaffinity_np(thread, sizeof(mask), &mask)) { - ASCEND_LOGD("Set Thread Affinity to %d-%d", core_range.start, core_range.end); - return ACL_ERROR_NONE; - } - return ACL_ERROR_FEATURE_UNSUPPORTED; +inline bool isAllDigits(const std::string &str) +{ + if (str.empty()) { + return false; } - - coreIdRange GetCPUDefaultRange(c10::DeviceIndex device_id) - { - int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - int device_nums = device_count_ensure_non_zero(); - int block_size = (core_nums > 0 && device_nums > 0) ? 
(core_nums + device_nums - 1) / device_nums : 0; - return coreIdRange{static_cast(device_id * block_size), - static_cast(std::min((device_id + 1) * block_size, core_nums) - 1)}; + return std::all_of(str.begin(), str.end(), [](unsigned char c) { + return std::isdigit(c); + }); +} + +void parseCPUAffinityConf(uint32_t &mode, std::vector &ranges) +{ + // init + int device_nums = device_count_ensure_non_zero(); + ranges.clear(); + ranges.resize(device_nums); + for (int i = 0; i < device_nums; ++i) { + ranges[i] = getCPUDefaultRange(i); } + mode = 0; + const char *input = c10_npu::option::OptionsManager::GetCpuAffinityConf(); + if (input == nullptr || strlen(input) == 0) { + return; + } - std::string GetAffinityMapAsString(const std::unordered_map &threadToCoreidMap, c10::DeviceIndex device_id) - { - std::ostringstream oss; - oss << "threadToCoreidMap plan to bind device " << static_cast(device_id) << " to " - << " [" << threadToCoreidMap.at(unknownThread).start << "," << threadToCoreidMap.at(unknownThread).end << "]、" - << " [" << threadToCoreidMap.at(mainThread).start << "," << threadToCoreidMap.at(mainThread).end << "]、" - << " [" << threadToCoreidMap.at(backwardThread).start << "," << threadToCoreidMap.at(backwardThread).end << "]、" - << " [" << threadToCoreidMap.at(aclThread).start << "," << threadToCoreidMap.at(aclThread).end << "]、" - << " [" << threadToCoreidMap.at(releaseThread).start << "," << threadToCoreidMap.at(releaseThread).end << "]、" - << " [" << threadToCoreidMap.at(hcclCommWatchdogThread).start << "," << threadToCoreidMap.at(hcclCommWatchdogThread).end << "]"; + std::string inputStr(input); + std::istringstream stream(inputStr); + std::string option; + + std::regex pattern("npu_affine:(\\d)"); + std::smatch match; + if (std::regex_search(inputStr, match, pattern)) { + int isAffinity = std::stoi(match[1].str()); + if (isAffinity != 0) { + for (int i = 0; i < device_nums; i++) { + CoreIdRange getRange = GetAssignAffinityCPU(i); + if (getRange.start == 0 && getRange.end == 0) { + break; + } + ranges[i] = getRange; + } + } + } - return oss.str(); + // Handle cases where only `mode` is provided, or `mode:` without value + if (isAllDigits(inputStr)) { + mode = static_cast(std::stoi(inputStr)); + return; // Return directly, `mode` has already been processed } - std::unordered_map GetCpuAffinityMap(c10::DeviceIndex device_id) - { - std::unordered_map threadToCoreidMap; - std::initializer_list thread_types = {unknownThread, mainThread, backwardThread, aclThread, - releaseThread, hcclCommWatchdogThread}; - - coreIdRange current_core_range = GetCPUDefaultRange(device_id); - coreId offset = current_core_range.start; - - // calculate env2 default map - coreId core_nums = current_core_range.end - current_core_range.start; - if (core_nums < thread_types.size()) { - ASCEND_LOGW("Available core numbers (%d) are insufficient for all %zu thread types. 
Binding available cores to all threads.", - core_nums, thread_types.size()); - for (auto thread_type : thread_types) { - threadToCoreidMap[thread_type] = current_core_range; - } - } else { - int remaining_type_count = thread_types.size() - 1; - int i = 0; - for (auto thread_type : thread_types) { - if (thread_type == ThreadType::unknownThread) { - threadToCoreidMap[ThreadType::unknownThread] = coreIdRange{current_core_range.start + remaining_type_count, current_core_range.end}; + // Parse each option + while (std::getline(stream, option, ',')) { + // Split `option` based on colon + size_t colonPos = option.find(':'); + if (colonPos != std::string::npos) { + std::string key = option.substr(0, colonPos); + std::string value = option.substr(colonPos + 1); + + // Process `mode` + if (key == "mode") { + if (isAllDigits(value)) { + mode = static_cast(std::stoi(value)); } else { - threadToCoreidMap[thread_type] = coreIdRange{offset + i, offset + (i++)}; + ASCEND_LOGW("mode is %s, should be all digits", value.c_str()); + } + } else if (key.rfind("npu", 0) == 0) { + // Handle NPU core binding range + // The key is like 'npu:0', so skip first 3 chars. + if (isAllDigits(key.substr(3))) { + int device_id = std::stoi(key.substr(3)); // Parse NPU device ID + if (device_id < device_nums) { + size_t dashPos = value.find('-'); + if (dashPos != std::string::npos) { + std::string startStr = value.substr(0, dashPos); + std::string endStr = value.substr(dashPos + 1); + if (isAllDigits(startStr) && isAllDigits(endStr)) { + CoreId start = static_cast(std::stoi(startStr)); + CoreId end = static_cast(std::stoi(endStr)); + ranges[device_id] = {start, end}; + } else { + ASCEND_LOGW("core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); + } + } else { + if (isAllDigits(value)) { + CoreId singleCore = static_cast(std::stoi(value)); + ranges[device_id] = {singleCore, singleCore}; + } else { + ASCEND_LOGW("core range is string : %s, should be all digits", value.c_str()); + } + } + } } } + } else if (isAllDigits(option)) { + // If no colon and the value is a number, use it directly as `mode` + mode = static_cast(std::stoi(option)); } - - ASCEND_LOGD("Thread affinity map for device %d: %s", device_id, GetAffinityMapAsString(threadToCoreidMap, device_id).c_str()); - - return threadToCoreidMap; } +} + +void printCoreRanges(const uint32_t mode, const std::vector &ranges) +{ + std::ostringstream oss; + oss << "Mode: " << mode << ". 
Core range for each device ID: "; - aclError SetThreadAffinity(c10::DeviceIndex device_id) - { - return SetThreadAffinity(device_id, getCurrentThreadType()); + for (size_t i = 0; i < ranges.size(); ++i) { + oss << "Device " << i << ": [" << ranges[i].start << "," << ranges[i].end << "]"; + if (i != ranges.size() - 1) { + oss << "; "; + } else { + oss << "."; + } } - void printCoreRanges(const std::vector &ranges, uint32_t mode) - { - std::ostringstream oss; - oss << "Mode: " << mode << " "; + ASCEND_LOGD("Read CPU affinity config: %s", oss.str().c_str()); +} - for (size_t i = 0; i < ranges.size(); ++i) { - oss << "Device " << i << " Core Range: " << ranges[i].start << " - " << ranges[i].end << " "; - } +bool getThreadAffinityInfo() +{ + parseCPUAffinityConf(cpu_affinity_mode, device_ranges); + printCoreRanges(cpu_affinity_mode, device_ranges); - ASCEND_LOGD("Core ranges: %s", oss.str().c_str()); + if (cpu_affinity_mode == 0) { + return false; } - bool isAllDigits(const std::string &str) - { - if (str.empty()) { - return false; + cpu_set_t mask; + pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); + for (auto &range : device_ranges) { + for (unsigned int i = range.start; i < range.end; i++) { + if (!CPU_ISSET(i, &mask)) { + ASCEND_LOGW("Thread affinity is already set."); + return false; + } } - return std::all_of(str.begin(), str.end(), [](unsigned char c) { - return std::isdigit(c); - }); } - - void parseCPUAffinityConf(uint32_t &mode, std::vector &ranges) - { - const char *input = c10_npu::option::OptionsManager::GetCpuAffinityConf(); - - if (input == nullptr || strlen(input) == 0) { - mode = 0; - return; + return true; +} + +inline bool needToSetThreadAffinity() +{ + static bool need_to_set_affinity = getThreadAffinityInfo(); + return need_to_set_affinity; +} + +void SetThreadType(ThreadType type) +{ + // Called at the start of the thread's execution to avoid frequent triggering of this function. 
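For reference, the grammar accepted by parseCPUAffinityConf above is a comma-separated list in which a bare number sets `mode`, `mode:<n>` sets it explicitly, and `npu<id>:<start>-<end>` (or `npu<id>:<core>`) overrides the default per-device core range. The snippet below restates that grammar in a self-contained form purely for illustration; it does not go through OptionsManager and the sample string is hypothetical.

```cpp
// Illustrative only: parse "mode:2,npu0:0-23,npu1:24-47" the way
// parseCPUAffinityConf above does (bare digits = mode, "npu<id>:<a>-<b>" or
// "npu<id>:<core>" = per-device core range). Sample values are made up.
#include <cstdio>
#include <map>
#include <sstream>
#include <string>
#include <utility>

int main()
{
    std::string conf = "mode:2,npu0:0-23,npu1:24-47";   // hypothetical config
    unsigned int mode = 0;
    std::map<int, std::pair<unsigned int, unsigned int>> ranges;

    std::istringstream stream(conf);
    std::string option;
    while (std::getline(stream, option, ',')) {
        size_t colon = option.find(':');
        if (colon == std::string::npos) {
            mode = static_cast<unsigned int>(std::stoi(option));               // bare "2"
        } else if (option.compare(0, colon, "mode") == 0) {
            mode = static_cast<unsigned int>(std::stoi(option.substr(colon + 1)));
        } else if (option.rfind("npu", 0) == 0) {
            int dev = std::stoi(option.substr(3, colon - 3));                  // device id
            std::string value = option.substr(colon + 1);
            size_t dash = value.find('-');
            unsigned int start = static_cast<unsigned int>(std::stoi(value.substr(0, dash)));
            unsigned int end = (dash == std::string::npos)
                ? start                                                        // single core
                : static_cast<unsigned int>(std::stoi(value.substr(dash + 1)));
            ranges[dev] = { start, end };
        }
    }

    std::printf("mode=%u\n", mode);
    for (const auto &kv : ranges) {
        std::printf("npu%d: [%u,%u]\n", kv.first, kv.second.first, kv.second.second);
    }
    return 0;
}
```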
+ local_thread = type; + if (type == ThreadType::OTHER_THREAD || type == ThreadType::MAIN_THREAD) { + return; + } + if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { + ASCEND_LOGW("Set thread name to %s failed!", threadTypeToNameMap.at(type).c_str()); + } +} + +std::string getAffinityMapAsString(c10::DeviceIndex device_id, const ThreadCoreMap &threadCoreMap) +{ + std::ostringstream oss; + for (auto thread_type : threadTypeList) { + oss << threadTypeToNameMap.at(thread_type) << " : [" + << threadCoreMap.at(thread_type).start << "," + << threadCoreMap.at(thread_type).end << "]"; + if (thread_type != OTHER_THREAD) { + oss << "; "; + } else { + oss << "."; } + } + return oss.str(); +} + +ThreadCoreMap getCpuAffinityMap(c10::DeviceIndex device_id, const std::vector &device_ranges) +{ + ThreadCoreMap threadCoreMap; + CoreIdRange range = device_ranges[device_id]; + unsigned int core_nums = range.end - range.start + 1; + if (core_nums < threadTypeList.size()) { + ASCEND_LOGW("Device %d available core numbers (%d) are insufficient for all %zu thread types and will bind available cores to all threads.", + device_id, core_nums, threadTypeList.size()); + for (auto thread_type : threadTypeList) { + threadCoreMap[thread_type] = range; + } + return threadCoreMap; + } - mode = 0; - int device_nums = device_count_ensure_non_zero(); - ranges.clear(); - ranges.resize(device_nums); - - // init - for (int i = 0; i < device_nums; ++i) { - ranges[i] = GetCPUDefaultRange(i); + CoreId now = range.start; + for (auto thread_type : threadTypeList) { + if (thread_type != ThreadType::OTHER_THREAD) { + threadCoreMap[thread_type] = CoreIdRange{now, now}; + } else { + threadCoreMap[ThreadType::OTHER_THREAD] = CoreIdRange{now, range.end}; } + now++; + } - std::string inputStr(input); - std::istringstream stream(inputStr); - std::string option; + ASCEND_LOGD("Device %d thread affinity map: %s", device_id, getAffinityMapAsString(device_id, threadCoreMap).c_str()); + return threadCoreMap; +} - // Handle cases where only `mode` is provided, or `mode:` without value - if (isAllDigits(inputStr)) { - mode = static_cast(std::stoi(inputStr)); - return; // Return directly, `mode` has already been processed - } +bool setThreadAffinityImpl(CoreIdRange core_range) +{ + cpu_set_t mask; + CPU_ZERO(&mask); + for (auto i = core_range.start; i <= core_range.end; i++) { + CPU_SET(i, &mask); + } + if (!pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask)) { + return true; + } else { + return false; + } +} - // Parse each option - while (std::getline(stream, option, ',')) { - // Split `option` based on colon - size_t colonPos = option.find(':'); - if (colonPos != std::string::npos) { - std::string key = option.substr(0, colonPos); - std::string value = option.substr(colonPos + 1); - - // Process `mode` - if (key == "mode") { - if (isAllDigits(value)) { - mode = static_cast(std::stoi(value)); - } else { - ASCEND_LOGW("mode is %s, should be all digits", value.c_str()); - } - } else if (key.rfind("npu", 0) == 0) { - // Handle NPU core binding range - if (isAllDigits(key.substr(3))) { - int device_id = std::stoi(key.substr(3)); // Parse NPU device ID - if (device_id < device_nums) { - size_t dashPos = value.find('-'); - if (dashPos != std::string::npos) { - std::string startStr = value.substr(0, dashPos); - std::string endStr = value.substr(dashPos + 1); - if (isAllDigits(startStr) && isAllDigits(endStr)) { - coreId start = static_cast(std::stoi(startStr)); - coreId end = static_cast(std::stoi(endStr)); - 
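setThreadAffinityImpl above is a thin wrapper over the standard Linux affinity call. For readers unfamiliar with that API, here is a minimal stand-alone sketch of the same pattern (pin the calling thread to an inclusive core range); the 0-3 range is arbitrary and only for demonstration.

```cpp
// Minimal Linux example of the pattern used by setThreadAffinityImpl above:
// build a cpu_set_t covering an inclusive core range and bind the calling
// thread to it. The range 0-3 is an arbitrary example.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE  // for CPU_* macros and pthread_setaffinity_np
#endif
#include <pthread.h>
#include <sched.h>
#include <cstdio>

int main()
{
    unsigned int start = 0;
    unsigned int end = 3;

    cpu_set_t mask;
    CPU_ZERO(&mask);
    for (unsigned int i = start; i <= end; ++i) {
        CPU_SET(i, &mask);
    }

    // Returns 0 on success, an error number otherwise (it does not set errno).
    int rc = pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
    std::printf("pthread_setaffinity_np returned %d\n", rc);
    return rc == 0 ? 0 : 1;
}
```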
ranges[device_id] = {start, end}; - } else { - ASCEND_LOGW("core range is %s-%s, should be all digits", startStr.c_str(), endStr.c_str()); - } - } else { - if (isAllDigits(value)) { - coreId singleCore = static_cast(std::stoi(value)); - ranges[device_id] = {singleCore, singleCore}; - } else { - ASCEND_LOGW("core range is string : %s, should be all digits", value.c_str()); - } - } - } - } - } - } else if (isAllDigits(option)) { - // If no colon and the value is a number, use it directly as `mode` - mode = static_cast(std::stoi(option)); - } - } +void SetThreadAffinity(c10::DeviceIndex device_id) +{ + if (!needToSetThreadAffinity() || local_thread == ThreadType::USER_THREAD) { + return; } - aclError SetThreadAffinity(c10::DeviceIndex device_id, ThreadType current_thread_type) - { - if (has_set_affinity) { - ASCEND_LOGW("Thread affinity is already set."); - return ACL_ERROR_NONE; - } - uint32_t bind_conf; - std::vector ranges; - parseCPUAffinityConf(bind_conf, ranges); - printCoreRanges(ranges, bind_conf); - - // bind_conf=1, bind cores averagely based on device_id - if (bind_conf == 1) { - return SetThreadAffinity(ranges[device_id], pthread_self()); - } else if (bind_conf == 2) { - auto thread_core_map = GetCpuAffinityMap(device_id); - // Bind the main thread only when the dispatch phase begins (i.e., when ThreadType::backwardThread is set) - if (current_thread_type == ThreadType::backwardThread) - SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); - return SetThreadAffinity(thread_core_map.at(current_thread_type), pthread_self()); - } else { - ASCEND_LOGD("Thread affinity setting is disabled."); + CoreIdRange core_range; + if (cpu_affinity_mode == 1) { + core_range = device_ranges[device_id]; + } else { + if (device_thread_core_maps.find(device_id) == device_thread_core_maps.end()) { + device_thread_core_maps.emplace(device_id, getCpuAffinityMap(device_id, device_ranges)); } - return ACL_ERROR_NONE; + core_range = device_thread_core_maps.at(device_id).at(local_thread); } - void SetBackwardThreadName(c10::DeviceIndex device_id) - { - static thread_local bool seted = false; - if (!seted) { - seted = true; - if (syscall(SYS_gettid) != getpid()) { - SetThreadName(ThreadType::backwardThread); - SetThreadAffinity(device_id); - } - } + if (setThreadAffinityImpl(core_range)) { + ASCEND_LOGD("Device %d set %s affinity to %d-%d success.", + device_id, threadTypeToNameMap.at(local_thread).c_str(), core_range.start, core_range.end); + } else { + ASCEND_LOGE("Device %d set %s affinity to %d-%d failed.", + device_id, threadTypeToNameMap.at(local_thread).c_str(), core_range.start, core_range.end); } +} - void SetThreadName(ThreadType type) - { - // Ensure this is called at the start of the thread's execution to avoid frequent triggering of this function. 
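The prctl-based naming used by SetThreadType above (and by the removed SetThreadName here) relies on the kernel's per-thread comm field, which is limited to 15 characters plus the terminating NUL; that limit is why names such as "hccl_watchdog_t" are kept that short. A stand-alone sketch of the naming round-trip, with a made-up thread name:

```cpp
// Stand-alone sketch of the prctl thread naming used above. Linux truncates
// the name to 15 characters plus NUL. "demo_thread" is a hypothetical name.
#include <sys/prctl.h>
#include <cstdio>

int main()
{
    const char *name = "demo_thread";
    if (prctl(PR_SET_NAME, name) != 0) {
        std::perror("PR_SET_NAME");
        return 1;
    }

    char readback[16] = {0};                       // 15 chars + NUL
    if (prctl(PR_GET_NAME, readback) != 0) {
        std::perror("PR_GET_NAME");
        return 1;
    }
    std::printf("current thread name: %s\n", readback);
    return 0;
}
```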
- if (prctl(PR_SET_NAME, threadTypeToNameMap.at(type).c_str()) != 0) { - ASCEND_LOGW("set thread name failed!"); - } +void SetThreadAffinity(ThreadType type) +{ + if (!needToSetThreadAffinity()) { + return; + } + int device_index; + NPU_CHECK_ERROR_WITHOUT_UCE(GetDevice(&device_index)); + c10::DeviceIndex device = static_cast(device_index); + local_thread = type; + SetThreadAffinity(device); +} + +void SetThreadAffinity(int core_start, int core_end) +{ + if (!needToSetThreadAffinity()) { + return; } -} \ No newline at end of file + static int core_nums = sysconf(_SC_NPROCESSORS_ONLN); + CoreIdRange core_range; + core_range.start = static_cast(std::min(core_start, core_nums)); + core_range.end = static_cast(std::min(core_end, core_nums)); + local_thread = ThreadType::USER_THREAD; + + if (setThreadAffinityImpl(core_range)) { + ASCEND_LOGD("Set thread affinity to user-defined range %d-%d success.", core_range.start, core_range.end); + } else { + ASCEND_LOGE("Set thread affinity to user-defined range %d-%d failed.", core_range.start, core_range.end); + } +} + +bool IsOpDispatch() +{ + return is_op_dispatch && (local_thread == ThreadType::MAIN_THREAD); +} + +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.h b/torch_npu/csrc/core/npu/NPUAffinityController.h index f2e78b69b68c689835f55733396f18445c2a11ed..669e3bff68cbc139694d079c9714d6ee44516f45 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.h +++ b/torch_npu/csrc/core/npu/NPUAffinityController.h @@ -1,35 +1,38 @@ #pragma once -#include "torch_npu/csrc/core/npu/npu_log.h" +#include namespace c10_npu { - typedef unsigned int coreId; - - struct coreIdRange { - coreId start; - coreId end; - }; - - enum ThreadType { - unknownThread = 0, // Mostly refers to threads in PyTorch's motorized sleep thread pool, which are not considered in PTA. - mainThread = 1, // 1st performance hotspot, responsible for operator dispatching during the forward phase. - backwardThread = 2, // 2nd performance hotspot, responsible for operator dispatching during the backward phase. - aclThread = 3, // 3rd performance hotspot in PTA, responsible for handling the task queue. - releaseThread = 4, // Thread responsible for resource release. - hcclCommWatchdogThread = 5 // Thread responsible for HCCL communication monitoring. - }; - - aclError SetThreadAffinity(c10::DeviceIndex device); - aclError SetThreadAffinity(c10::DeviceIndex device, ThreadType current_thread_type); - void SetThreadName(ThreadType type); - - // The main thread of PTA, which is also the main thread of PyTorch, handles multiple phases of tasks - // (e.g., first parallel checkpoint data loading, then transitioning to forward training). - // Each phase may require different thread affinity settings. Therefore, we record the thread's TID - // to adjust its affinity later as needed. - void GetAffinityInfo(); - - // Set backwardThread Name Once - void SetBackwardThreadName(c10::DeviceIndex device_id); - -} \ No newline at end of file +using CoreId = unsigned int; +struct CoreIdRange { + CoreId start; + CoreId end; +}; + +enum ThreadType { + MAIN_THREAD = 0, // 1st performance hotspot, responsible for operator dispatching. + ACL_THREAD = 1, // 2rd performance hotspot in PTA, responsible for handling the task queue. + RELEASE_THREAD = 2, // Thread responsible for resource release. + WATCHDOG_THREAD = 3, // Thread responsible for HCCL communication monitoring. 
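Taken together, the reworked controller defined above exposes a small set of entry points: SetThreadType to name/tag the calling thread, SetThreadAffinity by device or by thread type, and a user-defined core range. The following is a usage sketch only, assuming a build that links against torch_npu and that an NPU device is available; it is not code from this PR.

```cpp
// Usage sketch only (assumes the torch_npu headers/library from this PR and
// an available NPU device). A worker thread declares its role first, then
// binds itself according to the parsed affinity configuration.
#include <thread>
#include "torch_npu/csrc/core/npu/NPUAffinityController.h"

void ReleaseWorker()
{
    // Name the thread and record its type before doing any work.
    c10_npu::SetThreadType(c10_npu::ThreadType::RELEASE_THREAD);
    // Bind to the cores chosen for this thread type on the current device
    // (a no-op when the affinity option is disabled).
    c10_npu::SetThreadAffinity(c10_npu::ThreadType::RELEASE_THREAD);
    // ... release work ...
}

int main()
{
    std::thread t(ReleaseWorker);
    // A user thread may instead request an explicit core range, which is
    // clamped to the detected core count and tagged as USER_THREAD.
    c10_npu::SetThreadAffinity(0, 3);  // example range
    t.join();
    return 0;
}
```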
+ OTHER_THREAD = 4, // Mostly refers to threads in PyTorch's motorized sleep thread pool, which + // are not considered in PTA. + USER_THREAD = 5, // Thread responsible for user. +}; + +static thread_local bool is_op_dispatch = false; + +inline void SetOpDispatch() +{ + if (!is_op_dispatch) { + is_op_dispatch = true; + } +} + +bool IsOpDispatch(); + +void SetThreadType(ThreadType type); +void SetThreadAffinity(c10::DeviceIndex device); +void SetThreadAffinity(ThreadType type); +void SetThreadAffinity(int core_start, int core_end); + +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 03788c3a5e6c9b175c681947212c1e67ac40f771..5cd100ca40a8705b18feb2d7cd616fb71bb37519 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -51,7 +51,6 @@ std::string format_size(uint64_t size) namespace c10_npu { namespace NPUCachingAllocator { - C10_DEFINE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); // @@ -86,61 +85,59 @@ C10_DEFINE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); namespace { using stream_set = ska::flat_hash_set; -constexpr size_t kMinBlockSize = 512; // all sizes are rounded to at least 512 bytes -constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB -constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks -constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks -constexpr size_t kLargeBufferForHccl = 134217728; // "large for hccl" allocations may be packed in 128 MiB blocks -constexpr size_t kExtraLargeBuffer = 1073741824; // "extra large" allocations may be packed in 1 GB blocks -constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer -constexpr size_t kRoundLarge = 2097152; // round up large allocs to 2 MiB -constexpr size_t kAlignRoundLarge = 16384; // round up large allocs to 16 KB -constexpr size_t kSmallPoolVirAddrSize = 2147483648; // 2 GB +constexpr size_t kMinBlockSize = 512; // all sizes are rounded to at least 512 bytes +constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB +constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks +constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks +constexpr size_t kLargeBufferForHccl = 134217728; // "large for hccl" allocations may be packed in 128 MiB blocks +constexpr size_t kExtraLargeBuffer = 1073741824; // "extra large" allocations may be packed in 1 GB blocks +constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer +constexpr size_t kRoundLarge = 2097152; // round up large allocs to 2 MiB +constexpr size_t kAlignRoundLarge = 16384; // round up large allocs to 16 KB +constexpr size_t kSmallPoolVirAddrSize = 2147483648; // 2 GB constexpr size_t kLargePoolVirAddrSize = 10737418240; // 10 GB -const std::string kMinCannVersion = "8.1.RC1"; // minimum cann version which supports 1g mem 8.1.RC1 -const std::string kMinDriverVersion = "25.0.RC1"; // minimum driver version which supports 1g mem 25.0.RC1 -const std::string kCannModule = "CANN"; // cann module name -const std::string kDriverModule = "DRIVER"; // driver module name +const std::string kMinCannVersion = "8.1.RC1"; // minimum cann version which supports 1g mem 8.1.RC1 +const std::string 
kMinDriverVersion = "25.0.RC1"; // minimum driver version which supports 1g mem 25.0.RC1 +const std::string kCannModule = "CANN"; // cann module name using StatTypes = std::array(StatType::NUM_TYPES)>; -void update_stat(Stat& stat, int64_t amount) { - stat.current += amount; - stat.peak = std::max(stat.current, stat.peak); - if (amount > 0) { - stat.allocated += amount; - } - if (amount < 0) { - stat.freed += -amount; - } +void update_stat(Stat &stat, int64_t amount) +{ + stat.current += amount; + stat.peak = std::max(stat.current, stat.peak); + if (amount > 0) { + stat.allocated += amount; + } + if (amount < 0) { + stat.freed += -amount; + } } -void reset_accumulated_stat(Stat& stat) { - stat.allocated = 0; - stat.freed = 0; +void reset_accumulated_stat(Stat &stat) +{ + stat.allocated = 0; + stat.freed = 0; } -void reset_peak_stat(Stat& stat) { - stat.peak = stat.current; +void reset_peak_stat(Stat &stat) +{ + stat.peak = stat.current; } -template -void for_each_selected_stat_type(const StatTypes& stat_types, Func f) { - for (const auto stat_type : c10::irange(stat_types.size())) { - if (stat_types[stat_type]) { - f(stat_type); +template void for_each_selected_stat_type(const StatTypes &stat_types, Func f) +{ + for (const auto stat_type : c10::irange(stat_types.size())) { + if (stat_types[stat_type]) { + f(stat_type); + } } - } } -void update_stat_array( - StatArray& stat_array, - int64_t amount, - const StatTypes& stat_types) { - for_each_selected_stat_type( - stat_types, [&stat_array, amount](size_t stat_type) { - update_stat(stat_array[stat_type], amount); - }); +void update_stat_array(StatArray &stat_array, int64_t amount, const StatTypes &stat_types) +{ + for_each_selected_stat_type(stat_types, + [&stat_array, amount](size_t stat_type) { update_stat(stat_array[stat_type], amount); }); } bool IsMallocPage1GMem(bool is_small_pool) @@ -152,19 +149,19 @@ bool IsMallocPage1GMem(bool is_small_pool) if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " - "Using the HUGE_MEM memory page allocation method may result in performance degradation. " - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " - "but the current driver version does not support this feature. " - "Please upgrade the CANN package version."); + "Using the HUGE_MEM memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "but the current driver version does not support this feature. " + "Please upgrade the CANN package version."); return false; } - if (!IsGteCANNVersion(kMinDriverVersion, kDriverModule)) { + if (!IsGteDriverVersion(kMinDriverVersion)) { TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " - "Using the HUGE_MEM memory page allocation method may result in performance degradation. " - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " - "but the current driver version does not support this feature. " - "Please upgrade the CANN package version 1-2."); + "Using the HUGE_MEM memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "but the current driver version does not support this feature. 
" + "Please upgrade the HDK(driver) package version."); return false; } return true; @@ -175,46 +172,47 @@ bool IsMallocPage1GMem(bool is_small_pool) struct Block; struct PrivatePool; -using Comparison = bool (*)(const Block*, const Block*); -static bool BlockComparatorSize(const Block* a, const Block* b); -static bool BlockComparatorAddress(const Block* a, const Block* b); +using Comparison = bool (*)(const Block *, const Block *); +static bool BlockComparatorSize(const Block *a, const Block *b); +static bool BlockComparatorAddress(const Block *a, const Block *b); -struct BlockPool{ - std::set blocks; - std::set unmapped; +struct BlockPool { + std::set blocks; + std::set unmapped; const bool is_small; - PrivatePool* owner_PrivatePool; + PrivatePool *owner_PrivatePool; - BlockPool(bool small, PrivatePool* private_pool = nullptr) + BlockPool(bool small, PrivatePool *private_pool = nullptr) : blocks(BlockComparatorSize), unmapped(BlockComparatorAddress), is_small(small), - owner_PrivatePool(private_pool) {} + owner_PrivatePool(private_pool) + {} }; struct ExpandableSegment; struct Block { - int device; // npu - aclrtStream stream; // allocation stream + int device; // npu + aclrtStream stream; // allocation stream stream_set stream_uses; // streams on which the block was used - size_t size; // block size in bytes - size_t requested_size; // memory originally requested - BlockPool* pool; // owning memory pool - void* ptr; // memory address - bool allocated; // in-use flag - bool mapped{true}; // is the virtual address range this Block references - // backed by physical pages. Always true when - // expandable_segment_ is null. When false - // This Block will be aligned to the segment size - // of its expandable_segment_. - Block* prev; // prev block if split from a larger allocation - Block* next; // next block if split from a larger allocation - int event_count; // number of outstanding NPU events - int gc_count{0}; // counter for prioritizing older / less useful blocks for - // garbage collection - ExpandableSegment* expandable_segment_{nullptr}; - bool is_safe{true}; + size_t size; // block size in bytes + size_t requested_size; // memory originally requested + BlockPool *pool; // owning memory pool + void *ptr; // memory address + bool allocated; // in-use flag + bool mapped{ true }; // is the virtual address range this Block references + // backed by physical pages. Always true when + // expandable_segment_ is null. When false + // This Block will be aligned to the segment size + // of its expandable_segment_. + Block *prev; // prev block if split from a larger allocation + Block *next; // next block if split from a larger allocation + int event_count; // number of outstanding NPU events + int gc_count{ 0 }; // counter for prioritizing older / less useful blocks for + // garbage collection + ExpandableSegment *expandable_segment_{ nullptr }; + bool is_safe{ true }; std::shared_ptr context_when_allocated; // only set for the first block in the segment (when prev == null) // this records the frame information when aclMalloc was called @@ -222,7 +220,7 @@ struct Block { // memory out from our cache. 
std::shared_ptr context_when_segment_allocated; - Block(int device, aclrtStream stream, size_t size, BlockPool* pool, void* ptr) + Block(int device, aclrtStream stream, size_t size, BlockPool *pool, void *ptr) : device(device), stream(stream), stream_uses(), @@ -234,7 +232,8 @@ struct Block { prev(nullptr), next(nullptr), event_count(0), - gc_count(0) {} + gc_count(0) + {} // constructor for search key Block(int device, aclrtStream stream, size_t size) @@ -249,14 +248,15 @@ struct Block { prev(nullptr), next(nullptr), event_count(0), - gc_count(0) {} + gc_count(0) + {} bool is_split() const { return (prev != nullptr) || (next != nullptr); } - void splice(Block* before, Block* after) + void splice(Block *before, Block *after) { if (before) { TORCH_INTERNAL_ASSERT(before->next == after, PTA_ERROR(ErrCode::PTR)); @@ -272,9 +272,9 @@ struct Block { }; struct SegmentRange { - char* ptr; - size_t size; - SegmentRange(void* p, size_t s) : ptr(static_cast(p)), size(s) {} + char *ptr; + size_t size; + SegmentRange(void *p, size_t s) : ptr(static_cast(p)), size(s) {} }; @@ -355,199 +355,194 @@ bevhavior for allocator tensors that need to be used cross-process. */ struct ExpandableSegment { - ExpandableSegment( - int device, - aclrtStream stream, - size_t size) - : device_(device), - stream_(stream), - max_handles_(0), - // 2MB for small pool, 20MB for large pool - segment_size_(size) { - size_t device_free; - size_t device_total; - NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); - // we allocate enough address space for 1 1/8 the total memory on the NPU. - // This allows for some cases where we have to unmap pages earlier in the - // segment to put them at the end. - max_handles_ = numSegments(device_total + device_total / 8); - if (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable()) { - // prevent HCCL reserve virtual address out of memory - // small pool reserve 2G - // non-default stream large pool 10G - auto default_stream = c10_npu::getDefaultNPUStream().stream(false); - if (kSmallBuffer == segment_size_) { - max_handles_ = numSegments(kSmallPoolVirAddrSize); - } else if (default_stream != stream) { - max_handles_ = numSegments(kLargePoolVirAddrSize); - } - } - - NPU_CHECK_ERROR(c10_npu::acl::AclrtReserveMemAddress( - &ptr_, segment_size_ * max_handles_, 0, NULL, 1, getHcclComm())); - ASCEND_LOGD( - "NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, segment_size=%zu", - segment_size_ * max_handles_, segment_size_); - } - // begin must be aligned to segment_size_. - // returns the actual range mapped, which may be - // greater than requested if size is not aligned to segment_size_. - // return size of 0 indicates OOM - SegmentRange map(SegmentRange range) { - auto begin = segmentLeft(range.ptr); - auto end = segmentRight(range.ptr + range.size); - TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr, PTA_ERROR(ErrCode::PTR)); - if (begin == end) { - return rangeFromHandles(begin, end); - } - while (end > handles_.size()) { - handles_.emplace_back(c10::nullopt); - } - for (auto i : c10::irange(begin, end)) { - TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); - aclrtDrvMemHandle handle = nullptr; - aclrtPhysicalMemProp prop = {}; - prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; - prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; - prop.memAttr = (segment_size_ == kExtraLargeBuffer) ? 
ACL_HBM_MEM_HUGE1G : ACL_HBM_MEM_HUGE; - prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device_; - prop.reserve = 0; - auto status = - c10_npu::acl::AclrtMallocPhysical(&handle, segment_size_, &prop, 0); - if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { - for (auto j : c10::irange(begin, i)) { - auto h = handles_.at(j).value(); - handles_.at(j) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); - } - trimHandles(); - return rangeFromHandles(begin, begin); - } - NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - handles_.at(i) = handle; - } - for (auto i : c10::irange(begin, end)) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem( - (char*)ptr_ + i * segment_size_, - segment_size_, - 0, - handles_.at(i).value(), - 0, - getHcclComm())); - } - ASCEND_LOGD( - "NPUCachingAllocator map: segment_size=%zu", segment_size_); - return rangeFromHandles(begin, end); - } - - // unmaps all the completely empty segment_size_ segments between - // [begin, begin + size), returns the offset where the range begin, - // and the actual size unmapped (multiple of segment_size_) - SegmentRange unmap(SegmentRange range) { - auto begin = segmentRight(range.ptr); - auto end = segmentLeft(range.ptr + range.size); - if (begin >= end) { - return SegmentRange{range.ptr, 0}; - } - unmapHandles(begin, end); - return rangeFromHandles(begin, end); - } - - char* ptr() const { - return (char*)ptr_; - } - - size_t size() const { - return max_handles_ * segment_size_; - } + ExpandableSegment(int device, aclrtStream stream, size_t size) + : device_(device), + stream_(stream), + max_handles_(0), + // 2MB for small pool, 20MB for large pool + segment_size_(size) + { + size_t device_free; + size_t device_total; + NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); + // we allocate enough address space for 1 1/8 the total memory on the NPU. + // This allows for some cases where we have to unmap pages earlier in the + // segment to put them at the end. + max_handles_ = numSegments(device_total + device_total / 8); + if (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable()) { + // prevent HCCL reserve virtual address out of memory + // small pool reserve 2G + // non-default stream large pool 10G + auto default_stream = c10_npu::getDefaultNPUStream().stream(false); + if (kSmallBuffer == segment_size_) { + max_handles_ = numSegments(kSmallPoolVirAddrSize); + } else if (default_stream != stream) { + max_handles_ = numSegments(kLargePoolVirAddrSize); + } + } + + NPU_CHECK_ERROR( + c10_npu::acl::AclrtReserveMemAddress(&ptr_, segment_size_ * max_handles_, 0, nullptr, 1, getHcclComm())); + ASCEND_LOGD("NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, segment_size=%zu", + segment_size_ * max_handles_, segment_size_); + } + // begin must be aligned to segment_size_. + // returns the actual range mapped, which may be + // greater than requested if size is not aligned to segment_size_. 
+ // return size of 0 indicates OOM + SegmentRange map(SegmentRange range) + { + auto begin = segmentLeft(range.ptr); + auto end = segmentRight(range.ptr + range.size); + TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr, PTA_ERROR(ErrCode::PTR)); + if (begin == end) { + return rangeFromHandles(begin, end); + } + while (end > handles_.size()) { + handles_.emplace_back(c10::nullopt); + } + for (auto i : c10::irange(begin, end)) { + TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::INTERNAL)); + aclrtDrvMemHandle handle = nullptr; + aclrtPhysicalMemProp prop = {}; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = (segment_size_ == kExtraLargeBuffer) ? ACL_HBM_MEM_HUGE1G : ACL_HBM_MEM_HUGE; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = static_cast(device_); + prop.reserve = 0; + auto status = c10_npu::acl::AclrtMallocPhysical(&handle, segment_size_, &prop, 0); + if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { + for (auto j : c10::irange(begin, i)) { + auto h = handles_.at(j).value(); + handles_.at(j) = c10::nullopt; + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + } + trimHandles(); + return rangeFromHandles(begin, begin); + } + NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); + handles_.at(i) = handle; + } + for (auto i : c10::irange(begin, end)) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, + handles_.at(i).value(), 0, getHcclComm())); + } + ASCEND_LOGD("NPUCachingAllocator map: segment_size=%zu", segment_size_); + return rangeFromHandles(begin, end); + } + + // unmaps all the completely empty segment_size_ segments between + // [begin, begin + size), returns the offset where the range begin, + // and the actual size unmapped (multiple of segment_size_) + SegmentRange unmap(SegmentRange range) + { + auto begin = segmentRight(range.ptr); + auto end = segmentLeft(range.ptr + range.size); + if (begin >= end) { + return SegmentRange{ range.ptr, 0 }; + } + unmapHandles(begin, end); + return rangeFromHandles(begin, end); + } + + char *ptr() const + { + return (char *)ptr_; + } + + size_t size() const + { + return max_handles_ * segment_size_; + } void setHcclComm(std::shared_ptr hcclComm) { TORCH_INTERNAL_ASSERT(hcclComm, "hcclComm is null.", PTA_ERROR(ErrCode::INTERNAL)); hcclComm_ = hcclComm; HCCL_CHECK_ERROR(at_npu::hccl::HcclCommSetMemoryRangeFace(hcclComm_->getHcclComm(), ptr_, - segment_size_ * max_handles_, 0, 1)); + segment_size_ * max_handles_, 0, 1)); for (auto i : c10::irange(handles_.size())) { HCCL_CHECK_ERROR(at_npu::hccl::HcclCommActivateCommMemoryFace(hcclComm_->getHcclComm(), - (char*)ptr_ + i * segment_size_, - segment_size_, - 0, - handles_.at(i).value(), - 0)); - } - } - - ~ExpandableSegment() { - forEachAllocatedRange( - [&](size_t begin, size_t end) { unmapHandles(begin, end); }); - NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr_, getHcclComm())); - ASCEND_LOGD("NPUCachingAllocator free by AclrtReleaseMemAddress"); - } - - private: - void unmapHandles(size_t begin, size_t end) { - // note: unlike aclrtFree, MemUnmap and MemRelease do - // not appear to synchronize in all cases, so we have to wait for the - // stream to finish before this memory is truly free. 
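The map/unmap bookkeeping in ExpandableSegment reduces to floor/ceil division on segment_size_: numSegments rounds a byte count up, segmentLeft rounds an offset down, and segmentRight rounds it up, which is why map() can return a larger range than requested and unmap() a smaller one. A tiny numeric illustration (2 MiB segments, matching kSmallBuffer; the offsets are arbitrary):

```cpp
// Numeric illustration of the segment rounding used by ExpandableSegment:
// ceil-division for sizes / right edges, floor-division for left edges.
// The 2 MiB segment size matches kSmallBuffer; the offsets are arbitrary.
#include <cstddef>
#include <cstdio>

int main()
{
    const size_t segment_size = 2 * 1024 * 1024;  // 2 MiB
    auto num_segments = [&](size_t size) { return (size + segment_size - 1) / segment_size; };
    auto segment_left = [&](size_t offset) { return offset / segment_size; };
    auto segment_right = [&](size_t offset) { return num_segments(offset); };

    size_t requested = 5 * 1024 * 1024;           // 5 MiB request
    std::printf("5 MiB needs %zu segments (i.e. 6 MiB mapped)\n", num_segments(requested)); // 3

    size_t offset = 3 * 1024 * 1024;              // offset 3 MiB into the segment
    std::printf("offset 3 MiB: left segment %zu, right segment %zu\n",
                segment_left(offset), segment_right(offset));                               // 1, 2
    return 0;
}
```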
- - // cannot call c10::npu::stream_synchronize because - // it might grab the GIL which can lead to a deadlock - // Locking order must be GIL -> Allocator Lock - NPU_CHECK_ERROR(aclrtSynchronizeStream(stream_)); -#ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuStreamSynchronization( - reinterpret_cast(stream_)); + (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value(), 0)); + } + } + + ~ExpandableSegment() + { + forEachAllocatedRange([&](size_t begin, size_t end) { unmapHandles(begin, end); }); + NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr_, getHcclComm())); + ASCEND_LOGD("NPUCachingAllocator free by AclrtReleaseMemAddress"); } + +private: + void unmapHandles(size_t begin, size_t end) + { + // note: unlike aclrtFree, MemUnmap and MemRelease do + // not appear to synchronize in all cases, so we have to wait for the + // stream to finish before this memory is truly free. + + // cannot call c10::npu::stream_synchronize because + // it might grab the GIL which can lead to a deadlock + // Locking order must be GIL -> Allocator Lock + NPU_CHECK_ERROR(aclrtSynchronizeStream(stream_)); +#ifndef BUILD_LIBTORCH + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); + } #endif - for (auto i : c10::irange(begin, end)) { - aclrtDrvMemHandle h = handles_.at(i).value(); - handles_.at(i) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char*)ptr_ + segment_size_ * i, getHcclComm())); - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); - } - ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); - trimHandles(); - } - - void trimHandles() { - while (!handles_.empty() && !handles_.back()) { - handles_.pop_back(); - } - } - - void forEachAllocatedRange(std::function fn) { - auto start = 0; - for (auto i : c10::irange(handles_.size())) { - if (handles_.at(i) && (i == 0 || !handles_.at(i - 1))) { - start = i; - } - if (handles_.at(i) && (i + 1 == handles_.size() || !handles_.at(i + 1))) { - fn(start, i + 1); - } - } - } - - size_t numSegments(size_t size) { - return (size + segment_size_ - 1) / segment_size_; - } - - size_t segmentLeft(char* p) { - auto size = p - ptr(); - return size / segment_size_; - } - - size_t segmentRight(char* p) { - auto size = p - ptr(); - return numSegments(size); - } - - SegmentRange rangeFromHandles(size_t begin, size_t end) { - return SegmentRange( - ptr() + segment_size_ * begin, segment_size_ * (end - begin)); - } + for (auto i : c10::irange(begin, end)) { + aclrtDrvMemHandle h = handles_.at(i).value(); + handles_.at(i) = c10::nullopt; + NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + } + ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); + trimHandles(); + } + + void trimHandles() + { + while (!handles_.empty() && !handles_.back()) { + handles_.pop_back(); + } + } + + void forEachAllocatedRange(std::function fn) + { + size_t start = 0; + for (auto i : c10::irange(handles_.size())) { + if (handles_.at(i) && (i == 0 || !handles_.at(i - 1))) { + start = i; + } + if (handles_.at(i) && (i + 1 == handles_.size() || !handles_.at(i + 1))) { + fn(start, i + 1); + } + } + } + + size_t numSegments(size_t size) + { + return (size + segment_size_ - 1) 
/ segment_size_; + } + + size_t segmentLeft(char *p) + { + auto size = p - ptr(); + return static_cast(size) / segment_size_; + } + + size_t segmentRight(char *p) + { + auto size = p - ptr(); + return numSegments(size); + } + + SegmentRange rangeFromHandles(size_t begin, size_t end) + { + return SegmentRange(ptr() + segment_size_ * begin, segment_size_ * (end - begin)); + } HcclComm getHcclComm() { @@ -557,162 +552,156 @@ struct ExpandableSegment { return nullptr; } - int device_; - aclrtStream stream_; - void* ptr_{}; - size_t max_handles_; - size_t segment_size_; - std::vector> handles_; - std::shared_ptr hcclComm_; + int device_; + aclrtStream stream_; + void *ptr_{}; + size_t max_handles_; + size_t segment_size_; + std::vector> handles_; + std::shared_ptr hcclComm_; }; // BlockState, BlockPoolState, and PrivatePoolState contain the information // needed to reconstruct a private pool to a previous state. See note // [Checkpointing PrivatePoolState] struct BlockState { - c10::DeviceIndex device = 0; - aclrtStream stream = nullptr; - stream_set stream_uses = {}; - size_t size = 0; - void* ptr = nullptr; - bool allocated = false; - int64_t gc_count_base = 0; - // maintain invariant that event_count == 0 ; - // history will be left alone in checkpoint - - explicit BlockState(Block* block); + c10::DeviceIndex device = 0; + aclrtStream stream = nullptr; + stream_set stream_uses = {}; + size_t size = 0; + void *ptr = nullptr; + bool allocated = false; + int64_t gc_count_base = 0; + // maintain invariant that event_count == 0 ; + // history will be left alone in checkpoint + + explicit BlockState(Block *block); }; struct SegmentState { - std::vector blocks; - bool is_small = false; + std::vector blocks; + bool is_small = false; - explicit SegmentState(Block* head); + explicit SegmentState(Block *head); }; struct PrivatePoolState : AllocatorState { - // omitting use_count, and aclMalloc_count as they remain the same - MempoolId_t owner_id = {0, 0}; + // omitting use_count, and aclMalloc_count as they remain the same + MempoolId_t owner_id = { 0, 0 }; - std::vector segments; + std::vector segments; - PrivatePoolState( - MempoolId_t pool_id, - const std::vector& private_pool_head_blocks); + PrivatePoolState(MempoolId_t pool_id, const std::vector &private_pool_head_blocks); }; struct RestoreResult { - std::vector allocations_freed; - std::vector allocations_created; + std::vector allocations_freed; + std::vector allocations_created; }; -static bool BlockComparatorSize(const Block* a, const Block* b) { - if (a->stream != b->stream) { - return reinterpret_cast(a->stream) < - reinterpret_cast(b->stream); - } - if (a->size != b->size) { - return a->size < b->size; - } - return reinterpret_cast(a->ptr) < - reinterpret_cast(b->ptr); +static bool BlockComparatorSize(const Block *a, const Block *b) +{ + if (a->stream != b->stream) { + return reinterpret_cast(a->stream) < reinterpret_cast(b->stream); + } + if (a->size != b->size) { + return a->size < b->size; + } + return reinterpret_cast(a->ptr) < reinterpret_cast(b->ptr); } -static bool BlockComparatorAddress(const Block* a, const Block* b) { - if (a->stream != b->stream) { - return reinterpret_cast(a->stream) < - reinterpret_cast(b->stream); - } - return reinterpret_cast(a->ptr) < - reinterpret_cast(b->ptr); +static bool BlockComparatorAddress(const Block *a, const Block *b) +{ + if (a->stream != b->stream) { + return reinterpret_cast(a->stream) < reinterpret_cast(b->stream); + } + return reinterpret_cast(a->ptr) < reinterpret_cast(b->ptr); } struct 
AllocParams { - AllocParams( - int device, - size_t size, - aclrtStream stream, - BlockPool* pool, - size_t alloc_size, - DeviceStats& stats) - : search_key(device, stream, size), - pool(pool), - alloc_size(alloc_size), - block(nullptr), - err(ACL_ERROR_NONE) {} - - int device() const { return search_key.device; } - aclrtStream stream() const { return search_key.stream; } - size_t size() const { return search_key.size; } - - Block search_key; - BlockPool* pool; - size_t alloc_size; - Block* block; - StatTypes stat_types = {false}; - aclError err; + AllocParams(int device, size_t size, aclrtStream stream, BlockPool *pool, size_t alloc_size, DeviceStats &stats) + : search_key(device, stream, size), pool(pool), alloc_size(alloc_size), block(nullptr), err(ACL_ERROR_NONE) + {} + + int device() const + { + return search_key.device; + } + aclrtStream stream() const + { + return search_key.stream; + } + size_t size() const + { + return search_key.size; + } + + Block search_key; + BlockPool *pool; + size_t alloc_size; + Block *block; + StatTypes stat_types = { false }; + aclError err; }; class EventPool { public: - using Event = std::unique_ptr>; - // Explicit device count - EventPool() : pools_(c10_npu::device_count()) {} - - Event get(int device) { - TORCH_INTERNAL_ASSERT(0 <= device, PTA_ERROR(ErrCode::INTERNAL)); - TORCH_INTERNAL_ASSERT(device < static_cast(pools_.size()), PTA_ERROR(ErrCode::INTERNAL)); - auto& pool = pools_[device]; - auto destructor = [&pool](c10_npu::NPUEvent* event) { - std::lock_guard g(pool.mutex_); - pool.event_pool_.push_back(std::unique_ptr(event)); - }; + using Event = std::unique_ptr>; + // Explicit device count + EventPool() : pools_(c10_npu::device_count()) {} - // Try to acquire an event from the per-device pool. + Event get(int device) { - std::lock_guard g(pool.mutex_); - if (!pool.event_pool_.empty()) { - auto* event = pool.event_pool_.back().release(); - pool.event_pool_.pop_back(); - return Event(event, destructor); - } + TORCH_INTERNAL_ASSERT(0 <= device, PTA_ERROR(ErrCode::INTERNAL)); + TORCH_INTERNAL_ASSERT(device < static_cast(pools_.size()), PTA_ERROR(ErrCode::INTERNAL)); + auto &pool = pools_[device]; + auto destructor = [&pool](c10_npu::NPUEvent *event) { + std::lock_guard g(pool.mutex_); + pool.event_pool_.push_back(std::unique_ptr(event)); + }; + + // Try to acquire an event from the per-device pool. + { + std::lock_guard g(pool.mutex_); + if (!pool.event_pool_.empty()) { + auto *event = pool.event_pool_.back().release(); + pool.event_pool_.pop_back(); + return Event(event, destructor); + } + } + // otherwise, allocate a new event that will be returned to the pool on + // destruction. + return Event(std::make_unique(ACL_EVENT_CAPTURE_STREAM_PROGRESS).release(), destructor); } - // otherwise, allocate a new event that will be returned to the pool on - // destruction. 
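EventPool::get() above hands out a unique_ptr whose custom deleter pushes the event back into the per-device free list instead of destroying it. The same pattern, reduced to a plain struct so it compiles without ACL, looks like the sketch below; Resource and its id field are purely illustrative stand-ins for c10_npu::NPUEvent.

```cpp
// Generic sketch of the EventPool pattern above: a unique_ptr with a custom
// deleter that recycles the object into a mutex-protected free list.
#include <cstdio>
#include <functional>
#include <memory>
#include <mutex>
#include <vector>

struct Resource {
    int id;
};

class ResourcePool {
public:
    using Handle = std::unique_ptr<Resource, std::function<void(Resource *)>>;

    Handle get()
    {
        auto recycle = [this](Resource *r) {
            std::lock_guard<std::mutex> g(mutex_);
            free_list_.emplace_back(r);            // return to pool, do not delete
        };
        {
            std::lock_guard<std::mutex> g(mutex_);
            if (!free_list_.empty()) {
                Resource *r = free_list_.back().release();
                free_list_.pop_back();
                return Handle(r, recycle);         // reuse a pooled object
            }
        }
        return Handle(new Resource{next_id_++}, recycle);  // pool empty: allocate
    }

private:
    std::mutex mutex_;
    std::vector<std::unique_ptr<Resource>> free_list_;
    int next_id_ = 0;
};

int main()
{
    ResourcePool pool;
    int first_id;
    {
        auto h = pool.get();
        first_id = h->id;
    }                                              // handle recycled here
    auto h2 = pool.get();                          // reuses the recycled object
    std::printf("reused: %d\n", h2->id == first_id);
    return 0;
}
```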
- return Event( - std::make_unique(ACL_EVENT_CAPTURE_STREAM_PROGRESS).release(), - destructor); - } - void empty_cache() { - for (auto& pool : pools_) { - std::lock_guard g(pool.mutex_); - pool.event_pool_.clear(); + void empty_cache() + { + for (auto &pool : pools_) { + std::lock_guard g(pool.mutex_); + pool.event_pool_.clear(); + } } - } private: - struct PerDevicePool { - alignas(64) std::mutex mutex_; - std::vector> event_pool_; - }; - std::vector pools_; + struct PerDevicePool { + alignas(64) std::mutex mutex_; + std::vector> event_pool_; + }; + std::vector pools_; }; // NPU graphs helper struct PrivatePool { - PrivatePool() - : large_blocks(false, this), - small_blocks(true, this) {} - PrivatePool(const PrivatePool&) = delete; - PrivatePool(PrivatePool&&) = delete; - PrivatePool& operator=(const PrivatePool&) = delete; + PrivatePool() : large_blocks(false, this), small_blocks(true, this) {} + PrivatePool(const PrivatePool &) = delete; + PrivatePool(PrivatePool &&) = delete; + PrivatePool &operator = (const PrivatePool &) = delete; // Number of live graphs using this pool - int use_count{1}; + int use_count{ 1 }; // Number of unfreed npuMallocs made for this pool. When use_count and // npuMalloc_count drop to zero, we can delete this PrivatePool from // graph_pools. - int npuMalloc_count{0}; + int npuMalloc_count{ 0 }; // Instead of maintaining private BlockPools here, I could stuff all blocks // (private or no) into the top-level large_blocks and small_blocks, and // distinguish private blocks by adding a "pool id" check above the stream @@ -722,232 +711,204 @@ struct PrivatePool { BlockPool small_blocks; }; -BlockState::BlockState(Block* block) +BlockState::BlockState(Block *block) : device(block->device), stream(block->stream), stream_uses(block->stream_uses), size(block->size), ptr(block->ptr), allocated(block->allocated), - gc_count_base(block->gc_count) { - TORCH_CHECK( - block->event_count == 0, - "Events should have synchronized when checkpointing block", PTA_ERROR(ErrCode::INTERNAL)); + gc_count_base(block->gc_count) +{ + TORCH_CHECK(block->event_count == 0, "Events should have synchronized when checkpointing block", + PTA_ERROR(ErrCode::INTERNAL)); }; -SegmentState::SegmentState(Block* head) +SegmentState::SegmentState(Block *head) { TORCH_INTERNAL_ASSERT(head->prev == nullptr && head->pool != nullptr); is_small = head->pool->is_small; - for (Block* curr = head; curr != nullptr; curr = curr->next) { + for (Block *curr = head; curr != nullptr; curr = curr->next) { blocks.emplace_back(curr); } } -PrivatePoolState::PrivatePoolState( - MempoolId_t pool_id, - const std::vector& private_pool_head_blocks) +PrivatePoolState::PrivatePoolState(MempoolId_t pool_id, const std::vector &private_pool_head_blocks) : owner_id(std::move(pool_id)) { - for (Block* head : private_pool_head_blocks) { + for (Block *head : private_pool_head_blocks) { segments.emplace_back(head); } } struct MempoolIdHash { - std::size_t operator()(const MempoolId_t& mempool_id) const noexcept + std::size_t operator () (const MempoolId_t &mempool_id) const noexcept { return mempool_id.first != 0 ? 
mempool_id.first : mempool_id.second; } }; - } // namespace class CachingAllocatorConfig { - public: - - static size_t max_split_size() { - return instance().m_max_split_size; - } - - static double garbage_collection_threshold() { - return instance().m_garbage_collection_threshold; - } - - static bool expandable_segments() { - return instance().m_expandable_segments; - } - - static size_t base_addr_aligned_size() - { - return instance().m_base_addr_aligned_size; - } - - static bool page_size_1g_enable() - { - return instance().m_page_size_1g; - } - - static CachingAllocatorConfig &instance() { - static CachingAllocatorConfig *s_instance = ([]() { - auto inst = new CachingAllocatorConfig(); - const char* env = getenv("PYTORCH_NPU_ALLOC_CONF"); - inst->parseArgs(env); - return inst; - })(); - return *s_instance; - } - - void parseArgs(const char* env); - - private: - - size_t m_max_split_size; - double m_garbage_collection_threshold; - bool m_expandable_segments; - bool set_expandable_segments_flag = false; - size_t m_base_addr_aligned_size = kAlignRoundLarge; - bool m_page_size_1g = false; // 新增1G页配置标志 - - CachingAllocatorConfig() - : m_max_split_size(std::numeric_limits::max()), - m_garbage_collection_threshold(0), - m_expandable_segments(false), - m_base_addr_aligned_size(kAlignRoundLarge) - { - } +public: + static size_t max_split_size() + { + return instance().m_max_split_size; + } - void lexArgs(const char* env, std::vector& config); - void consumeToken( - const std::vector& config, - size_t i, - const char c); - size_t parseMaxSplitSize(const std::vector& config, size_t i); - size_t parseGarbageCollectionThreshold( - const std::vector& config, - size_t i); - size_t parseExpandableSegments( - const std::vector& config, - size_t i); - size_t parseAddrAlignSize( - const std::vector& config, - size_t i); - size_t parsePageSize( - const std::vector& config, - size_t i); -}; + static double garbage_collection_threshold() + { + return instance().m_garbage_collection_threshold; + } + + static bool expandable_segments() + { + return instance().m_expandable_segments; + } + + static size_t base_addr_aligned_size() + { + return instance().m_base_addr_aligned_size; + } + + static bool page_size_1g_enable() + { + return instance().m_page_size_1g; + } + + static CachingAllocatorConfig &instance() + { + static CachingAllocatorConfig *s_instance = ([]() { + auto inst = new CachingAllocatorConfig(); + const char *env = getenv("PYTORCH_NPU_ALLOC_CONF"); + inst->parseArgs(env); + return inst; + })(); + return *s_instance; + } -void CachingAllocatorConfig::lexArgs( - const char* env, - std::vector& config) { - std::vector buf; + void parseArgs(const char *env); - size_t env_length = strlen(env); - for (size_t i = 0; i < env_length; i++) { - if (env[i] == ',' || env[i] == ':' || env[i] == '[' || env[i] == ']') { - if (!buf.empty()) { +private: + size_t m_max_split_size; + double m_garbage_collection_threshold; + bool m_expandable_segments; + bool set_expandable_segments_flag = false; + size_t m_base_addr_aligned_size = kAlignRoundLarge; + bool m_page_size_1g = false; // 新增1G页配置标志 + + CachingAllocatorConfig() + : m_max_split_size(std::numeric_limits::max()), + m_garbage_collection_threshold(0), + m_expandable_segments(false), + m_base_addr_aligned_size(kAlignRoundLarge) + {} + + void lexArgs(const char *env, std::vector &config); + void consumeToken(const std::vector &config, size_t i, const char c); + size_t parseMaxSplitSize(const std::vector &config, size_t i); + size_t 
parseGarbageCollectionThreshold(const std::vector &config, size_t i); + size_t parseExpandableSegments(const std::vector &config, size_t i); + size_t parseAddrAlignSize(const std::vector &config, size_t i); + size_t parsePageSize(const std::vector &config, size_t i); +}; + +void CachingAllocatorConfig::lexArgs(const char *env, std::vector &config) +{ + std::vector buf; + + size_t env_length = strlen(env); + for (size_t i = 0; i < env_length; i++) { + if (env[i] == ',' || env[i] == ':' || env[i] == '[' || env[i] == ']') { + if (!buf.empty()) { + config.emplace_back(buf.begin(), buf.end()); + buf.clear(); + } + config.emplace_back(1, env[i]); + } else if (env[i] != ' ') { + buf.emplace_back(static_cast(env[i])); + } + } + if (!buf.empty()) { config.emplace_back(buf.begin(), buf.end()); - buf.clear(); - } - config.emplace_back(1, env[i]); - } else if (env[i] != ' ') { - buf.emplace_back(static_cast(env[i])); - } - } - if (!buf.empty()) { - config.emplace_back(buf.begin(), buf.end()); - } + } } -void CachingAllocatorConfig::consumeToken( - const std::vector& config, - size_t i, - const char c) { - TORCH_CHECK( - i < config.size() && config[i].compare(std::string(1, c)) == 0, - "Error parsing CachingAllocator settings, expected ", c, PTA_ERROR(ErrCode::PARAM)); +void CachingAllocatorConfig::consumeToken(const std::vector &config, size_t i, const char c) +{ + TORCH_CHECK(i < config.size() && config[i].compare(std::string(1, c)) == 0, + "Error parsing CachingAllocator settings, expected ", c, PTA_ERROR(ErrCode::PARAM)); } -size_t CachingAllocatorConfig::parseMaxSplitSize( - const std::vector& config, - size_t i) { - consumeToken(config, ++i, ':'); - if (++i < config.size()) { - size_t val1 = static_cast(stoi(config[i])); - TORCH_CHECK( - val1 > kLargeBuffer / (1024 * 1024), - "CachingAllocator option max_split_size_mb too small, must be > ", - kLargeBuffer / (1024 * 1024), OPS_ERROR(ErrCode::VALUE)); - val1 = std::max(val1, kLargeBuffer / (1024 * 1024)); - val1 = std::min(val1, (std::numeric_limits::max() / (1024 * 1024))); - m_max_split_size = val1 * 1024 * 1024; - } else { - TORCH_CHECK(false, "Error, expecting max_split_size_mb value", OPS_ERROR(ErrCode::VALUE)); - } - return i; +size_t CachingAllocatorConfig::parseMaxSplitSize(const std::vector &config, size_t i) +{ + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + size_t val1 = static_cast(stoi(config[i])); + TORCH_CHECK(val1 > kLargeBuffer / (1024 * 1024), + "CachingAllocator option max_split_size_mb too small, must be > ", kLargeBuffer / (1024 * 1024), + OPS_ERROR(ErrCode::VALUE)); + val1 = std::max(val1, kLargeBuffer / (1024 * 1024)); + val1 = std::min(val1, (std::numeric_limits::max() / (1024 * 1024))); + m_max_split_size = val1 * 1024 * 1024; + } else { + TORCH_CHECK(false, "Error, expecting max_split_size_mb value", OPS_ERROR(ErrCode::VALUE)); + } + return i; } -size_t CachingAllocatorConfig::parseGarbageCollectionThreshold( - const std::vector& config, - size_t i) { - consumeToken(config, ++i, ':'); - if (++i < config.size()) { - double val1 = stod(config[i]); - TORCH_CHECK( - val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", OPS_ERROR(ErrCode::VALUE)); - TORCH_CHECK( - val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", OPS_ERROR(ErrCode::VALUE)); - m_garbage_collection_threshold = val1; - } else { - TORCH_CHECK( - false, "Error, expecting garbage_collection_threshold value", OPS_ERROR(ErrCode::VALUE)); - } - return i; +size_t CachingAllocatorConfig::parseGarbageCollectionThreshold(const 
std::vector &config, size_t i) +{ + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + double val1 = stod(config[i]); + TORCH_CHECK(val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", OPS_ERROR(ErrCode::VALUE)); + TORCH_CHECK(val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", OPS_ERROR(ErrCode::VALUE)); + m_garbage_collection_threshold = val1; + } else { + TORCH_CHECK(false, "Error, expecting garbage_collection_threshold value", OPS_ERROR(ErrCode::VALUE)); + } + return i; } -size_t CachingAllocatorConfig::parseExpandableSegments( - const std::vector& config, - size_t i) { - consumeToken(config, ++i, ':'); - if (++i < config.size()) { - TORCH_CHECK( - i < config.size() && (config[i] == "True" || config[i] == "False"), - "Expected a single True/False argument for expandable_segments", OPS_ERROR(ErrCode::PARAM)); - m_expandable_segments = (config[i] == "True"); - if (m_expandable_segments) { - void* ptr = nullptr; - auto status = c10_npu::acl::AclrtReserveMemAddress(&ptr, 512, 0, NULL, 1); - if (status == ACL_ERROR_NONE) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr)); - } else { - NPU_CHECK_SUPPORTED_OR_ERROR(status, "aclrtReserveMemAddress"); - TORCH_NPU_WARN_ONCE("expandable_segments setting failure, now change to `False`."); - m_expandable_segments = false; +size_t CachingAllocatorConfig::parseExpandableSegments(const std::vector &config, size_t i) +{ + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK(i < config.size() && (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for expandable_segments", OPS_ERROR(ErrCode::PARAM)); + m_expandable_segments = (config[i] == "True"); + if (m_expandable_segments) { + void *ptr = nullptr; + auto status = c10_npu::acl::AclrtReserveMemAddress(&ptr, 512, 0, nullptr, 1); + if (status == ACL_ERROR_NONE) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr)); + } else { + NPU_CHECK_ERROR(status, "aclrtReserveMemAddress"); + TORCH_NPU_WARN_ONCE("expandable_segments setting failure, now change to `False`."); + m_expandable_segments = false; + } } + } else { + TORCH_CHECK(false, "Error, expecting expandable_segments value", OPS_ERROR(ErrCode::VALUE)); } - } else { - TORCH_CHECK( - false, "Error, expecting expandable_segments value", OPS_ERROR(ErrCode::VALUE)); - } - return i; + return i; } -size_t CachingAllocatorConfig::parseAddrAlignSize( - const std::vector& config, - size_t i) +size_t CachingAllocatorConfig::parseAddrAlignSize(const std::vector &config, size_t i) { consumeToken(config, ++i, ':'); if (++i < config.size()) { size_t val = static_cast(stoi(config[i])); TORCH_CHECK(config[i].length() == std::to_string(val).length(), - "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", - OPS_ERROR(ErrCode::VALUE)); + "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", + OPS_ERROR(ErrCode::VALUE)); TORCH_CHECK(val >= 0, "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", - OPS_ERROR(ErrCode::VALUE)); + OPS_ERROR(ErrCode::VALUE)); TORCH_CHECK(val <= kAlignRoundLarge / 1024, - "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", - OPS_ERROR(ErrCode::VALUE)); + "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", + OPS_ERROR(ErrCode::VALUE)); m_base_addr_aligned_size = val * 1024; } else { TORCH_CHECK(false, "Error, expecting base_addr_aligned_kb value", OPS_ERROR(ErrCode::VALUE)); @@ 
-955,64 +916,65 @@ size_t CachingAllocatorConfig::parseAddrAlignSize( return i; } -size_t CachingAllocatorConfig::parsePageSize(const std::vector& config, size_t i) +size_t CachingAllocatorConfig::parsePageSize(const std::vector &config, size_t i) { TORCH_CHECK(i + 2 < config.size(), "page_size requires format 'page_size:1g'", OPS_ERROR(ErrCode::VALUE)); - TORCH_CHECK(config[i+1] == ":", "Expected ':' after page_size", OPS_ERROR(ErrCode::VALUE)); + TORCH_CHECK(config[i + 1] == ":", "Expected ':' after page_size", OPS_ERROR(ErrCode::VALUE)); - if (config[i+2] == "1g") { + if (config[i + 2] == "1g") { m_page_size_1g = true; } else { - TORCH_CHECK(false, "Unsupported page_size value: ", config[i+2], OPS_ERROR(ErrCode::VALUE)); + TORCH_CHECK(false, "Unsupported page_size value: ", config[i + 2], OPS_ERROR(ErrCode::VALUE)); } return i + 2; // 返回最后处理的索引位置 } -void CachingAllocatorConfig::parseArgs(const char* env) { - // If empty, set the default values - m_max_split_size = std::numeric_limits::max(); - m_garbage_collection_threshold = 0; - - if (env == nullptr) { - return; - } - - std::vector config; - lexArgs(env, config); - - for (size_t i = 0; i < config.size(); i++) { - if (config[i].compare("max_split_size_mb") == 0) { - i = parseMaxSplitSize(config, i); - } else if (config[i].compare("garbage_collection_threshold") == 0) { - i = parseGarbageCollectionThreshold(config, i); - } else if (config[i] == "expandable_segments") { - set_expandable_segments_flag = true; - i = parseExpandableSegments(config, i); - } else if (config[i] == "base_addr_aligned_kb") { - i = parseAddrAlignSize(config, i); - } else if (config[i] == "page_size") { - i = parsePageSize(config, i); - } else { - TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i], OPS_ERROR(ErrCode::PARAM)); - } - - if (i + 1 < config.size()) { - consumeToken(config, ++i, ','); - } - } - - if (m_expandable_segments) { - if (set_expandable_segments_flag) { - TORCH_CHECK(m_max_split_size == std::numeric_limits::max() && m_garbage_collection_threshold == 0, - "`max_split_size_mb` or `garbage_collection_threshold`, cannot be enabled with " - "`expandable_segments`, please set `expandable_segments` to `False`.", - OPS_ERROR(ErrCode::PARAM)); - } else if (m_max_split_size != std::numeric_limits::max() || m_garbage_collection_threshold != 0) { - m_expandable_segments = false; - TORCH_NPU_WARN_ONCE("`max_split_size_mb` or `garbage_collection_threshold` is enabled, and the " - "`expandable_segments` is changed to `False` by default."); - } - } +void CachingAllocatorConfig::parseArgs(const char *env) +{ + // If empty, set the default values + m_max_split_size = std::numeric_limits::max(); + m_garbage_collection_threshold = 0; + + if (env == nullptr) { + return; + } + + std::vector config; + lexArgs(env, config); + + for (size_t i = 0; i < config.size(); i++) { + if (config[i].compare("max_split_size_mb") == 0) { + i = parseMaxSplitSize(config, i); + } else if (config[i].compare("garbage_collection_threshold") == 0) { + i = parseGarbageCollectionThreshold(config, i); + } else if (config[i] == "expandable_segments") { + set_expandable_segments_flag = true; + i = parseExpandableSegments(config, i); + } else if (config[i] == "base_addr_aligned_kb") { + i = parseAddrAlignSize(config, i); + } else if (config[i] == "page_size") { + i = parsePageSize(config, i); + } else { + TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i], OPS_ERROR(ErrCode::PARAM)); + } + + if (i + 1 < config.size()) { + consumeToken(config, ++i, ','); + 
} + } + + if (m_expandable_segments) { + if (set_expandable_segments_flag) { + TORCH_CHECK(m_max_split_size == std::numeric_limits::max() && m_garbage_collection_threshold == 0, + "`max_split_size_mb` or `garbage_collection_threshold`, cannot be enabled with " + "`expandable_segments`, please set `expandable_segments` to `False`.", + OPS_ERROR(ErrCode::PARAM)); + } else if (m_max_split_size != std::numeric_limits::max() || m_garbage_collection_threshold != 0) { + m_expandable_segments = false; + TORCH_NPU_WARN_ONCE("`max_split_size_mb` or `garbage_collection_threshold` is enabled, and the " + "`expandable_segments` is changed to `False` by default."); + } + } } bool checkConfigExpandableSegments() @@ -1025,756 +987,740 @@ bool isConfig1GPageSizeEnable() return CachingAllocatorConfig::page_size_1g_enable(); } -class DeviceCachingAllocator { - private: +// To prevent the deadlock situation, temporarily release the lock. +// +// Deadlock Scenario Description: +// +// 1. Main Thread: +// - Acquires the lock and performs sync to clear the taskqueue. +// - taskqueue wait a empty signal from the sub-thread. +// +// 2. Sub-thread: +// - Python function (tbe op compile) called in CANN may trigger GC that introduces a resource release operation. +// - The release operation (`free`) cannot acquire the same lock holded in main thread. +// - Unable to send a signal to the main thread. +class UnlockGuard { +public: + explicit UnlockGuard(std::unique_lock& lock) : lock_(lock) { lock_.unlock(); } + + ~UnlockGuard() { lock_.lock(); } + +private: + std::unique_lock& lock_; +}; - // lock around all operations - mutable std::recursive_mutex mutex; +class DeviceCachingAllocator { +private: + // lock around all operations + mutable std::recursive_mutex mutex; - // device statistics - DeviceStats stats; + // device statistics + DeviceStats stats; - // unallocated cached blocks larger than 1 MB - BlockPool large_blocks; + // unallocated cached blocks larger than 1 MB + BlockPool large_blocks; - // unallocated cached blocks 1 MB or smaller - BlockPool small_blocks; + // unallocated cached blocks 1 MB or smaller + BlockPool small_blocks; - // allocated or in use by a stream - ska::flat_hash_set active_blocks; + // allocated or in use by a stream + ska::flat_hash_set active_blocks; - // captures_underway tracks if we are diverting some - // allocations to a specific pool. - // Most of the time it's empty, in which case malloc can avoid calling - // aclrtStreamGetCaptureInfo in the hot path. - std::vector>> - captures_underway; + // captures_underway tracks if we are diverting some + // allocations to a specific pool. + // Most of the time it's empty, in which case malloc can avoid calling + // aclrtStreamGetCaptureInfo in the hot path. + std::vector>> captures_underway; - // See free() for this thing's purpose - std::vector needs_events_deferred_until_no_capture; + // See free() for this thing's purpose + std::vector needs_events_deferred_until_no_capture; - // outstanding acl events - ska::flat_hash_map< - c10_npu::NPUStream, - std::deque>> - npu_events; + // outstanding acl events + ska::flat_hash_map>> npu_events; - // record used memory. - size_t total_allocated_memory = 0; + // record used memory. + size_t total_allocated_memory = 0; - // record maximum allowed memory. - size_t allowed_memory_maximum = 0; + // record maximum allowed memory. 
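For reference, the option string consumed by `parseArgs` above comes from the `PYTORCH_NPU_ALLOC_CONF` environment variable as a comma-separated list of `key:value` pairs, which `lexArgs` first splits into tokens. Below is a minimal, standalone sketch of that lex-then-dispatch idea; it is deliberately simplified (no `[`/`]` handling, no `TORCH_CHECK`, placeholder names) and is illustrative only, not the allocator's actual code.

```cpp
// Minimal sketch of lexing "key:value,key:value" the way lexArgs/parseArgs do.
// Standalone and simplified; illustrative only.
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

static std::vector<std::string> lex(const char* env)
{
    std::vector<std::string> config;
    std::string buf;
    for (size_t i = 0; i < std::strlen(env); ++i) {
        if (env[i] == ',' || env[i] == ':') {
            if (!buf.empty()) {
                config.push_back(buf);
                buf.clear();
            }
            config.emplace_back(1, env[i]);   // keep the delimiter as its own token
        } else if (env[i] != ' ') {
            buf.push_back(env[i]);
        }
    }
    if (!buf.empty()) {
        config.push_back(buf);
    }
    return config;
}

int main()
{
    // Example value of PYTORCH_NPU_ALLOC_CONF
    const char* env = "max_split_size_mb:128,garbage_collection_threshold:0.8,expandable_segments:True";
    auto tokens = lex(env);
    // Tokens come out as: key ':' value ',' key ':' value ...
    for (size_t i = 0; i + 2 < tokens.size(); i += 4) {
        std::cout << tokens[i] << " = " << tokens[i + 2] << "\n";
    }
    return 0;
}
```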
+ size_t allowed_memory_maximum = 0; - // all live expandable segments - std::vector expandable_segments_; + // all live expandable segments + std::vector expandable_segments_; - bool set_fraction = false; + bool set_fraction = false; - bool record_history = false; + bool record_history = false; - std::atomic context_recorder_; - size_t alloc_trace_next = 0; - RecordContext record_context_ = RecordContext::NEVER; - size_t alloc_trace_max_entries_ = 1; - std::vector* - alloc_trace; // pointer because we need to intentionally leak this on - // deallocation it can hold references to Python state which - // will already be destroyed when we are in exit handlers + std::atomic context_recorder_; + size_t alloc_trace_next = 0; + RecordContext record_context_ = RecordContext::NEVER; + size_t alloc_trace_max_entries_ = 1; + std::vector *alloc_trace; // pointer because we need to intentionally leak this on + // deallocation it can hold references to Python state which + // will already be destroyed when we are in exit handlers - // XXX - maybe we should generalize and have multiple events - std::vector oom_observers_; + // XXX - maybe we should generalize and have multiple events + std::vector oom_observers_; std::shared_ptr hcclComm_; - // Private pools for NPU graphs - ska::flat_hash_map, MempoolIdHash> - graph_pools; - - // Pools no longer referenced by any graph. Their BlockPools are eligible for - // free_blocks. Can't be a vector or deque because we might erase entries in - // any order. Could be an std::list, but we don't care much, access and - // insert/erase are rare. - ska::flat_hash_map - graph_pools_freeable; - - // mapping from block to a stream_set, containing streams on which the block - // was used while npugraph capturing - std::unordered_map block_to_npugraph_stream_uses; - public: - - DeviceCachingAllocator() : - large_blocks(false), - small_blocks(true), - alloc_trace(new std::vector()) { - stats.max_split_size = static_cast(CachingAllocatorConfig::max_split_size()); - context_recorder_.store(nullptr); - } - - void recordHistory(bool enabled, CreateContextFn context_recorder, - size_t alloc_trace_max_entries, RecordContext when) - { - std::unique_lock lock(mutex); - TORCH_CHECK(when == RecordContext::NEVER || context_recorder, PTA_ERROR(ErrCode::INTERNAL)); - record_history = enabled; - context_recorder_.store(record_history ? context_recorder : nullptr); - alloc_trace_max_entries_ = std::max(size_t(1), alloc_trace_max_entries); - record_context_ = enabled ? 
when : RecordContext::NEVER; - alloc_trace_next = 0; - alloc_trace->clear(); - } - - bool isHistoryEnabled() { return record_history; } - - bool checkPoolLiveAllocations( - MempoolId_t mempool_id, - const std::unordered_set& expected_live_allocations) - { - std::unique_lock lock(mutex); - - PrivatePool* pool = nullptr; - auto pool_it = graph_pools.find(mempool_id); - TORCH_CHECK(pool_it != graph_pools.end(), "Could not find pool of id", PTA_ERROR(ErrCode::INTERNAL)); - pool = pool_it->second.get(); - - TORCH_INTERNAL_ASSERT(pool != nullptr, PTA_ERROR(ErrCode::PTR)); - size_t allocated_pool_blocks = 0; - - for (Block* b : active_blocks) { - TORCH_INTERNAL_ASSERT(b != nullptr); - TORCH_INTERNAL_ASSERT(b->pool != nullptr); - if (b->allocated && b->pool->owner_PrivatePool == pool) { - if (!expected_live_allocations.count(b->ptr)) { - return false; - } - allocated_pool_blocks += 1; - } - } - - return allocated_pool_blocks == expected_live_allocations.size(); - } - - void attachOutOfMemoryObserver(OutOfMemoryObserver observer) - { - oom_observers_.emplace_back(observer); - } - - bool checkUceInMemPool() - { - auto memUceInfo_ = c10_npu::get_mem_uce_info(); - auto info = memUceInfo_.info; - const auto all_blocks = get_all_blocks(); - bool any_found = false; - aclrtMemUceInfo temp_info[memUceInfo_.retSize]; - size_t temp_retsize = 0; - - for (int i = 0; i < memUceInfo_.retSize; ++i) { - void* addr = info[i].addr; - size_t length = info[i].len; - bool found = false; - - // Calculate the start and end address for info[i] - void* addr_end = static_cast(addr) + length - 1; - - // Iterate through all blocks and check if there's an overlap with addr - for (const Block* const head_block : all_blocks) { - void* block_start = head_block->ptr; - void* block_end = static_cast(head_block->ptr) + head_block->size - 1; - - // If there is an overlap, mark the block as unsafe - if (addr <= block_end && addr_end >= block_start) { - const_cast(head_block)->is_safe = false; - ASCEND_LOGI("Memory block with UCE fault error found in the NPUCachingAllocator and was marked as unsafe"); - found = true; - any_found = true; - // Set the unsafe flag only once - if (c10_npu::get_npu_data_unsafe_flag() == false) { - c10_npu::set_npu_data_unsafe_flag(true); - } - } - } - - if (found) { - // update memuceinfo - temp_info[temp_retsize++] = info[i]; - } - } - - std::memcpy(memUceInfo_.info, temp_info, temp_retsize * sizeof(aclrtMemUceInfo)); - memUceInfo_.retSize = temp_retsize; - - c10_npu::set_mem_uce_info(memUceInfo_); - if (!any_found) { - return false; - } - return true; - } - - void markAllBlockUnsafe() - { - for (auto& active_block : active_blocks) { - active_block->is_safe = false; - } - return; - } - - // Must be called outside of `mutex` or deadlocks are possible with Python - std::shared_ptr maybeGatherContext(RecordContext level) - { - if (record_context_ < level) { - return nullptr; - } - return context_recorder_.load()(); - } - - // All public methods (except the above) acquire the allocator mutex. - // Thus, do not call a public method from another public method. - - Block* malloc(int device, size_t orig_size, aclrtStream stream, uint8_t allocator_type = 0) - { - // done outside the lock because we don't know what locks the recorder needs - // to have... 
- auto context = maybeGatherContext(RecordContext::STATE); - - std::unique_lock lock(mutex); + // Private pools for NPU graphs + ska::flat_hash_map, MempoolIdHash> graph_pools; - if (device == -1) { - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - } + // Pools no longer referenced by any graph. Their BlockPools are eligible for + // free_blocks. Can't be a vector or deque because we might erase entries in + // any order. Could be an std::list, but we don't care much, access and + // insert/erase are rare. + ska::flat_hash_map graph_pools_freeable; - if (C10_LIKELY(captures_underway.empty())) { - // Processes end-of-life events for outstanding allocations used on - // multiple streams (checks if their NPU-side uses are complete and - // recycles their memory if so) - // - // Q. Why skip process_events if a capture might be underway? - // A. process_events involves npuEventQueries, illegal during NPU graph - // capture. - // Dumb simple solution: defer reclaiming these allocations until after - // capture. Cross-stream memory use is uncommon, so the deferral's - // effect on memory use during capture should be small. - process_events(context); - } - auto size = round_size(orig_size); - auto& pool = get_pool(size, stream); - - // 开环境变量 大池子放1G内存块 - const size_t alloc_size = IsMallocPage1GMem(pool.is_small) - ? kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) - : get_allocation_size(size); - AllocParams params(device, size, stream, &pool, alloc_size, stats); - params.stat_types = get_stat_types_for_pool(pool); - - // First, try to get a block from the existing pool. - bool block_found = - // Search pool - get_free_block(params) || - // Trigger callbacks and retry search - (trigger_free_memory_callbacks(params) && get_free_block(params)); - // Can't reuse an existing block; try to get a new one. - if (!block_found) { - // Do garbage collection if the flag is set. - if (C10_UNLIKELY(set_fraction && - CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { - garbage_collect_cached_blocks(context); - } - // Attempt allocate - block_found = alloc_block(params, false, context, lock) || - // Free enough available cached blocks to satisfy alloc and retry - // alloc. - (release_available_cached_blocks(params, context) && - alloc_block(params, false, context, lock)); - } - - if (!block_found && C10_LIKELY(captures_underway.empty())) { - ASCEND_LOGE( - "Get a block from the existing pool failed. Try to free cached blocks and reallocate. This error log " - "can be ignored."); - // Free all non-split cached blocks and retry alloc. 
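When the 1G-page option is enabled (the Chinese comment above reads roughly "with the env option set, the large pool hands out 1 GiB blocks"), `alloc_size` is rounded up to a whole number of `kExtraLargeBuffer`-sized pages with the usual ceil-division idiom. A quick sketch of that arithmetic, assuming for illustration that `kExtraLargeBuffer` is 1 GiB:

```cpp
// Ceil-division round-up used when the 1 GiB page pool is enabled:
// alloc_size = kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer).
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t kExtraLargeBuffer = 1ULL << 30;       // assumption: 1 GiB granularity
    const std::size_t size = 1536ULL * 1024 * 1024;         // a 1.5 GiB request
    const std::size_t alloc_size =
        kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer);
    std::cout << alloc_size / (1024 * 1024) << " MiB\n";    // 2048: rounded up to two 1 GiB pages
    return 0;
}
```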
- c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, true); - block_found = (release_cached_blocks(true, context) && alloc_block(params, true, context, lock)); - } - - if (!block_found) { - if (params.err == ACL_ERROR_RT_MEMORY_ALLOCATION) { - size_t device_free; - size_t device_total; - NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); + // mapping from block to a stream_set, containing streams on which the block + // was used while npugraph capturing + std::unordered_map block_to_npugraph_stream_uses; - std::string allowed_info; - if (set_fraction) { - allowed_info = format_size(allowed_memory_maximum) + " allowed; "; - } - stats.num_ooms += 1; - - record_trace( - TraceEntry::OOM, - device_free, - params.size(), - params.stream(), - params.device(), - std::move(context)); - auto observers_local = oom_observers_; - - // Make sure we do not have the device lock before calling our - // observers which might need hold the GIL - // It is safe to release at this point because will no longer - // be reading any allocator state. - - lock.unlock(); - - for (const auto& obs : observers_local) { - obs(device, - alloc_size, - set_fraction ? allowed_memory_maximum : device_total, - device_free); - } - // "total capacity": total global memory on NPU - // "allowed": memory is allowed to use, which set by fraction. - // "already allocated": memory allocated by the program using the - // caching allocator - // "free": free memory as reported by the NPU API - // "cached": memory held by the allocator but not used by the program - // - // The "allocated" amount does not include memory allocated outside - // of the caching allocator, such as memory allocated by other programs - // or memory held by the driver. - // - // The sum of "allocated" + "free" + "cached" may be less than the - // total capacity due to memory held by the driver and usage by other - // programs. - // - // Note that at this point free_cached_blocks has already returned all - // possible "cached" memory to the driver. The only remaining "cached" - // memory is split from a larger block that is partially in-use. - AT_ERROR( - "NPU out of memory. 
Tried to allocate ", - format_size(alloc_size), - " (NPU ", device, "; ", - format_size(device_total), - " total capacity; ", - format_size(stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current), - " already allocated; ", - format_size(stats.active_bytes[static_cast(StatType::AGGREGATE)].current), - " current active; ", - format_size(device_free), - " free; ", - allowed_info, - format_size(stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current), - " reserved in total by PyTorch)", - " If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation."); - } else { - NPU_CHECK_ERROR(params.err); - } - } - - int64_t ori_block_ptr = int64_t(params.block->ptr); - size_t align_round = CachingAllocatorConfig::base_addr_aligned_size(); - if (params.size() >= kRoundLarge && CachingAllocatorConfig::expandable_segments() && align_round != 0 && - ori_block_ptr % align_round != 0) { - char* align_ptr = reinterpret_cast((ori_block_ptr + align_round) - (ori_block_ptr % align_round)); - size_t offset_size = align_ptr - (char*)params.block->ptr; - if (offset_size + params.size() <= params.block->size) { - auto size = params.block->size; - Block* remaining = params.block; - - Block* block = new Block(params.device(), params.stream(), size - offset_size, params.pool, align_ptr); - block->expandable_segment_ = remaining->expandable_segment_; - block->next = remaining->next; - if (block->next) { - block->next->prev = block; - } - block->prev = remaining; - - remaining->next = block; - remaining->size = offset_size; - params.pool->blocks.insert(remaining); - - params.block = block; - } - } - - bool split_remainder = should_split(params.block, params.size()); - return alloc_found_block( - std::move(params), orig_size, std::move(context), split_remainder, allocator_type); - } - - Block* alloc_found_block( - AllocParams params, - size_t orig_size, - std::shared_ptr context, - bool split_remainder, - uint8_t allocator_type) - { - auto size = params.size(); - auto device = params.device(); - auto pool = params.pool; - auto stream = params.stream(); - - TORCH_INTERNAL_ASSERT( - params.err == ACL_ERROR_NONE && params.block != nullptr && - params.block->ptr != nullptr, PTA_ERROR(ErrCode::PTR)); - Block* block = params.block; - Block* remaining = nullptr; - - const bool already_split = block->is_split(); - if (split_remainder) { - remaining = block; - - block = new Block(device, stream, size, pool, block->ptr); - block->expandable_segment_ = remaining->expandable_segment_; - block->prev = remaining->prev; - if (block->prev) { - block->prev->next = block; - } - block->next = remaining; - - remaining->prev = block; - remaining->ptr = static_cast(remaining->ptr) + size; - remaining->size -= size; - pool->blocks.insert(remaining); - - if (already_split && !block->expandable_segment_) { - // An already-split inactive block is being shrunk by size bytes. - update_stat_array( - stats.inactive_split_bytes, - -static_cast(block->size), - params.stat_types); - } else if (!block->expandable_segment_) { - // A new split inactive block is being created from a previously unsplit - // block, size remaining->size bytes. 
- for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { - update_stat( - stats.inactive_split_bytes[stat_type], - static_cast(remaining->size)); - update_stat(stats.inactive_split[stat_type], 1); - }); - } - } else if (already_split && !block->expandable_segment_) { - // An already-split block is becoming active - for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { - update_stat( - stats.inactive_split_bytes[stat_type], - -static_cast(block->size)); - update_stat(stats.inactive_split[stat_type], -1); - }); - } - - block->allocated = true; - block->requested_size = orig_size; - if (block->is_safe == false) { - ASCEND_LOGI("Unsafe memory block is passively refreshed by releasing and mallocing memory again"); - } - block->is_safe = true; - - block->context_when_allocated = std::move(context); - record_trace( - TraceEntry::ALLOC, - int64_t(block->ptr), - orig_size, - block->stream, - block->device, - block->context_when_allocated); - - active_blocks.insert(block); - - for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { - update_stat(stats.allocation[stat_type], 1); - update_stat( - stats.allocated_bytes[stat_type], - static_cast(block->size)); - update_stat(stats.active[stat_type], 1); - update_stat( - stats.active_bytes[stat_type], - static_cast(block->size)); - update_stat( - stats.requested_bytes[stat_type], - static_cast(block->requested_size)); - }); - - if (block->size >= CachingAllocatorConfig::max_split_size()) - update_stat(stats.oversize_allocations, 1); - - ASCEND_LOGD("PTA CachingAllocator malloc: malloc = %zu, cached = %lu, allocated = %lu", - block->size, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); +public: + DeviceCachingAllocator() : large_blocks(false), small_blocks(true), alloc_trace(new std::vector()) + { + stats.max_split_size = static_cast(CachingAllocatorConfig::max_split_size()); + context_recorder_.store(nullptr); + } -#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{block->device, block->ptr, block->size}; - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), - allocator_type, - reinterpret_cast(block->ptr), - block->size, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); -#endif + void recordHistory(bool enabled, CreateContextFn context_recorder, size_t alloc_trace_max_entries, + RecordContext when) + { + std::unique_lock lock(mutex); + TORCH_CHECK(when == RecordContext::NEVER || context_recorder, PTA_ERROR(ErrCode::INTERNAL)); + record_history = enabled; + context_recorder_.store(record_history ? context_recorder : nullptr); + alloc_trace_max_entries_ = std::max(size_t(1), alloc_trace_max_entries); + record_context_ = enabled ? 
when : RecordContext::NEVER; + alloc_trace_next = 0; + alloc_trace->clear(); + } - return block; -} + bool isHistoryEnabled() + { + return record_history; + } + + bool checkPoolLiveAllocations(MempoolId_t mempool_id, const std::unordered_set &expected_live_allocations) + { + std::unique_lock lock(mutex); + PrivatePool *pool = nullptr; + auto pool_it = graph_pools.find(mempool_id); + TORCH_CHECK(pool_it != graph_pools.end(), "Could not find pool of id", PTA_ERROR(ErrCode::INTERNAL)); + pool = pool_it->second.get(); - void free(Block* block, uint8_t allocator_type = 0) - { - std::shared_ptr context = - maybeGatherContext(RecordContext::ALL); - std::lock_guard lock(mutex); + TORCH_INTERNAL_ASSERT(pool != nullptr, PTA_ERROR(ErrCode::PTR)); + size_t allocated_pool_blocks = 0; - block->allocated = false; + for (Block *b : active_blocks) { + TORCH_INTERNAL_ASSERT(b != nullptr); + TORCH_INTERNAL_ASSERT(b->pool != nullptr); + if (b->allocated && b->pool->owner_PrivatePool == pool) { + if (!expected_live_allocations.count(b->ptr)) { + return false; + } + allocated_pool_blocks += 1; + } + } - // following logic might modifying underlaying Block, causing the size - // changed. We store ahead for reporting - auto orig_block_ptr = block->ptr; - auto orig_block_size = block->size; + return allocated_pool_blocks == expected_live_allocations.size(); + } - StatTypes stat_types = get_stat_types_for_pool(*(block->pool)); - for_each_selected_stat_type(stat_types, [&](size_t stat_type) { - update_stat(stats.allocation[stat_type], -1); - update_stat(stats.allocated_bytes[stat_type], -block->size); - }); - - record_trace( - TraceEntry::FREE_REQUESTED, - int64_t(block->ptr), - block->requested_size, - block->stream, - block->device, - context ? context : block->context_when_allocated); - - if (block->size >= CachingAllocatorConfig::max_split_size()) - update_stat(stats.oversize_allocations, -1); - - if (!block->stream_uses.empty() && c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { - if (C10_UNLIKELY(!captures_underway.empty())) { - // It's forbidden to npuEventQuery an event recorded during NPU graph - // capture. 
We conservatively defer recording end-of-life events until - // the next call to process_events() (which won't happen until no - // captures are underway) - needs_events_deferred_until_no_capture.push_back(block); - } else { - insert_events(block); - } - } else { - free_block(block, context, allocator_type); + void attachOutOfMemoryObserver(OutOfMemoryObserver observer) + { + oom_observers_.emplace_back(observer); } - ASCEND_LOGD("PTA CachingAllocator free: free = %zu, cached = %lu, allocated = %lu", - orig_block_size, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); -#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), - allocator_type, - reinterpret_cast(orig_block_ptr), - -orig_block_size, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); -#endif - } - - void* getBaseAllocation(Block* block, size_t* outSize) { - std::lock_guard lock(mutex); - while (block->prev) { - block = block->prev; - } - void* basePtr = block->ptr; - if (outSize) { - size_t size = 0; - while (block) { - size += block->size; - block = block->next; - } - *outSize = size; - } - return basePtr; - } - - void recordStream(Block* block, c10_npu::NPUStream stream) { - std::lock_guard lock(mutex); - block->stream_uses.insert(stream); - if (C10_UNLIKELY(!captures_underway.empty())) { - block_to_npugraph_stream_uses[block].insert(stream); + bool checkUceInMemPool() + { + auto memUceInfo_ = c10_npu::get_mem_uce_info(); + auto info = memUceInfo_.info; + const auto all_blocks = get_all_blocks(); + bool any_found = false; + aclrtMemUceInfo temp_info[memUceInfo_.retSize]; + size_t temp_retsize = 0; + + for (size_t i = 0; i < memUceInfo_.retSize; ++i) { + void *addr = info[i].addr; + size_t length = info[i].len; + bool found = false; + + // Calculate the start and end address for info[i] + void *addr_end = static_cast(addr) + length - 1; + + // Iterate through all blocks and check if there's an overlap with addr + for (const Block * const head_block : all_blocks) { + void *block_start = head_block->ptr; + void *block_end = static_cast(head_block->ptr) + head_block->size - 1; + + // If there is an overlap, mark the block as unsafe + if (addr <= block_end && addr_end >= block_start) { + const_cast(head_block)->is_safe = false; + ASCEND_LOGI( + "Memory block with UCE fault error found in the NPUCachingAllocator and was marked as unsafe"); + found = true; + any_found = true; + // Set the unsafe flag only once + if (c10_npu::get_npu_data_unsafe_flag() == false) { + c10_npu::set_npu_data_unsafe_flag(true); + } + } + } + + if (found) { + // update memuceinfo + temp_info[temp_retsize++] = info[i]; + } + } + + std::memcpy(memUceInfo_.info, temp_info, temp_retsize * sizeof(aclrtMemUceInfo)); + memUceInfo_.retSize = temp_retsize; + + c10_npu::set_mem_uce_info(memUceInfo_); + if (!any_found) { + return false; } + return true; } - void eraseStream(Block* block, c10_npu::NPUStream stream) { - 
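The UCE scan in `checkUceInMemPool` above marks a cached block as unsafe when the faulting address range intersects the block's byte range. The test `addr <= block_end && addr_end >= block_start` is the standard overlap check for two ranges with inclusive end addresses; a small, self-contained sketch with illustrative types (not the allocator's `Block`/`aclrtMemUceInfo`):

```cpp
// Sketch of the range-overlap test used when matching UCE fault ranges against
// cached blocks. Types and values here are illustrative only.
#include <cstddef>
#include <cstdint>
#include <iostream>

struct Range {
    std::uintptr_t start;  // first byte
    std::size_t len;       // length in bytes
};

// Two byte ranges overlap iff each one starts no later than the other one ends
// (inclusive end addresses, mirroring `addr <= block_end && addr_end >= block_start`).
static bool overlaps(const Range& a, const Range& b)
{
    std::uintptr_t a_end = a.start + a.len - 1;
    std::uintptr_t b_end = b.start + b.len - 1;
    return a.start <= b_end && a_end >= b.start;
}

int main()
{
    Range fault{0x1000, 0x200};    // hypothetical faulting range
    Range block{0x11F0, 0x400};    // hypothetical cached block
    std::cout << std::boolalpha << overlaps(fault, block) << "\n";  // true: ranges intersect
    return 0;
}
```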
std::shared_ptr context = - maybeGatherContext(RecordContext::ALL); - std::lock_guard lock(mutex); - block->stream_uses.erase(stream); - - // free block, lazy destory block related events - for (auto it = npu_events[stream].begin(); it != npu_events[stream].end();) { - if (block != it->second) { - it++; - continue; - } - it = npu_events[stream].erase(it); - block->event_count--; - if (block->event_count == 0) { - free_block(block, context); - break; - } - } - } - - /** set memory fraction to limit maximum allocated memory **/ - void setMemoryFraction(double fraction) { - size_t device_free; - size_t device_total; - NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); - allowed_memory_maximum = static_cast(fraction * device_total); - set_fraction = true; - } - - /** returns cached blocks to the system allocator **/ - void emptyCache(int device, bool check_error) + void markAllBlockUnsafe() { - std::shared_ptr context = maybeGatherContext(RecordContext::ALL); - std::lock_guard lock(mutex); - c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, check_error); - release_cached_blocks(check_error, context); + for (auto &active_block : active_blocks) { + active_block->is_safe = false; + } + return; } - void buildServerMemMapForHccl(std::shared_ptr hcclComm) + // Must be called outside of `mutex` or deadlocks are possible with Python + std::shared_ptr maybeGatherContext(RecordContext level) { - std::unique_lock lock(mutex); - TORCH_INTERNAL_ASSERT(!hcclComm_, "Build HCCL server group redundancy.", PTA_ERROR(ErrCode::INTERNAL)); - hcclComm_ = hcclComm; - for (auto &expandable_segments: expandable_segments_) { - expandable_segments->setHcclComm(hcclComm); + if (record_context_ < level) { + return nullptr; } + return context_recorder_.load()(); } - void release_and_free_events() - { - std::unique_lock lock(mutex); - std::shared_ptr context = maybeGatherContext(RecordContext::ALL); - for (auto& st : npu_events) { - for (auto& e : st.second) { - EventPool::Event event = std::move(e.first); - Block* block = e.second; - block->event_count--; - if (block->event_count == 0) { - free_block(block, context); - } - } - } - npu_events.clear(); - } - - /** Retrieves info (total size + largest block) of the memory cache **/ - void cacheInfo(size_t* total, size_t* largest) { - std::lock_guard lock(mutex); - cache_info_aux(large_blocks, total, largest); - cache_info_aux(small_blocks, total, largest); - for (const auto& gp : graph_pools) { - cache_info_aux(gp.second->large_blocks, total, largest); - cache_info_aux(gp.second->small_blocks, total, largest); - } - } - - /** Returns a copy of the memory allocator stats **/ - DeviceStats getStats() { - std::lock_guard lock(mutex); - return stats; - } - - /** Resets the historical accumulation stats for the device **/ - void resetAccumulatedStats() { - std::lock_guard lock(mutex); - - for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { - reset_accumulated_stat(stats.allocation[statType]); - reset_accumulated_stat(stats.segment[statType]); - reset_accumulated_stat(stats.active[statType]); - reset_accumulated_stat(stats.inactive_split[statType]); - reset_accumulated_stat(stats.allocated_bytes[statType]); - reset_accumulated_stat(stats.reserved_bytes[statType]); - reset_accumulated_stat(stats.active_bytes[statType]); - reset_accumulated_stat(stats.inactive_split_bytes[statType]); - reset_accumulated_stat(stats.requested_bytes[statType]); - } - - stats.num_alloc_retries = 0; - stats.num_ooms = 0; - 
reset_accumulated_stat(stats.oversize_allocations); - reset_accumulated_stat(stats.oversize_segments); - } - - /** Resets the historical peak stats for the device **/ - void resetPeakStats() { - std::lock_guard lock(mutex); - - for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { - reset_peak_stat(stats.allocation[statType]); - reset_peak_stat(stats.segment[statType]); - reset_peak_stat(stats.active[statType]); - reset_peak_stat(stats.inactive_split[statType]); - reset_peak_stat(stats.allocated_bytes[statType]); - reset_peak_stat(stats.reserved_bytes[statType]); - reset_peak_stat(stats.active_bytes[statType]); - reset_peak_stat(stats.inactive_split_bytes[statType]); - reset_peak_stat(stats.requested_bytes[statType]); - } - - reset_peak_stat(stats.oversize_allocations); - reset_peak_stat(stats.oversize_segments); - } + // All public methods (except the above) acquire the allocator mutex. + // Thus, do not call a public method from another public method. - /* Checkpoint the state of a private pool necessary to return it to its - * current state */ - std::unique_ptr getCheckpointState(MempoolId_t id) + Block *malloc(int device, size_t orig_size, aclrtStream stream, uint8_t allocator_type = 0) { - auto context = maybeGatherContext(RecordContext::ALL); - std::lock_guard lock(mutex); - insert_events_deferred_until_no_capture(context); + // done outside the lock because we don't know what locks the recorder needs + // to have... + auto context = maybeGatherContext(RecordContext::STATE); - auto pool = graph_pools.find(id); - if (pool != graph_pools.end()) { - auto private_pool_head_blocks = get_private_pool_head_blocks(pool->second.get()); - return std::make_unique(id, private_pool_head_blocks); - } else if (graph_pools_freeable.count(id)) { - TORCH_CHECK(false, "Not expected to checkpoint freeable graph", PTA_ERROR(ErrCode::VALUE)); - } else { - TORCH_CHECK(false, "Could not find pool of id", PTA_ERROR(ErrCode::NOT_FOUND)); + std::unique_lock lock(mutex); + + if (device == -1) { + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); } - } - void freeBlocksAllocatedToPool(PrivatePool* private_pool, RestoreResult& rr) - { - auto pool_blocks = get_private_pool_head_blocks(private_pool); + if (C10_LIKELY(captures_underway.empty())) { + // Processes end-of-life events for outstanding allocations used on + // multiple streams (checks if their NPU-side uses are complete and + // recycles their memory if so) + // + // Q. Why skip process_events if a capture might be underway? + // A. process_events involves npuEventQueries, illegal during NPU graph + // capture. + // Dumb simple solution: defer reclaiming these allocations until after + // capture. Cross-stream memory use is uncommon, so the deferral's + // effect on memory use during capture should be small. + process_events(context); + } + auto size = round_size(orig_size); + auto &pool = get_pool(size, stream); + + // 开环境变量 大池子放1G内存块 + const size_t alloc_size = IsMallocPage1GMem(pool.is_small) ? + kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) : + get_allocation_size(size); + AllocParams params(device, size, stream, &pool, alloc_size, stats); + params.stat_types = get_stat_types_for_pool(pool); + + // First, try to get a block from the existing pool. + bool block_found = + // Search pool + get_free_block(params) || + // Trigger callbacks and retry search + (trigger_free_memory_callbacks(params) && get_free_block(params)); + // Can't reuse an existing block; try to get a new one. 
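The block search above leans on C++ short-circuit evaluation: the pool is searched first, the registered free-memory callbacks run only if that search fails, and the search repeats only if a callback reports that it released something. A tiny standalone sketch of that ordering (function names are placeholders, not the allocator's):

```cpp
// Demonstrates the `found || (callbacks() && found_again)` ordering used by malloc:
// later, more expensive steps run only when the earlier ones fail.
#include <iostream>

static int search_calls = 0;
static int callback_calls = 0;

static bool search_pool(bool succeed)
{
    ++search_calls;
    return succeed;
}

static bool run_free_callbacks()
{
    ++callback_calls;
    return true;  // pretend a callback freed some memory
}

int main()
{
    // First attempt fails, callbacks run, second search succeeds.
    bool found = search_pool(false) || (run_free_callbacks() && search_pool(true));
    std::cout << "found=" << found
              << " searches=" << search_calls
              << " callbacks=" << callback_calls << "\n";  // found=1 searches=2 callbacks=1
    return 0;
}
```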
+ if (!block_found) { + // Do garbage collection if the flag is set. + if (C10_UNLIKELY(set_fraction && CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { + garbage_collect_cached_blocks(context, lock); + } + // Attempt allocate + block_found = alloc_block(params, false, context, lock) || + // Free enough available cached blocks to satisfy alloc and retry + // alloc. + (release_available_cached_blocks(params, context, lock) && alloc_block(params, false, context, lock)); + } - std::vector head_blocks; - for (Block* block : pool_blocks) { - if (block->prev == nullptr) { - head_blocks.push_back(block); + if (!block_found && C10_LIKELY(captures_underway.empty())) { + ASCEND_LOGE( + "Get a block from the existing pool failed. Try to free cached blocks and reallocate. This error log can be ignored."); + // Free all non-split cached blocks and retry alloc. + { + UnlockGuard guard(lock); + // Make sure taskqueue is empty, then execute release_cached_blocks + c10_npu::npuSynchronizeDevice(true); } + c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, true); + block_found = (release_cached_blocks(true, context) && alloc_block(params, true, context, lock)); } - for (Block* block : head_blocks) { - Block* curr = block; - while (curr) { - // When we free a block, its pointer should never change - // only its adjacent blocks, so free, then look at pointer - if (curr->allocated) { - TORCH_CHECK( - curr->event_count == 0, - "Events should have synchronized when setting checkpointed block", PTA_ERROR(ErrCode::INTERNAL)); + if (!block_found) { + if (params.err == ACL_ERROR_RT_MEMORY_ALLOCATION) { + size_t device_free; + size_t device_total; + NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); + + std::string allowed_info; + if (set_fraction) { + allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + } + stats.num_ooms += 1; + + record_trace(TraceEntry::OOM, device_free, params.size(), params.stream(), params.device(), + std::move(context)); + auto observers_local = oom_observers_; + + // Make sure we do not have the device lock before calling our + // observers which might need hold the GIL + // It is safe to release at this point because will no longer + // be reading any allocator state. + + lock.unlock(); + + for (const auto &obs : observers_local) { + obs(device, alloc_size, set_fraction ? allowed_memory_maximum : device_total, device_free); + } + // "total capacity": total global memory on NPU + // "allowed": memory is allowed to use, which set by fraction. + // "already allocated": memory allocated by the program using the + // caching allocator + // "free": free memory as reported by the NPU API + // "cached": memory held by the allocator but not used by the program + // + // The "allocated" amount does not include memory allocated outside + // of the caching allocator, such as memory allocated by other programs + // or memory held by the driver. + // + // The sum of "allocated" + "free" + "cached" may be less than the + // total capacity due to memory held by the driver and usage by other + // programs. + // + // Note that at this point free_cached_blocks has already returned all + // possible "cached" memory to the driver. The only remaining "cached" + // memory is split from a larger block that is partially in-use. + AT_ERROR("NPU out of memory. 
Tried to allocate ", format_size(alloc_size), " (NPU ", device, "; ", + format_size(device_total), " total capacity; ", + format_size(stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current), + " already allocated; ", + format_size(stats.active_bytes[static_cast(StatType::AGGREGATE)].current), + " current active; ", format_size(device_free), " free; ", allowed_info, + format_size(stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current), + " reserved in total by PyTorch)", + " If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation."); + } else { + NPU_CHECK_ERROR(params.err); + } + } + + int64_t ori_block_ptr = int64_t(params.block->ptr); + size_t align_round = CachingAllocatorConfig::base_addr_aligned_size(); + if (params.size() >= kRoundLarge && CachingAllocatorConfig::expandable_segments() && align_round != 0 && + ori_block_ptr % align_round != 0) { + char *align_ptr = reinterpret_cast((ori_block_ptr + align_round) - (ori_block_ptr % align_round)); + size_t offset_size = align_ptr - (char *)params.block->ptr; + if (offset_size + params.size() <= params.block->size) { + auto size = params.block->size; + Block *remaining = params.block; + + Block *block = new Block(params.device(), params.stream(), size - offset_size, params.pool, align_ptr); + block->expandable_segment_ = remaining->expandable_segment_; + block->next = remaining->next; + if (block->next) { + block->next->prev = block; + } + block->prev = remaining; + + remaining->next = block; + remaining->size = offset_size; + params.pool->blocks.insert(remaining); + + params.block = block; + } + } + + bool split_remainder = should_split(params.block, params.size()); + return alloc_found_block(std::move(params), orig_size, std::move(context), split_remainder, allocator_type); + } + + Block *alloc_found_block(AllocParams params, size_t orig_size, std::shared_ptr context, + bool split_remainder, uint8_t allocator_type) + { + auto size = params.size(); + auto device = params.device(); + auto pool = params.pool; + auto stream = params.stream(); + + TORCH_INTERNAL_ASSERT(params.err == ACL_ERROR_NONE && params.block != nullptr && params.block->ptr != nullptr, + PTA_ERROR(ErrCode::PTR)); + Block *block = params.block; + Block *remaining = nullptr; + + const bool already_split = block->is_split(); + if (split_remainder) { + remaining = block; + + block = new Block(device, stream, size, pool, block->ptr); + block->expandable_segment_ = remaining->expandable_segment_; + block->prev = remaining->prev; + if (block->prev) { + block->prev->next = block; + } + block->next = remaining; + + remaining->prev = block; + remaining->ptr = static_cast(remaining->ptr) + size; + remaining->size -= size; + pool->blocks.insert(remaining); + + if (already_split && !block->expandable_segment_) { + // An already-split inactive block is being shrunk by size bytes. + update_stat_array(stats.inactive_split_bytes, -static_cast(block->size), + params.stat_types); + } else if (!block->expandable_segment_) { + // A new split inactive block is being created from a previously unsplit + // block, size remaining->size bytes. 
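When `should_split` decides the found block is larger than needed, `alloc_found_block` carves the request off the front and re-inserts the tail into the pool, relinking `prev`/`next` so physically adjacent blocks can later be coalesced. A simplified sketch of that pointer surgery with a toy block type (not the allocator's real structures or bookkeeping):

```cpp
// Toy illustration of splitting a block into [allocated head | remaining tail]
// while keeping the doubly linked list of physically adjacent blocks intact.
#include <cstddef>
#include <iostream>

struct ToyBlock {
    char* ptr;
    std::size_t size;
    ToyBlock* prev = nullptr;
    ToyBlock* next = nullptr;
};

// Returns a new head block of `size` bytes; `remaining` keeps the tail,
// with its pointer advanced past the head, as in alloc_found_block.
static ToyBlock* split_front(ToyBlock* remaining, std::size_t size)
{
    ToyBlock* head = new ToyBlock{remaining->ptr, size, remaining->prev, remaining};
    if (head->prev) {
        head->prev->next = head;
    }
    remaining->prev = head;
    remaining->ptr += size;      // tail now starts right after the head
    remaining->size -= size;
    return head;
}

int main()
{
    char buffer[1024];
    ToyBlock* block = new ToyBlock{buffer, sizeof(buffer)};
    ToyBlock* head = split_front(block, 256);
    std::cout << "head=" << head->size << " bytes, tail=" << head->next->size << " bytes\n";
    delete head->next;
    delete head;
    return 0;
}
```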
+ for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { + update_stat(stats.inactive_split_bytes[stat_type], static_cast(remaining->size)); + update_stat(stats.inactive_split[stat_type], 1); + }); + } + } else if (already_split && !block->expandable_segment_) { + // An already-split block is becoming active + for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { + update_stat(stats.inactive_split_bytes[stat_type], -static_cast(block->size)); + update_stat(stats.inactive_split[stat_type], -1); + }); + } + + block->allocated = true; + block->requested_size = orig_size; + if (block->is_safe == false) { + ASCEND_LOGI("Unsafe memory block is passively refreshed by releasing and mallocing memory again"); + } + block->is_safe = true; + + block->context_when_allocated = std::move(context); + record_trace(TraceEntry::ALLOC, int64_t(block->ptr), orig_size, block->stream, block->device, + block->context_when_allocated); + + active_blocks.insert(block); + + for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { + update_stat(stats.allocation[stat_type], 1); + update_stat(stats.allocated_bytes[stat_type], static_cast(block->size)); + update_stat(stats.active[stat_type], 1); + update_stat(stats.active_bytes[stat_type], static_cast(block->size)); + update_stat(stats.requested_bytes[stat_type], static_cast(block->requested_size)); + }); + + if (block->size >= CachingAllocatorConfig::max_split_size()) { + update_stat(stats.oversize_allocations, 1); + } + + ASCEND_LOGD("PTA CachingAllocator malloc: malloc = %zu, cached = %lu, allocated = %lu", block->size, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); + +#ifndef BUILD_LIBTORCH + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain( + torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t heapDesc{ block->device, block->ptr, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current }; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &heapDesc); + mstxMemVirtualRangeDesc_t regionDesc{ block->device, block->ptr, block->size }; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, ®ionDesc); + } + torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), + block->device, static_cast(torch_npu::profiler::MemoryComponentType::CACHING_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), allocator_type, + reinterpret_cast(block->ptr), block->size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream) }); +#endif + + return block; + } + + + void free(Block *block, uint8_t allocator_type = 0) + { + std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + std::lock_guard lock(mutex); + + block->allocated = false; + + // following logic might modifying underlaying Block, causing the size + // changed. 
We store ahead for reporting + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; + + StatTypes stat_types = get_stat_types_for_pool(*(block->pool)); + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + update_stat(stats.allocation[stat_type], -1); + update_stat(stats.allocated_bytes[stat_type], -block->size); + }); + + record_trace(TraceEntry::FREE_REQUESTED, int64_t(block->ptr), block->requested_size, block->stream, + block->device, context ? context : block->context_when_allocated); + + if (block->size >= CachingAllocatorConfig::max_split_size()) + update_stat(stats.oversize_allocations, -1); + + if (!block->stream_uses.empty() && c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { + if (C10_UNLIKELY(!captures_underway.empty())) { + // It's forbidden to npuEventQuery an event recorded during NPU graph + // capture. We conservatively defer recording end-of-life events until + // the next call to process_events() (which won't happen until no + // captures are underway) + needs_events_deferred_until_no_capture.push_back(block); + } else { + insert_events(block); + } + } else { + free_block(block, context, allocator_type); + } + + ASCEND_LOGD("PTA CachingAllocator free: free = %zu, cached = %lu, allocated = %lu", orig_block_size, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); +#ifndef BUILD_LIBTORCH + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain( + torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{ block->device, orig_block_ptr, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current }; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); + } + torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), + block->device, static_cast(torch_npu::profiler::MemoryComponentType::CACHING_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), allocator_type, + reinterpret_cast(orig_block_ptr), -orig_block_size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream) }); +#endif + } + + void *getBaseAllocation(Block *block, size_t *outSize) + { + std::lock_guard lock(mutex); + while (block->prev) { + block = block->prev; + } + void *basePtr = block->ptr; + if (outSize) { + size_t size = 0; + while (block) { + size += block->size; + block = block->next; + } + *outSize = size; + } + return basePtr; + } + + void recordStream(Block *block, c10_npu::NPUStream stream) + { + std::lock_guard lock(mutex); + block->stream_uses.insert(stream); + if (C10_UNLIKELY(!captures_underway.empty())) { + block_to_npugraph_stream_uses[block].insert(stream); + } + } + + void eraseStream(Block *block, c10_npu::NPUStream stream) + { + std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + std::lock_guard lock(mutex); + block->stream_uses.erase(stream); + + // free block, lazy destory block related events + for (auto it = npu_events[stream].begin(); it != npu_events[stream].end();) { + if (block != it->second) { + it++; + continue; + } + it = npu_events[stream].erase(it); 
+ block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + break; + } + } + } + + /* * set memory fraction to limit maximum allocated memory * */ + void setMemoryFraction(double fraction) + { + size_t device_free; + size_t device_total; + NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); + allowed_memory_maximum = static_cast(fraction * device_total); + set_fraction = true; + } + + /* * returns cached blocks to the system allocator * */ + void emptyCache(int device, bool check_error) + { + std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + // Make sure event deque from taskqueue, then synchronize Event + c10_npu::npuSynchronizeDevice(check_error); + std::lock_guard lock(mutex); + c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, check_error); + release_cached_blocks(check_error, context); + } + + void buildServerMemMapForHccl(std::shared_ptr hcclComm) + { + std::unique_lock lock(mutex); + TORCH_INTERNAL_ASSERT(!hcclComm_, "Build HCCL server group redundancy.", PTA_ERROR(ErrCode::INTERNAL)); + hcclComm_ = hcclComm; + for (auto &expandable_segments : expandable_segments_) { + expandable_segments->setHcclComm(hcclComm); + } + } + + void release_and_free_events() + { + std::unique_lock lock(mutex); + std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + for (auto &st : npu_events) { + for (auto &e : st.second) { + EventPool::Event event = std::move(e.first); + Block *block = e.second; + block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + } + } + } + npu_events.clear(); + } + + /* * Retrieves info (total size + largest block) of the memory cache * */ + void cacheInfo(size_t *total, size_t *largest) + { + std::lock_guard lock(mutex); + cache_info_aux(large_blocks, total, largest); + cache_info_aux(small_blocks, total, largest); + for (const auto &gp : graph_pools) { + cache_info_aux(gp.second->large_blocks, total, largest); + cache_info_aux(gp.second->small_blocks, total, largest); + } + } + + /* * Returns a copy of the memory allocator stats * */ + DeviceStats getStats() + { + std::lock_guard lock(mutex); + return stats; + } + + /* * Resets the historical accumulation stats for the device * */ + void resetAccumulatedStats() + { + std::lock_guard lock(mutex); + + for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { + reset_accumulated_stat(stats.allocation[statType]); + reset_accumulated_stat(stats.segment[statType]); + reset_accumulated_stat(stats.active[statType]); + reset_accumulated_stat(stats.inactive_split[statType]); + reset_accumulated_stat(stats.allocated_bytes[statType]); + reset_accumulated_stat(stats.reserved_bytes[statType]); + reset_accumulated_stat(stats.active_bytes[statType]); + reset_accumulated_stat(stats.inactive_split_bytes[statType]); + reset_accumulated_stat(stats.requested_bytes[statType]); + } + + stats.num_alloc_retries = 0; + stats.num_ooms = 0; + reset_accumulated_stat(stats.oversize_allocations); + reset_accumulated_stat(stats.oversize_segments); + } + + /* * Resets the historical peak stats for the device * */ + void resetPeakStats() + { + std::lock_guard lock(mutex); + + for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { + reset_peak_stat(stats.allocation[statType]); + reset_peak_stat(stats.segment[statType]); + reset_peak_stat(stats.active[statType]); + reset_peak_stat(stats.inactive_split[statType]); + reset_peak_stat(stats.allocated_bytes[statType]); + 
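`setMemoryFraction` above converts a fraction of total device memory into a byte cap (`allowed_memory_maximum`) that the allocation path later checks against. A quick sketch of that computation with a made-up total instead of the real `aclrtGetMemInfo` query:

```cpp
// Sketch of turning a memory fraction into a byte cap, as setMemoryFraction does.
// device_total is hard-coded here; the real code queries aclrtGetMemInfo.
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t device_total = 64ULL * 1024 * 1024 * 1024;  // assume 64 GiB of device memory
    const double fraction = 0.8;
    const std::size_t allowed_memory_maximum = static_cast<std::size_t>(fraction * device_total);
    std::cout << "cap = " << allowed_memory_maximum / (1024.0 * 1024 * 1024) << " GiB\n";  // ~51.2 GiB
    return 0;
}
```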
reset_peak_stat(stats.reserved_bytes[statType]); + reset_peak_stat(stats.active_bytes[statType]); + reset_peak_stat(stats.inactive_split_bytes[statType]); + reset_peak_stat(stats.requested_bytes[statType]); + } + + reset_peak_stat(stats.oversize_allocations); + reset_peak_stat(stats.oversize_segments); + } + + /* Checkpoint the state of a private pool necessary to return it to its + * current state */ + std::unique_ptr getCheckpointState(MempoolId_t id) + { + auto context = maybeGatherContext(RecordContext::ALL); + std::lock_guard lock(mutex); + insert_events_deferred_until_no_capture(context); + + auto pool = graph_pools.find(id); + if (pool != graph_pools.end()) { + auto private_pool_head_blocks = get_private_pool_head_blocks(pool->second.get()); + return std::make_unique(id, private_pool_head_blocks); + } else if (graph_pools_freeable.count(id)) { + TORCH_CHECK(false, "Not expected to checkpoint freeable graph", PTA_ERROR(ErrCode::VALUE)); + } else { + TORCH_CHECK(false, "Could not find pool of id", PTA_ERROR(ErrCode::NOT_FOUND)); + } + } + + void freeBlocksAllocatedToPool(PrivatePool *private_pool, RestoreResult &rr) + { + auto pool_blocks = get_private_pool_head_blocks(private_pool); + + std::vector head_blocks; + for (Block *block : pool_blocks) { + if (block->prev == nullptr) { + head_blocks.push_back(block); + } + } + for (Block *block : head_blocks) { + Block *curr = block; + + while (curr) { + // When we free a block, its pointer should never change + // only its adjacent blocks, so free, then look at pointer + if (curr->allocated) { + TORCH_CHECK(curr->event_count == 0, + "Events should have synchronized when setting checkpointed block", + PTA_ERROR(ErrCode::INTERNAL)); rr.allocations_freed.push_back(curr->ptr); free(curr); TORCH_CHECK(!curr->allocated, PTA_ERROR(ErrCode::PTR)); @@ -1782,8 +1728,8 @@ class DeviceCachingAllocator { curr = curr->next; } } - for (Block* b : get_private_pool_head_blocks(private_pool)) { - Block* curr = b; + for (Block *b : get_private_pool_head_blocks(private_pool)) { + Block *curr = b; while (curr) { TORCH_CHECK(!curr->allocated, PTA_ERROR(ErrCode::PTR)); curr = curr->next; @@ -1793,17 +1739,14 @@ class DeviceCachingAllocator { // checkpoint the state of an allocation that may have been // split into multiple blocks - void setSegmentStateToCheckpoint( - Block* block, - SegmentState& segment, - const std::shared_ptr& context, - RestoreResult& rr) + void setSegmentStateToCheckpoint(Block *block, SegmentState &segment, + const std::shared_ptr &context, RestoreResult &rr) { - Block* curr_block = block; - Block* last_block = block; + Block *curr_block = block; + Block *last_block = block; TORCH_INTERNAL_ASSERT(block->pool); - BlockPool& pool = *block->pool; + BlockPool &pool = *block->pool; const auto segment_len = segment.blocks.size(); // allocate all blocks in the segment @@ -1818,13 +1761,8 @@ class DeviceCachingAllocator { continue; } - auto& block_state = segment.blocks.at(i); - AllocParams params( - block_state.device, - block_state.size, - block_state.stream, - &pool, - block_state.size, + auto &block_state = segment.blocks.at(i); + AllocParams params(block_state.device, block_state.size, block_state.stream, &pool, block_state.size, stats); pool.blocks.erase(curr_block); params.block = curr_block; @@ -1866,7 +1804,7 @@ class DeviceCachingAllocator { continue; } - auto& block_state = segment.blocks.at(i); + auto &block_state = segment.blocks.at(i); TORCH_INTERNAL_ASSERT(curr_block != nullptr, PTA_ERROR(ErrCode::PTR)); if (block_state.allocated) { 
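`getBaseAllocation` relies on every segment being kept as a doubly linked list of `Block`s whose head (`prev == nullptr`) carries the segment's base pointer, and `get_private_pool_head_blocks` and the checkpoint code above walk the same structure. A cut-down version of that walk, illustrative only:

```cpp
#include <cstddef>
#include <cstdio>

// Cut-down Block: only the linkage and size matter for this sketch.
struct Block {
    char*  ptr  = nullptr;
    size_t size = 0;
    Block* prev = nullptr;
    Block* next = nullptr;
};

// Walk back to the head of the segment, then sum the sizes of every block
// carved out of it -- the same shape as getBaseAllocation().
void* base_allocation(Block* block, size_t* out_size) {
    while (block->prev) {
        block = block->prev;
    }
    void* base = block->ptr;
    if (out_size) {
        size_t total = 0;
        for (Block* b = block; b != nullptr; b = b->next) {
            total += b->size;
        }
        *out_size = total;
    }
    return base;
}

int main() {
    static char segment[4096];
    Block head{segment, 1024}, mid{segment + 1024, 1024}, tail{segment + 2048, 2048};
    head.next = &mid;  mid.prev = &head;
    mid.next  = &tail; tail.prev = &mid;

    size_t total = 0;
    void* base = base_allocation(&tail, &total);
    std::printf("base=%p total=%zu\n", base, total);   // total=4096
}
```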
@@ -1881,7 +1819,7 @@ class DeviceCachingAllocator { TORCH_CHECK(curr_block->size == block_state.size, PTA_ERROR(ErrCode::VALUE)); } } - /** + /* * * Note [Checkpointing PrivatePoolState] * * Refer above to Note [Interaction with acl graph capture]. Allocations made @@ -1917,17 +1855,17 @@ class DeviceCachingAllocator { * (link note [live tensors between iterations] when it exists). For * every block that is currently allocated but no allocated in the snapshot, * we will return a pointer to their block. - *. - * - * - * ---------------> A ---------------> B ---------------> C - * | - * | - * | - * | - * ╰ ---------------> D - */ - RestoreResult setCheckpointPoolState(PrivatePoolState& pps) + * . + * + * + * ---------------> A ---------------> B ---------------> C + * | + * | + * | + * | + * ╰ ---------------> D + */ + RestoreResult setCheckpointPoolState(PrivatePoolState &pps) { // To reset the caching allocator state we will // - Free all the blocks currently allocated to the pool (see [live tensors @@ -1941,34 +1879,32 @@ class DeviceCachingAllocator { // following `done outside the lock because we don't know what locks the // recorder needs to have...` - std::shared_ptr context = - maybeGatherContext(RecordContext::STATE); + std::shared_ptr context = maybeGatherContext(RecordContext::STATE); std::lock_guard lock(mutex); RestoreResult rr; - TORCH_CHECK( - !graph_pools_freeable.count(pps.owner_id), - "Not expected to checkpoint freeable graph", PTA_ERROR(ErrCode::INTERNAL)); + TORCH_CHECK(!graph_pools_freeable.count(pps.owner_id), "Not expected to checkpoint freeable graph", + PTA_ERROR(ErrCode::INTERNAL)); auto pool = graph_pools.find(pps.owner_id); TORCH_CHECK(pool != graph_pools.end(), "Could not find private pool id", PTA_ERROR(ErrCode::INTERNAL)); - PrivatePool* private_pool = pool->second.get(); + PrivatePool *private_pool = pool->second.get(); freeBlocksAllocatedToPool(private_pool, rr); - std::unordered_map ptrs_to_blocks; + std::unordered_map ptrs_to_blocks; // at this point, all of the blocks should be free, so they will all be in // the block set - for (Block* block : private_pool->small_blocks.blocks) { + for (Block *block : private_pool->small_blocks.blocks) { ptrs_to_blocks[block->ptr] = block; } - for (Block* block : private_pool->large_blocks.blocks) { + for (Block *block : private_pool->large_blocks.blocks) { ptrs_to_blocks[block->ptr] = block; } - for (auto& segment : pps.segments) { + for (auto &segment : pps.segments) { auto ptr = segment.blocks.at(0).ptr; TORCH_CHECK(ptrs_to_blocks.count(ptr), " could not find ", ptr, PTA_ERROR(ErrCode::PARAM)); auto block = ptrs_to_blocks[ptr]; @@ -1978,47 +1914,46 @@ class DeviceCachingAllocator { return rr; } - /** Dump a complete snapshot of the memory held by the allocator. Potentially VERY expensive. **/ + /* * Dump a complete snapshot of the memory held by the allocator. Potentially VERY expensive. 
* */ std::vector snapshot() { std::lock_guard lock(mutex); - std::unordered_map pool_to_id; + std::unordered_map pool_to_id; pool_to_id.reserve(graph_pools.size() + graph_pools_freeable.size()); - for (const auto& pair : graph_pools) { + for (const auto &pair : graph_pools) { pool_to_id[pair.second.get()] = pair.first; } - for (const auto& pair : graph_pools_freeable) { + for (const auto &pair : graph_pools_freeable) { pool_to_id[pair.second] = pair.first; } - size_t total_active = 0; + uint64_t total_active = 0; std::vector result; const auto all_blocks = get_all_blocks(); - for (const Block* const head_block : all_blocks) { + for (const Block * const head_block : all_blocks) { // For expandable segments, we report one segment for each continguous // mapped range of memory if (head_block->prev && head_block->prev->mapped) { continue; } result.emplace_back(); - SegmentInfo& segment_info = result.back(); + SegmentInfo &segment_info = result.back(); segment_info.device = head_block->device; segment_info.address = reinterpret_cast(head_block->ptr); segment_info.stream = head_block->stream; segment_info.is_large = (!head_block->pool->is_small); segment_info.is_expandable = head_block->expandable_segment_; - segment_info.context_when_allocated = - head_block->context_when_segment_allocated; + segment_info.context_when_allocated = head_block->context_when_segment_allocated; auto mempool_id = pool_to_id.find(head_block->pool->owner_PrivatePool); if (mempool_id != pool_to_id.end()) { segment_info.owner_private_pool_id = mempool_id->second; } - const Block* block = head_block; + const Block *block = head_block; while (block != nullptr && block->mapped) { segment_info.blocks.emplace_back(); - BlockInfo& block_info = segment_info.blocks.back(); + BlockInfo &block_info = segment_info.blocks.back(); block_info.size = block->size; block_info.requested_size = block->requested_size; @@ -2040,42 +1975,37 @@ class DeviceCachingAllocator { } std::sort(result.begin(), result.end(), - [](const SegmentInfo& a, const SegmentInfo& b) { - return a.address < b.address; - }); + [](const SegmentInfo &a, const SegmentInfo &b) { return a.address < b.address; }); record_trace(TraceEntry::SNAPSHOT, 0, total_active, nullptr, 0, nullptr); return result; } - std::vector trace() - { - std::lock_guard lock(mutex); - std::vector result; - result.reserve(alloc_trace->size()); - result.insert(result.end(), alloc_trace->begin() + alloc_trace_next, - alloc_trace->end()); - result.insert(result.end(), alloc_trace->begin(), - alloc_trace->begin() + alloc_trace_next); + std::vector trace() + { + std::lock_guard lock(mutex); + std::vector result; + result.reserve(alloc_trace->size()); + result.insert(result.end(), alloc_trace->begin() + alloc_trace_next, alloc_trace->end()); + result.insert(result.end(), alloc_trace->begin(), alloc_trace->begin() + alloc_trace_next); - return result; - } + return result; + } - static size_t round_size(size_t size) { - size = size + 32; - if (size < kMinBlockSize) { - return kMinBlockSize; - } else { - return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + static size_t round_size(size_t size) + { + size = size + 32; + if (size < kMinBlockSize) { + return kMinBlockSize; + } else { + return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + } } - } // See Note [Interaction with NPU graph capture] // Called by NPUGraph::capture_begin - void beginAllocateToPool( - MempoolId_t mempool_id, - std::function filter) + void beginAllocateToPool(MempoolId_t mempool_id, std::function 
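`trace()` above flattens a circular trace buffer: `alloc_trace_next` marks the oldest entry, so the copy takes the tail `[next, end)` first and the wrapped head `[begin, next)` second. The same rotation on a plain vector of integers:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Flatten a ring buffer whose oldest element sits at index `next`.
// Same rotation as trace(): tail first, then the wrapped-around head.
std::vector<int> flatten(const std::vector<int>& ring, size_t next) {
    std::vector<int> out;
    out.reserve(ring.size());
    out.insert(out.end(), ring.begin() + next, ring.end());
    out.insert(out.end(), ring.begin(), ring.begin() + next);
    return out;
}

int main() {
    // Physically stored as {4, 5, 1, 2, 3}; logically 1..5 with next == 2.
    std::vector<int> ring = {4, 5, 1, 2, 3};
    assert((flatten(ring, 2) == std::vector<int>{1, 2, 3, 4, 5}));
}
```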
filter) { std::lock_guard lock(mutex); auto it = graph_pools.find(mempool_id); @@ -2090,11 +2020,8 @@ class DeviceCachingAllocator { TORCH_INTERNAL_ASSERT(it->second->use_count > 0); it->second->use_count++; } - for (auto it2 = captures_underway.begin(); it2 != captures_underway.end(); - ++it2) { - TORCH_CHECK( - it2->first != mempool_id, - "beginAllocateToPool: already recording to mempool_id"); + for (auto it2 = captures_underway.begin(); it2 != captures_underway.end(); ++it2) { + TORCH_CHECK(it2->first != mempool_id, "beginAllocateToPool: already recording to mempool_id"); } captures_underway.emplace_back(mempool_id, std::move(filter)); } @@ -2109,8 +2036,7 @@ class DeviceCachingAllocator { return; } } - TORCH_CHECK( - false, "endAllocatePool: not currently recording to mempool_id"); + TORCH_CHECK(false, "endAllocatePool: not currently recording to mempool_id"); } // Called by NPUGraph::reset @@ -2134,49 +2060,41 @@ class DeviceCachingAllocator { // Allows free_cached_blocks to begin npuFreeing this pool's memory, // and makes sure this pool wasn't somehow made freeable already. // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) - bool inserted = - graph_pools_freeable.insert({mempool_id, it->second.get()}).second; + bool inserted = graph_pools_freeable.insert({ mempool_id, it->second.get() }).second; TORCH_INTERNAL_ASSERT(inserted); } } - private: - - // All private methods do not acquire the allocator mutex. +private: + // All private methods do not acquire the allocator mutex. - std::vector get_all_blocks() const { - std::vector blocks; - blocks.insert(blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end()); - blocks.insert(blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end()); - for (const auto& gp : graph_pools) { - blocks.insert( - blocks.end(), - gp.second->small_blocks.blocks.begin(), - gp.second->small_blocks.blocks.end()); - blocks.insert( - blocks.end(), - gp.second->large_blocks.blocks.begin(), - gp.second->large_blocks.blocks.end()); + std::vector get_all_blocks() const + { + std::vector blocks; + blocks.insert(blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end()); + blocks.insert(blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end()); + for (const auto &gp : graph_pools) { + blocks.insert(blocks.end(), gp.second->small_blocks.blocks.begin(), gp.second->small_blocks.blocks.end()); + blocks.insert(blocks.end(), gp.second->large_blocks.blocks.begin(), gp.second->large_blocks.blocks.end()); + } + blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end()); + return blocks; } - blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end()); - return blocks; - } - std::vector get_private_pool_head_blocks(PrivatePool* pool) const + std::vector get_private_pool_head_blocks(PrivatePool *pool) const { - std::vector blocks; - for (Block* b : active_blocks) { - if ((b->pool == &pool->small_blocks || b->pool == &pool->large_blocks) && - b->prev == nullptr) { + std::vector blocks; + for (Block *b : active_blocks) { + if ((b->pool == &pool->small_blocks || b->pool == &pool->large_blocks) && b->prev == nullptr) { blocks.push_back(b); } } - for (Block* b : pool->small_blocks.blocks) { + for (Block *b : pool->small_blocks.blocks) { if (b->prev == nullptr) { blocks.push_back(b); } } - for (Block* b : pool->large_blocks.blocks) { + for (Block *b : pool->large_blocks.blocks) { if (b->prev == nullptr) { blocks.push_back(b); } @@ -2185,291 +2103,240 @@ class DeviceCachingAllocator { return blocks; } - // returns 
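`beginAllocateToPool` registers a `(mempool_id, filter)` pair in `captures_underway`, and allocations whose stream satisfies a registered filter are served from that graph's private pool. A sketch of that routing with simplified pool names and integer streams (the real code returns a `BlockPool&` and uses `MempoolId_t`):

```cpp
#include <cassert>
#include <functional>
#include <string>
#include <utility>
#include <vector>

using MempoolId = std::pair<int, int>;             // stand-in for MempoolId_t
using StreamFilter = std::function<bool(int stream)>;

std::vector<std::pair<MempoolId, StreamFilter>> captures_underway;

void begin_allocate_to_pool(MempoolId id, StreamFilter filter) {
    captures_underway.emplace_back(id, std::move(filter));
}

void end_allocate_to_pool(MempoolId id) {
    for (auto it = captures_underway.begin(); it != captures_underway.end(); ++it) {
        if (it->first == id) {
            captures_underway.erase(it);
            return;
        }
    }
}

// Route an allocation: a private pool if some active capture claims the
// stream, otherwise the ordinary shared pools.
std::string pool_for(int stream) {
    for (auto& entry : captures_underway) {
        if (entry.second(stream)) {
            return "private pool " + std::to_string(entry.first.second);
        }
    }
    return "shared pool";
}

int main() {
    begin_allocate_to_pool({0, 7}, [](int stream) { return stream == 3; });
    assert(pool_for(3) == "private pool 7");
    assert(pool_for(1) == "shared pool");
    end_allocate_to_pool({0, 7});
    assert(pool_for(3) == "shared pool");
}
```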
the smallest possible address in any segment - // where there is enough free address space to fit size - // may be composed of free and unmapped segments - Block* find_expandable_block( - int device, - aclrtStream stream, - BlockPool* pool, - size_t size) { - Block key(device, stream, 0); + // returns the smallest possible address in any segment + // where there is enough free address space to fit size + // may be composed of free and unmapped segments + Block *find_expandable_block(int device, aclrtStream stream, BlockPool *pool, size_t size) + { + Block key(device, stream, 0); + + auto allocatable = [](Block *b) { return b && !b->allocated && b->event_count == 0 && b->stream_uses.empty(); }; + auto has_available_address_space = [&](Block *b) { + size_t bytes = 0; + while (bytes < size && allocatable(b)) { + bytes += b->size; + b = b->next; + } + return bytes >= size; + }; + for (auto it = pool->unmapped.lower_bound(&key); it != pool->unmapped.end() && (*it)->stream == stream; ++it) { + Block *c = *it; + // we found the lowest address of an unmapped segment + // but there might be a free segment we can also use + // right before it + if (allocatable(c->prev)) { + c = c->prev; + } + if (has_available_address_space(c)) { + return c; + } + } + auto segment_size = pool->is_small ? + kSmallBuffer : + (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? kLargeBufferForHccl : kLargeBuffer); + // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 + if (IsMallocPage1GMem(pool->is_small)) { + segment_size = kExtraLargeBuffer; + } + auto segment = new ExpandableSegment(device, stream, segment_size); + if (hcclComm_) { + segment->setHcclComm(hcclComm_); + } + expandable_segments_.emplace_back(segment); + + ExpandableSegment *es = expandable_segments_.back(); + Block *candidate = new Block(device, stream, es->size(), pool, es->ptr()); + candidate->mapped = false; + candidate->expandable_segment_ = es; + pool->unmapped.insert(candidate); + return candidate; + } + + bool map_block(Block *to_map, size_t size, const std::shared_ptr &ctx) + { + TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size, PTA_ERROR(ErrCode::INTERNAL)); + TORCH_INTERNAL_ASSERT(!to_map->context_when_allocated); // unmapped blocks should not keep + // history + auto mapped_range = to_map->expandable_segment_->map(SegmentRange{ to_map->ptr, size }); + // failed to map the memory + if (mapped_range.size == 0) { + return false; + } + TORCH_INTERNAL_ASSERT(mapped_range.ptr == to_map->ptr && mapped_range.size >= size, + PTA_ERROR(ErrCode::INTERNAL)); + + BlockPool &pool = *to_map->pool; + pool.unmapped.erase(to_map); + to_map->mapped = true; + + if (mapped_range.size < to_map->size) { + // to_map -> remaining -> to_map->next(?) 
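`find_expandable_block` accepts a candidate only if a chain of adjacent free and unmapped blocks covers the requested size; that is what the `allocatable` and `has_available_address_space` lambdas check. The same test in isolation, without stream or mapping bookkeeping:

```cpp
#include <cassert>
#include <cstddef>

struct Block {
    size_t size        = 0;
    bool   allocated   = false;
    int    event_count = 0;
    Block* next        = nullptr;
};

// A block can be grown over if nothing is using it (no allocation, no pending
// events). Mirrors the `allocatable` lambda above, minus stream bookkeeping.
bool allocatable(const Block* b) {
    return b && !b->allocated && b->event_count == 0;
}

// Mirrors `has_available_address_space`: add up contiguous growable blocks
// until either the request is covered or the chain is interrupted.
bool has_available_address_space(const Block* b, size_t size) {
    size_t bytes = 0;
    while (bytes < size && allocatable(b)) {
        bytes += b->size;
        b = b->next;
    }
    return bytes >= size;
}

int main() {
    Block a{64}, b{64}, c{128};
    a.next = &b; b.next = &c;
    assert(has_available_address_space(&a, 200));    // 64 + 64 + 128 covers it
    b.allocated = true;                              // chain interrupted
    assert(!has_available_address_space(&a, 200));
}
```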
+ Block *remaining = new Block(to_map->device, to_map->stream, to_map->size - mapped_range.size, &pool, + static_cast(to_map->ptr) + mapped_range.size); + remaining->mapped = false; + remaining->expandable_segment_ = to_map->expandable_segment_; + remaining->splice(to_map, to_map->next); + pool.unmapped.insert(remaining); + to_map->size = mapped_range.size; + } - auto allocatable = [](Block* b) { - return b && !b->allocated && b->event_count == 0 && - b->stream_uses.empty(); - }; - auto has_available_address_space = [&](Block* b) { - size_t bytes = 0; - while (bytes < size && allocatable(b)) { - bytes += b->size; - b = b->next; - } - return bytes >= size; - }; - for (auto it = pool->unmapped.lower_bound(&key); - it != pool->unmapped.end() && (*it)->stream == stream; - ++it) { - Block* c = *it; - // we found the lowest address of an unmapped segment - // but there might be a free segment we can also use - // right before it - if (allocatable(c->prev)) { - c = c->prev; - } - if (has_available_address_space(c)) { - return c; - } - } - auto segment_size = pool->is_small ? kSmallBuffer : ( - c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? kLargeBufferForHccl : kLargeBuffer - ); - // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 - if (IsMallocPage1GMem(pool->is_small)) { - segment_size = kExtraLargeBuffer; - } - auto segment = new ExpandableSegment(device, stream, segment_size); - if (hcclComm_) { - segment->setHcclComm(hcclComm_); - } - expandable_segments_.emplace_back(segment); - - ExpandableSegment* es = expandable_segments_.back(); - Block* candidate = new Block(device, stream, es->size(), pool, es->ptr()); - candidate->mapped = false; - candidate->expandable_segment_ = es; - pool->unmapped.insert(candidate); - return candidate; - } - - bool map_block( - Block* to_map, - size_t size, - const std::shared_ptr& ctx) - { - TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size, PTA_ERROR(ErrCode::INTERNAL)); - TORCH_INTERNAL_ASSERT( - !to_map->context_when_allocated); // unmapped blocks should not keep - // history - auto mapped_range = - to_map->expandable_segment_->map(SegmentRange{to_map->ptr, size}); - // failed to map the memory - if (mapped_range.size == 0) { - return false; - } - TORCH_INTERNAL_ASSERT( - mapped_range.ptr == to_map->ptr && mapped_range.size >= size, PTA_ERROR(ErrCode::INTERNAL)); - - BlockPool& pool = *to_map->pool; - pool.unmapped.erase(to_map); - to_map->mapped = true; - - if (mapped_range.size < to_map->size) { - // to_map -> remaining -> to_map->next(?) 
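When `map_block` maps only part of an unmapped block, the unmapped tail is carved into a new `Block` and spliced between `to_map` and `to_map->next`. A minimal sketch of that split; the `splice` body here is an assumption about the usual doubly-linked-list insertion, not the allocator's exact code:

```cpp
#include <cassert>
#include <cstddef>

struct Block {
    size_t size = 0;
    Block* prev = nullptr;
    Block* next = nullptr;

    // Link this block in between `before` and `after`.
    void splice(Block* before, Block* after) {
        prev = before;
        next = after;
        if (before) before->next = this;
        if (after)  after->prev  = this;
    }
};

int main() {
    Block to_map{1024};
    Block tail{512};
    to_map.next = &tail;
    tail.prev = &to_map;

    // Only 256 bytes were mapped: keep them in to_map, push the rest into a
    // new unmapped block spliced right after it.
    size_t mapped = 256;
    Block* remaining = new Block{to_map.size - mapped};
    remaining->splice(&to_map, to_map.next);
    to_map.size = mapped;

    assert(to_map.next == remaining && remaining->next == &tail);
    assert(to_map.size + remaining->size == 1024);
    delete remaining;
}
```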
- Block* remaining = new Block( - to_map->device, - to_map->stream, - to_map->size - mapped_range.size, - &pool, - static_cast(to_map->ptr) + mapped_range.size); - remaining->mapped = false; - remaining->expandable_segment_ = to_map->expandable_segment_; - remaining->splice(to_map, to_map->next); - pool.unmapped.insert(remaining); - to_map->size = mapped_range.size; - } - - try_merge_blocks(to_map, to_map->prev, pool); - try_merge_blocks(to_map, to_map->next, pool); - - pool.blocks.insert(to_map); - - // update statistics - total_allocated_memory += mapped_range.size; - StatTypes stat_types = get_stat_types_for_pool(*to_map->pool); - for_each_selected_stat_type(stat_types, [&](size_t stat_type) { - update_stat(stats.reserved_bytes[stat_type], mapped_range.size); - }); -#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{to_map->device, mapped_range.ptr, mapped_range.size}; - torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); -#endif - record_trace( - TraceEntry::SEGMENT_MAP, - int64_t(mapped_range.ptr), - mapped_range.size, - to_map->stream, - to_map->device, - ctx); - if (!to_map->prev && !to_map->context_when_segment_allocated) { - to_map->context_when_segment_allocated = ctx; - } - - return true; - } - - Block* try_allocate_expandable_block( - int device, - aclrtStream stream, - BlockPool* pool, - size_t size, - const std::shared_ptr& ctx) - { - Block* candidate = find_expandable_block(device, stream, pool, size); - // Candidate is now a list free/unmapped blocks with at least size room: - // unmapped -> null - // unmapped -> free -> * - // free -> unmapped -> * - - if (!candidate->mapped && - !map_block(candidate, std::min(candidate->size, size), ctx)) { - return nullptr; - } - TORCH_INTERNAL_ASSERT(candidate->mapped, PTA_ERROR(ErrCode::INTERNAL)); - - while (candidate->size < size) { - // invariant: free -> unmapped -> * - // map_block will map some of unmapped and merge with free - auto remaining = size - candidate->size; - auto new_candidate = candidate->next; - if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx)) { - return nullptr; - } - candidate = new_candidate; - } - pool->blocks.erase(candidate); - return candidate; - } - - - /** moves a block into a pool of cached free blocks **/ - void free_block( - Block* block, - const std::shared_ptr& context, - uint8_t allocator_type = 0) - { - AT_ASSERT(!block->allocated && block->event_count == 0, PTA_ERROR(ErrCode::VALUE)); - - record_trace( - TraceEntry::FREE_COMPLETED, - int64_t(block->ptr), - block->requested_size, - block->stream, - block->device, - context ? 
context : block->context_when_allocated); - - block->context_when_allocated = nullptr; - size_t original_block_size = block->size; - auto orig_block_ptr = block->ptr; - size_t requested_size = block->requested_size; - - auto& pool = *block->pool; - int64_t net_change_inactive_split_blocks = 0; - int64_t net_change_inactive_split_size = 0; - - const std::array merge_candidates = {block->prev, block->next}; - for (Block* merge_candidate : merge_candidates) { - const int64_t subsumed_size = static_cast(try_merge_blocks(block, merge_candidate, pool)); - if (subsumed_size > 0) { - net_change_inactive_split_blocks -= 1; - net_change_inactive_split_size -= subsumed_size; - } - } - - active_blocks.erase(block); - pool.blocks.insert(block); - - if (block->is_split()) { - net_change_inactive_split_blocks += 1; - net_change_inactive_split_size += static_cast(block->size); - } - - StatTypes stat_types = get_stat_types_for_pool(pool); - for_each_selected_stat_type(stat_types, [&](size_t stat_type) { - // inactive_split tries to capture the idea that blocks - // cannot be freed when requested, but fully free pages - // of expandable blocks can always be freed. - // The logic to track this as statistic is pretty involved, - // so we simply just exclude expandable segements from - // inactive_split - if (!block->expandable_segment_) { - update_stat( - stats.inactive_split[stat_type], net_change_inactive_split_blocks); - update_stat( - stats.inactive_split_bytes[stat_type], - net_change_inactive_split_size); - } - update_stat(stats.active[stat_type], -1); - update_stat(stats.active_bytes[stat_type], -original_block_size); - update_stat( - stats.requested_bytes[stat_type], - -static_cast(requested_size)); - }); -#ifndef BUILD_LIBTORCH - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_BLOCK_FREE), - allocator_type, - reinterpret_cast(orig_block_ptr), - -original_block_size, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); -#endif - } + try_merge_blocks(to_map, to_map->prev, pool); + try_merge_blocks(to_map, to_map->next, pool); + + pool.blocks.insert(to_map); - /** combine previously split blocks. returns the size of the subsumed block, or 0 on failure. 
**/ - size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) { - if (!src || src->allocated || src->event_count > 0 || - !src->stream_uses.empty() || dst->mapped != src->mapped) { - return 0; + // update statistics + total_allocated_memory += mapped_range.size; + StatTypes stat_types = get_stat_types_for_pool(*to_map->pool); + for_each_selected_stat_type(stat_types, + [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], mapped_range.size); }); + record_trace(TraceEntry::SEGMENT_MAP, int64_t(mapped_range.ptr), mapped_range.size, to_map->stream, + to_map->device, ctx); + if (!to_map->prev && !to_map->context_when_segment_allocated) { + to_map->context_when_segment_allocated = ctx; + } + + return true; + } + + Block *try_allocate_expandable_block(int device, aclrtStream stream, BlockPool *pool, size_t size, + const std::shared_ptr &ctx) + { + Block *candidate = find_expandable_block(device, stream, pool, size); + // Candidate is now a list free/unmapped blocks with at least size room: + // unmapped -> null + // unmapped -> free -> * + // free -> unmapped -> * + + if (!candidate->mapped && !map_block(candidate, std::min(candidate->size, size), ctx)) { + return nullptr; + } + TORCH_INTERNAL_ASSERT(candidate->mapped, PTA_ERROR(ErrCode::INTERNAL)); + + while (candidate->size < size) { + // invariant: free -> unmapped -> * + // map_block will map some of unmapped and merge with free + auto remaining = size - candidate->size; + auto new_candidate = candidate->next; + if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx)) { + return nullptr; + } + candidate = new_candidate; + } + pool->blocks.erase(candidate); + return candidate; } - AT_ASSERT(dst->is_split() && src->is_split(), PTA_ERROR(ErrCode::VALUE)); - if (dst->prev == src) { - dst->ptr = src->ptr; - dst->prev = src->prev; - if (dst->prev) { - dst->prev->next = dst; - } - } else { - dst->next = src->next; - if (dst->next) { - dst->next->prev = dst; - } + /* * moves a block into a pool of cached free blocks * */ + void free_block(Block *block, const std::shared_ptr &context, uint8_t allocator_type = 0) + { + AT_ASSERT(!block->allocated && block->event_count == 0, PTA_ERROR(ErrCode::VALUE)); + + record_trace(TraceEntry::FREE_COMPLETED, int64_t(block->ptr), block->requested_size, block->stream, + block->device, context ? context : block->context_when_allocated); + + block->context_when_allocated = nullptr; + size_t original_block_size = block->size; + auto orig_block_ptr = block->ptr; + size_t requested_size = block->requested_size; + + auto &pool = *block->pool; + int64_t net_change_inactive_split_blocks = 0; + int64_t net_change_inactive_split_size = 0; + + const std::array merge_candidates = { block->prev, block->next }; + for (Block *merge_candidate : merge_candidates) { + const int64_t subsumed_size = static_cast(try_merge_blocks(block, merge_candidate, pool)); + if (subsumed_size > 0) { + net_change_inactive_split_blocks -= 1; + net_change_inactive_split_size -= subsumed_size; + } + } + + active_blocks.erase(block); + pool.blocks.insert(block); + + if (block->is_split()) { + net_change_inactive_split_blocks += 1; + net_change_inactive_split_size += static_cast(block->size); + } + + StatTypes stat_types = get_stat_types_for_pool(pool); + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + // inactive_split tries to capture the idea that blocks + // cannot be freed when requested, but fully free pages + // of expandable blocks can always be freed. 
+ // The logic to track this as statistic is pretty involved, + // so we simply just exclude expandable segements from + // inactive_split + if (!block->expandable_segment_) { + update_stat(stats.inactive_split[stat_type], net_change_inactive_split_blocks); + update_stat(stats.inactive_split_bytes[stat_type], net_change_inactive_split_size); + } + update_stat(stats.active[stat_type], -1); + update_stat(stats.active_bytes[stat_type], -original_block_size); + update_stat(stats.requested_bytes[stat_type], -static_cast(requested_size)); + }); +#ifndef BUILD_LIBTORCH + torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), + block->device, static_cast(torch_npu::profiler::MemoryComponentType::CACHING_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_BLOCK_FREE), allocator_type, + reinterpret_cast(orig_block_ptr), -original_block_size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream) }); +#endif } - const size_t subsumed_size = src->size; - dst->size += subsumed_size; - auto erased = - src->mapped ? pool.blocks.erase(src) : pool.unmapped.erase(src); - delete src; - src = nullptr; + /* * combine previously split blocks. returns the size of the subsumed block, or 0 on failure. * */ + size_t try_merge_blocks(Block *dst, Block *src, BlockPool &pool) + { + if (!src || src->allocated || src->event_count > 0 || !src->stream_uses.empty() || dst->mapped != src->mapped) { + return 0; + } + + AT_ASSERT(dst->is_split() && src->is_split(), PTA_ERROR(ErrCode::VALUE)); + + if (dst->prev == src) { + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev) { + dst->prev->next = dst; + } + } else { + dst->next = src->next; + if (dst->next) { + dst->next->prev = dst; + } + } - return subsumed_size; - } + const size_t subsumed_size = src->size; + dst->size += subsumed_size; + auto erased = src->mapped ? pool.blocks.erase(src) : pool.unmapped.erase(src); + delete src; + src = nullptr; - BlockPool& get_pool(size_t size, aclrtStream stream) + return subsumed_size; + } + + BlockPool &get_pool(size_t size, aclrtStream stream) { // captures_underway is a conservative guess that the current stream may be // capturing. It's only non-empty if some thread has begun and not yet ended // a capture, so it's usually 0, and we can short-circuit // npuStreamCaptureStatus (which does a TLS lookup). if (C10_UNLIKELY(!captures_underway.empty())) { - for (auto& entry : captures_underway) { + for (auto &entry : captures_underway) { if (entry.second(stream)) { auto it1 = graph_pools.find(entry.first); TORCH_INTERNAL_ASSERT(it1 != graph_pools.end()); - if (size <= kSmallSize) { - return it1->second->small_blocks; - } else { - return it1->second->large_blocks; - } + if (size <= kSmallSize) { + return it1->second->small_blocks; + } else { + return it1->second->large_blocks; + } } } } @@ -2480,178 +2347,175 @@ class DeviceCachingAllocator { } } - StatTypes get_stat_types_for_pool(const BlockPool& pool) { - StatTypes stat_types = {false}; - stat_types[static_cast(StatType::AGGREGATE)] = true; - stat_types[static_cast( - pool.is_small ? 
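`try_merge_blocks` folds a free neighbour into `dst`: if the neighbour precedes `dst`, `dst` inherits its base pointer; either way `dst` grows by the neighbour's size and the neighbour is deleted. A compact sketch without the pool, event and mapping checks:

```cpp
#include <cassert>
#include <cstddef>

struct Block {
    char*  ptr  = nullptr;
    size_t size = 0;
    bool   allocated = false;
    Block* prev = nullptr;
    Block* next = nullptr;
};

// Returns the number of bytes absorbed, or 0 when src cannot be merged.
// Mirrors try_merge_blocks without the pool / event / mapping checks.
size_t try_merge(Block* dst, Block* src) {
    if (!src || src->allocated) {
        return 0;
    }
    if (dst->prev == src) {          // src sits directly before dst
        dst->ptr  = src->ptr;
        dst->prev = src->prev;
        if (dst->prev) dst->prev->next = dst;
    } else {                         // src sits directly after dst
        dst->next = src->next;
        if (dst->next) dst->next->prev = dst;
    }
    size_t subsumed = src->size;
    dst->size += subsumed;
    delete src;
    return subsumed;
}

int main() {
    static char segment[3072];
    Block* a = new Block{segment, 1024};
    Block* b = new Block{segment + 1024, 2048};
    a->next = b; b->prev = a;

    assert(try_merge(b, a) == 1024);     // b absorbs its predecessor
    assert(b->ptr == segment && b->size == 3072 && b->prev == nullptr);
    delete b;
}
```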
StatType::SMALL_POOL : StatType::LARGE_POOL)] = true; - return stat_types; - } - - bool should_split(const Block* block, size_t size) { - size_t remaining = block->size - size; - if (block->pool->is_small || - CachingAllocatorConfig::expandable_segments()) { - return remaining >= kMinBlockSize; - } else { - return (size < CachingAllocatorConfig::max_split_size()) && (remaining > kSmallSize); + StatTypes get_stat_types_for_pool(const BlockPool &pool) + { + StatTypes stat_types = { false }; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(pool.is_small ? StatType::SMALL_POOL : StatType::LARGE_POOL)] = true; + return stat_types; } - } - static size_t get_allocation_size(size_t size) { - if (size <= kSmallSize) { - return kSmallBuffer; - } else if (size < kMinLargeAlloc) { - return kLargeBuffer; - } else { - return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); - } - } - - bool get_free_block(AllocParams& p) { - BlockPool& pool = *p.pool; - - if (C10_UNLIKELY(set_fraction && - CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { - // Track block reuse interval only when garbage collection is enabled. - for (auto& b : pool.blocks) { - ++b->gc_count; - } - } - auto it = pool.blocks.lower_bound(&p.search_key); - if (it == pool.blocks.end() || (*it)->stream != p.stream()) { - return false; - } - - if ((*it)->expandable_segment_) { - if (CachingAllocatorConfig::expandable_segments()) { - // if we are allocated to the part of the block that is expandable - // for the purposes of "best fit" we consider its size to be the size it - // can expand to, not the size it currently is. This means that we - // sometimes have to search for blocks with bigger 'size' before - // choosing this segment. - auto expandable_size = [](Block* b) { - return b->size + (b->next && !b->next->mapped ? b->next->size : 0); - }; - auto next = it; - next++; - while ((*it)->expandable_segment_ && next != pool.blocks.end() && - (*next)->stream == p.stream() && - expandable_size(*next) < expandable_size(*it)) { - it = next++; - } - } else { - // Rarely expandable segments has been turned off after we have - // already allocated some blocks as expandable. For instance, - // since we cannot share expandable memory via IPC, someone might - // temporarily disable it. In this case we need to honor this request - // by only finding non-expandable blocks - do { - it++; - } while (it != pool.blocks.end() && (*it)->expandable_segment_ && - (*it)->stream == p.stream()); - if (it == pool.blocks.end() || (*it)->stream != p.stream()) { - return false; - } - } - } - - // Do not return an oversized block for a large request - if ((p.size() < CachingAllocatorConfig::max_split_size()) && - ((*it)->size >= CachingAllocatorConfig::max_split_size())) { - return false; - } - // Allow oversized block size to be rounded up but within a limit - if ((p.size() >= CachingAllocatorConfig::max_split_size()) && ((*it)->size >= p.size() + kLargeBuffer)) { - return false; - } - p.block = *it; - (*it)->gc_count = 0; // Denote this block has been used - pool.blocks.erase(it); - return true; - } - - bool trigger_free_memory_callbacks(AllocParams& p) { - bool freed_memory = false; - for (const auto& name : FreeNPUMemoryCallbacksRegistry()->Keys()) { - freed_memory |= - FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute(); - } - return freed_memory; - } - - void garbage_collect_cached_blocks(const std::shared_ptr& ctx) - { - // Free unused cached blocks to reclaim NPU memory. 
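`round_size` and `get_allocation_size` turn a user request into a block size (padded by 32 bytes and rounded up to the minimum block granularity) and a segment size to request from the driver. The constants below (512 B, 1 MB, 2 MB, 10 MB, 20 MB) are the common caching-allocator defaults and are only assumptions here; the NPU build may define them differently:

```cpp
#include <cassert>
#include <cstddef>

// Assumed constants -- the real kMinBlockSize / kSmallSize / ... may differ.
constexpr size_t kMinBlockSize  = 512;
constexpr size_t kSmallSize     = 1048576;    // requests <= 1 MB are "small"
constexpr size_t kSmallBuffer   = 2097152;    // small requests get 2 MB segments
constexpr size_t kLargeBuffer   = 20971520;   // mid-size requests get 20 MB segments
constexpr size_t kMinLargeAlloc = 10485760;   // above this, round per request
constexpr size_t kRoundLarge    = 2097152;    // ... up to a 2 MB multiple

// round_size: pad by 32 bytes, then round up to the block granularity.
size_t round_size(size_t size) {
    size += 32;
    if (size < kMinBlockSize) return kMinBlockSize;
    return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
}

// get_allocation_size: pick the segment size handed to the driver.
size_t get_allocation_size(size_t size) {
    if (size <= kSmallSize)    return kSmallBuffer;
    if (size < kMinLargeAlloc) return kLargeBuffer;
    return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
}

int main() {
    assert(round_size(100) == 512);             // tiny request -> one block
    assert(round_size(1000) == 1536);           // 1032 rounded up to 3 * 512
    assert(get_allocation_size(4096) == kSmallBuffer);
    assert(get_allocation_size(30000000) == 31457280);   // 15 * 2 MB
}
```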
- // Unlike release_cached_blocks(), this does not enforce synchronization and - // therefore should be of less overheads. - - size_t gc_threshold = static_cast( - CachingAllocatorConfig::garbage_collection_threshold() * - allowed_memory_maximum); - // No need to trigger GC yet - if (total_allocated_memory <= gc_threshold) { - return; - } - const auto target_size = total_allocated_memory - gc_threshold; - size_t gc_reclaimed = 0; - - // Calculate the total age of the free-able blocks. We'll use it later to - // get "avg age" threshold. - double total_age = 0.0; - int freeable_block_count = 0; - for (auto& b : large_blocks.blocks) { - if (!b->is_split()) { - total_age += b->gc_count; - ++freeable_block_count; - } - } - // No free-able blocks? - if (freeable_block_count == 0) { - return; - } - - c10_npu::npuSynchronizeDevice(true); - - // Repeat GC until we reach reclaim > target size. - bool block_freed = true; - while (gc_reclaimed < target_size && block_freed == true && - freeable_block_count > 0) { - // Free blocks exceeding this age threshold first. - double age_threshold = total_age / freeable_block_count; - // Stop iteration if we can no longer free a block. - block_freed = false; - - // Free blocks of > avg age. Don't stop upon reaching the target_size, - // we don't want this GC to be triggered frequently. - auto it = large_blocks.blocks.begin(); - while (it != large_blocks.blocks.end()) { - Block* block = *it; - ++it; - if (!block->is_split() && block->gc_count >= age_threshold) { - block_freed = true; - gc_reclaimed += block->size; - total_age -= block->gc_count; // Decrement the age - freeable_block_count--; // One less block that can be freed - release_block(block, ctx); - - ASCEND_LOGD("PTA CachingAllocator gc: free = %zu, cached = %lu, allocated = %lu", - block->size, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); - } - } - } - } - - bool alloc_block( - AllocParams &p, - bool isRetry, - const std::shared_ptr &ctx, - std::unique_lock &lock) + bool should_split(const Block *block, size_t size) { - size_t size = p.alloc_size; - void *ptr = nullptr; + size_t remaining = block->size - size; + if (block->pool->is_small || CachingAllocatorConfig::expandable_segments()) { + return remaining >= kMinBlockSize; + } else { + return (size < CachingAllocatorConfig::max_split_size()) && (remaining > kSmallSize); + } + } + + static size_t get_allocation_size(size_t size) + { + if (size <= kSmallSize) { + return kSmallBuffer; + } else if (size < kMinLargeAlloc) { + return kLargeBuffer; + } else { + return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + } + } + + bool get_free_block(AllocParams &p) + { + BlockPool &pool = *p.pool; + + if (C10_UNLIKELY(set_fraction && CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { + // Track block reuse interval only when garbage collection is enabled. + for (auto &b : pool.blocks) { + ++b->gc_count; + } + } + auto it = pool.blocks.lower_bound(&p.search_key); + if (it == pool.blocks.end() || (*it)->stream != p.stream()) { + return false; + } + + if ((*it)->expandable_segment_) { + if (CachingAllocatorConfig::expandable_segments()) { + // if we are allocated to the part of the block that is expandable + // for the purposes of "best fit" we consider its size to be the size it + // can expand to, not the size it currently is. This means that we + // sometimes have to search for blocks with bigger 'size' before + // choosing this segment. 
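`get_free_block` is a best-fit search over a set of blocks ordered by size: `lower_bound` on a search key yields the smallest block that is large enough, and two guards keep oversize blocks (at or above `max_split_size`) away from regular requests and bound how far an oversize request may round up. A reduced version ignoring streams and expandable segments; `kMaxSplitSize` here is an assumed constant standing in for `CachingAllocatorConfig::max_split_size()`:

```cpp
#include <cassert>
#include <cstddef>
#include <set>

struct Block { size_t size; };

struct BySize {
    bool operator()(const Block* a, const Block* b) const { return a->size < b->size; }
};

// Assumed limits for the sketch.
constexpr size_t kMaxSplitSize = 1 << 20;
constexpr size_t kLargeBuffer  = 20971520;

// Best-fit lookup with the two oversize guards from get_free_block.
Block* get_free_block(std::set<Block*, BySize>& pool, size_t request) {
    Block key{request};
    auto it = pool.lower_bound(&key);
    if (it == pool.end()) return nullptr;
    // 1. never give an oversize block to a regular-sized request
    if (request < kMaxSplitSize && (*it)->size >= kMaxSplitSize) return nullptr;
    // 2. an oversize request may round up, but only within one large buffer
    if (request >= kMaxSplitSize && (*it)->size >= request + kLargeBuffer) return nullptr;
    Block* found = *it;
    pool.erase(it);
    return found;
}

int main() {
    Block small{4096}, big{4 << 20};
    std::set<Block*, BySize> pool = {&small, &big};
    assert(get_free_block(pool, 1024) == &small);    // best fit: the 4 KB block
    assert(get_free_block(pool, 2048) == nullptr);   // only an oversize block is left
}
```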
+ auto expandable_size = [](Block *b) { + return b->size + (b->next && !b->next->mapped ? b->next->size : 0); + }; + auto next = it; + next++; + while ((*it)->expandable_segment_ && next != pool.blocks.end() && (*next)->stream == p.stream() && + expandable_size(*next) < expandable_size(*it)) { + it = next++; + } + } else { + // Rarely expandable segments has been turned off after we have + // already allocated some blocks as expandable. For instance, + // since we cannot share expandable memory via IPC, someone might + // temporarily disable it. In this case we need to honor this request + // by only finding non-expandable blocks + do { + it++; + } while (it != pool.blocks.end() && (*it)->expandable_segment_ && (*it)->stream == p.stream()); + if (it == pool.blocks.end() || (*it)->stream != p.stream()) { + return false; + } + } + } + + // Do not return an oversized block for a large request + if ((p.size() < CachingAllocatorConfig::max_split_size()) && + ((*it)->size >= CachingAllocatorConfig::max_split_size())) { + return false; + } + // Allow oversized block size to be rounded up but within a limit + if ((p.size() >= CachingAllocatorConfig::max_split_size()) && ((*it)->size >= p.size() + kLargeBuffer)) { + return false; + } + p.block = *it; + (*it)->gc_count = 0; // Denote this block has been used + pool.blocks.erase(it); + return true; + } + + bool trigger_free_memory_callbacks(AllocParams &p) + { + bool freed_memory = false; + for (const auto &name : FreeNPUMemoryCallbacksRegistry()->Keys()) { + freed_memory |= FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute(); + } + return freed_memory; + } + + void garbage_collect_cached_blocks(const std::shared_ptr& ctx, + std::unique_lock& lock) + { + // Free unused cached blocks to reclaim NPU memory. + // Unlike release_cached_blocks(), this does not enforce synchronization and + // therefore should be of less overheads. + + size_t gc_threshold = + static_cast(CachingAllocatorConfig::garbage_collection_threshold() * allowed_memory_maximum); + // No need to trigger GC yet + if (total_allocated_memory <= gc_threshold) { + return; + } + const auto target_size = total_allocated_memory - gc_threshold; + size_t gc_reclaimed = 0; + + // Calculate the total age of the free-able blocks. We'll use it later to + // get "avg age" threshold. + double total_age = 0.0; + int freeable_block_count = 0; + for (auto &b : large_blocks.blocks) { + if (!b->is_split()) { + total_age += b->gc_count; + ++freeable_block_count; + } + } + // No free-able blocks? + if (freeable_block_count == 0) { + return; + } + + { + UnlockGuard guard(lock); + c10_npu::npuSynchronizeDevice(true); + } + + // Repeat GC until we reach reclaim > target size. + bool block_freed = true; + while (gc_reclaimed < target_size && block_freed == true && freeable_block_count > 0) { + // Free blocks exceeding this age threshold first. + double age_threshold = total_age / freeable_block_count; + // Stop iteration if we can no longer free a block. + block_freed = false; + + // Free blocks of > avg age. Don't stop upon reaching the target_size, + // we don't want this GC to be triggered frequently. 
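The garbage-collection pass computes the average `gc_count` (age) of the free, unsplit large blocks and keeps freeing blocks older than that average until the reclaim target is reached. A stand-alone model of the same policy on plain structs:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

struct FreeBlock {
    size_t size;
    int    gc_count;    // how many allocation passes this block sat unused
    bool   released = false;
};

// Free blocks whose age exceeds the running average until `target` bytes are
// reclaimed -- the same loop structure as garbage_collect_cached_blocks.
size_t garbage_collect(std::vector<FreeBlock>& blocks, size_t target) {
    double total_age = 0.0;
    int freeable = 0;
    for (auto& b : blocks) {
        if (!b.released) { total_age += b.gc_count; ++freeable; }
    }
    size_t reclaimed = 0;
    bool freed_one = true;
    while (reclaimed < target && freed_one && freeable > 0) {
        double age_threshold = total_age / freeable;
        freed_one = false;
        for (auto& b : blocks) {
            if (!b.released && b.gc_count >= age_threshold) {
                b.released = true;
                reclaimed += b.size;
                total_age -= b.gc_count;
                --freeable;
                freed_one = true;
            }
        }
    }
    return reclaimed;
}

int main() {
    std::vector<FreeBlock> blocks = {{1 << 20, 10}, {2 << 20, 1}, {4 << 20, 6}};
    // Average age is ~5.7, so the two older blocks go first.
    size_t got = garbage_collect(blocks, 3 << 20);
    assert(got >= (3u << 20));
    assert(blocks[1].released == false);    // the youngest block survives
}
```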
+ auto it = large_blocks.blocks.begin(); + while (it != large_blocks.blocks.end()) { + Block *block = *it; + ++it; + if (!block->is_split() && block->gc_count >= age_threshold) { + block_freed = true; + gc_reclaimed += block->size; + total_age -= block->gc_count; // Decrement the age + freeable_block_count--; // One less block that can be freed + release_block(block, ctx); + + ASCEND_LOGD("PTA CachingAllocator gc: free = %zu, cached = %lu, allocated = %lu", block->size, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); + } + } + } + } + + bool alloc_block(AllocParams &p, bool isRetry, const std::shared_ptr &ctx, + std::unique_lock &lock) + { + size_t size = p.alloc_size; + void *ptr = nullptr; if (isRetry) { stats.num_alloc_retries += 1; @@ -2674,8 +2538,7 @@ class DeviceCachingAllocator { return bool(p.block); } else { auto active_pool = MemPoolContext::getActiveMemPool(); - if (active_pool && active_pool->allocator() && - p.pool->owner_PrivatePool) { + if (active_pool && active_pool->allocator() && p.pool->owner_PrivatePool) { ptr = active_pool->allocator()->raw_alloc(size); p.err = ptr ? ACL_ERROR_NONE : ACL_ERROR_RT_MEMORY_ALLOCATION; } else { @@ -2690,331 +2553,287 @@ class DeviceCachingAllocator { } } - ASCEND_LOGD("NPUCachingAllocator malloc by AclrtMallocAlign32: size=%zu", size); + ASCEND_LOGD("NPUCachingAllocator malloc by AclrtMallocAlign32: size=%zu", size); - if (p.pool->owner_PrivatePool) { - // The block is for a NPU graph's PrivatePool. - p.pool->owner_PrivatePool->npuMalloc_count++; - } + if (p.pool->owner_PrivatePool) { + // The block is for a NPU graph's PrivatePool. + p.pool->owner_PrivatePool->npuMalloc_count++; + } - total_allocated_memory += size; - p.block = new Block(p.device(), p.stream(), size, p.pool, (char *) ptr); - for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) { - update_stat(stats.segment[stat_type], 1); - update_stat(stats.reserved_bytes[stat_type], size); - }); - if (size >= CachingAllocatorConfig::max_split_size()) { - update_stat(stats.oversize_segments, 1); - } + total_allocated_memory += size; + p.block = new Block(p.device(), p.stream(), size, p.pool, (char *)ptr); + for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) { + update_stat(stats.segment[stat_type], 1); + update_stat(stats.reserved_bytes[stat_type], size); + }); + if (size >= CachingAllocatorConfig::max_split_size()) { + update_stat(stats.oversize_segments, 1); + } - ASCEND_LOGD("pta_memory acl_malloc: malloc = %zu, ret = %d", size, p.err); + ASCEND_LOGD("pta_memory acl_malloc: malloc = %zu, ret = %d", size, p.err); - // p.block came from new, not npuMalloc. It should not be nullptr here. - TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); -#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain( - torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{p.block->device, p.block->ptr, p.block->size}; - torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); -#endif - record_trace( - TraceEntry::SEGMENT_ALLOC, - int64_t(p.block->ptr), - p.block->size, - p.stream(), - p.device(), - ctx); - p.block->context_when_segment_allocated = ctx; - return true; - } - - /** Free one or more oversize blocks to the system allocator. 
But only enough to satisfy the target size **/ - bool release_available_cached_blocks(const AllocParams& p, - const std::shared_ptr& ctx) - { - if (CachingAllocatorConfig::max_split_size() == std::numeric_limits::max()) { - return false; - } - BlockPool &pool = *p.pool; - Block key = p.search_key; - key.size = - (key.size < CachingAllocatorConfig::max_split_size()) ? CachingAllocatorConfig::max_split_size() : key.size; - auto it = pool.blocks.lower_bound(&key); - - c10_npu::npuSynchronizeDevice(true); - - if (it == pool.blocks.end() || (*it)->stream != p.stream()) { - // No single block is large enough; free multiple oversize blocks, starting with the largest - if (it == pool.blocks.begin()) { - return false; - } - size_t totalReleased = 0; - // Back up one item. Now on the largest block for the correct stream - --it; - while ((totalReleased < key.size) && ((*it)->size >= CachingAllocatorConfig::max_split_size()) && - ((*it)->stream == p.stream())) { - auto cur = it; - totalReleased += (*it)->size; - if (it != pool.blocks.begin()) { - --it; - release_block(*cur, ctx); - } else { - release_block(*cur, ctx); - break; + // p.block came from new, not npuMalloc. It should not be nullptr here. + TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); + record_trace(TraceEntry::SEGMENT_ALLOC, int64_t(p.block->ptr), p.block->size, p.stream(), p.device(), ctx); + p.block->context_when_segment_allocated = ctx; + return true; + } + + /* * Free one or more oversize blocks to the system allocator. But only enough to satisfy the target size * */ + bool release_available_cached_blocks(const AllocParams& p, const std::shared_ptr& ctx, + std::unique_lock& lock) + { + if (CachingAllocatorConfig::max_split_size() == std::numeric_limits::max()) { + return false; } - } - if (totalReleased < key.size) { - return false; - } - } else { - release_block(*it, ctx); - } - return true; - } - - bool release_cached_blocks(bool check_error, const std::shared_ptr& context) - { - // Make sure event deque from taskqueue, then synchronize Event - c10_npu::npuSynchronizeDevice(check_error); - - // First ensure that all blocks that can't currently be allocated due to - // outstanding events are returned to the pool. - synchronize_and_free_events(check_error, context); - - // Free all non-split cached blocks - release_blocks(large_blocks, context); - release_blocks(small_blocks, context); - - for (auto it = graph_pools_freeable.begin(); - it != graph_pools_freeable.end();) { - // See notifyCaptureDestroy for the strategy here. - TORCH_INTERNAL_ASSERT(it->second->use_count == 0); - release_blocks(it->second->small_blocks, context); - release_blocks(it->second->large_blocks, context); - if (it->second->npuMalloc_count == 0) { - auto erase_count = graph_pools.erase(it->first); - TORCH_INTERNAL_ASSERT(erase_count == 1); - it = graph_pools_freeable.erase(it); + BlockPool &pool = *p.pool; + Block key = p.search_key; + key.size = + (key.size < CachingAllocatorConfig::max_split_size()) ? CachingAllocatorConfig::max_split_size() : key.size; + auto it = pool.blocks.lower_bound(&key); + + { + UnlockGuard guard(lock); + c10_npu::npuSynchronizeDevice(true); + } + + if (it == pool.blocks.end() || (*it)->stream != p.stream()) { + // No single block is large enough; free multiple oversize blocks, starting with the largest + if (it == pool.blocks.begin()) { + return false; + } + size_t totalReleased = 0; + // Back up one item. 
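Both `garbage_collect_cached_blocks` and `release_available_cached_blocks` now receive the caller's `std::unique_lock` and drop it around the device synchronization so other threads are not stalled on the allocator mutex while the device drains. A generic sketch of such an unlock-then-relock guard; the `UnlockGuard` body below is an assumption about how the helper is typically written, not the definition used by the allocator:

```cpp
#include <chrono>
#include <iostream>
#include <mutex>
#include <thread>

// Assumed shape of the guard: unlock on construction, re-lock on destruction.
class UnlockGuard {
public:
    explicit UnlockGuard(std::unique_lock<std::mutex>& lock) : lock_(lock) { lock_.unlock(); }
    ~UnlockGuard() { lock_.lock(); }
private:
    std::unique_lock<std::mutex>& lock_;
};

std::mutex allocator_mutex;

void slow_device_sync() {                   // stand-in for npuSynchronizeDevice
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
}

void reclaim_memory() {
    std::unique_lock<std::mutex> lock(allocator_mutex);
    // ... bookkeeping that needs the lock ...
    {
        UnlockGuard guard(lock);            // other threads may allocate while we wait
        slow_device_sync();
    }
    // The lock is held again here, so allocator state can be touched safely.
    std::cout << "reclaimed after device sync" << std::endl;
}

int main() {
    std::thread t(reclaim_memory);
    t.join();
}
```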
Now on the largest block for the correct stream + --it; + while ((totalReleased < key.size) && ((*it)->size >= CachingAllocatorConfig::max_split_size()) && + ((*it)->stream == p.stream())) { + auto cur = it; + totalReleased += (*it)->size; + if (it != pool.blocks.begin()) { + --it; + release_block(*cur, ctx); + } else { + release_block(*cur, ctx); + break; + } + } + if (totalReleased < key.size) { + return false; + } } else { - ++it; - } - } - - return true; - } - - void release_expandable_segment(Block* block) { - TORCH_INTERNAL_ASSERT( - block->size == block->expandable_segment_->size(), - "block disagrees with segment", PTA_ERROR(ErrCode::INTERNAL)); - TORCH_INTERNAL_ASSERT(!block->mapped, PTA_ERROR(ErrCode::INTERNAL)); - auto it = std::find( - expandable_segments_.begin(), - expandable_segments_.end(), - block->expandable_segment_); - TORCH_INTERNAL_ASSERT(it != expandable_segments_.end(), PTA_ERROR(ErrCode::INTERNAL)); - expandable_segments_.erase(it); - block->pool->unmapped.erase(block); - delete block->expandable_segment_; - block->expandable_segment_ = nullptr; - delete block; - block = nullptr; - } - - void release_block( - Block* block, - const std::shared_ptr& context) - { - TORCH_INTERNAL_ASSERT(!block->expandable_segment_, PTA_ERROR(ErrCode::INTERNAL)); - - record_trace( - TraceEntry::SEGMENT_FREE, - int64_t(block->ptr), - block->size, - block->stream, - block->device, - context ? context : block->context_when_segment_allocated); - - ASCEND_LOGI("NPUCachingAllocator free by aclrtFree: size=%zu", block->size); - aclrtFree((void*)block->ptr); - total_allocated_memory -= block->size; - - auto* pool = block->pool; - if (pool->owner_PrivatePool) { - // The npuFreed block belonged to a NPU graph's PrivatePool. - TORCH_INTERNAL_ASSERT(pool->owner_PrivatePool->npuMalloc_count > 0); - pool->owner_PrivatePool->npuMalloc_count--; - } - - StatTypes stat_types = get_stat_types_for_pool(*pool); - for_each_selected_stat_type(stat_types, [&](size_t stat_type) { - update_stat(stats.segment[stat_type], -1); - update_stat(stats.reserved_bytes[stat_type], -block->size); - }); - - if (block->size >= CachingAllocatorConfig::max_split_size()) - update_stat(stats.oversize_segments, -1); -#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); -#endif - ASCEND_LOGD("pta_memory acl_free: free_size = %zu", block->size); - - pool->blocks.erase(block); - delete block; - block = nullptr; - } - - void unmap_block( - Block* block, - const std::shared_ptr& context) - { - auto unmapped = block->expandable_segment_->unmap( - SegmentRange{block->ptr, block->size}); - if (unmapped.size == 0) { - return; - } - block->pool->blocks.erase(block); - - ptrdiff_t before_size = - static_cast(unmapped.ptr) - static_cast(block->ptr); - if (before_size > 0) { - // prev? -> before_free -> block - Block* before_free = new Block( - block->device, block->stream, before_size, block->pool, block->ptr); - before_free->expandable_segment_ = block->expandable_segment_; - before_free->splice(block->prev, block); - block->pool->blocks.insert(before_free); - } - - auto after_size = block->size - (before_size + unmapped.size); - if (after_size > 0) { - // block -> after_free -> next? 
- Block* after_free = new Block( - block->device, - block->stream, - after_size, - block->pool, - static_cast(unmapped.ptr) + unmapped.size); - after_free->expandable_segment_ = block->expandable_segment_; - after_free->splice(block, block->next); - block->pool->blocks.insert(after_free); - } - - block->ptr = unmapped.ptr; - block->size = unmapped.size; - block->mapped = false; - - try_merge_blocks(block, block->prev, *block->pool); - try_merge_blocks(block, block->next, *block->pool); - block->pool->unmapped.insert(block); - - // update statistics - total_allocated_memory -= unmapped.size; - StatTypes stat_types = get_stat_types_for_pool(*block->pool); - for_each_selected_stat_type(stat_types, [&](size_t stat_type) { - update_stat(stats.reserved_bytes[stat_type], -unmapped.size); - }); - - if (block->pool->owner_PrivatePool) { - // The npuFreed block belonged to a NPU graph's PrivatePool. - TORCH_INTERNAL_ASSERT( - block->pool->owner_PrivatePool->npuMalloc_count > 0); - block->pool->owner_PrivatePool->npuMalloc_count--; + release_block(*it, ctx); + } + return true; } -#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); -#endif - record_trace( - TraceEntry::SEGMENT_UNMAP, - int64_t(unmapped.ptr), - unmapped.size, - block->stream, - block->device, - context ? context : block->context_when_segment_allocated); - } - - void release_blocks( - BlockPool& pool, - const std::shared_ptr& context) - { - std::vector to_unmap; - // Frees all non-split blocks - auto it = pool.blocks.begin(); - while (it != pool.blocks.end()) { - Block *block = *it; - ++it; - if (block->expandable_segment_) { - // unmapping will mutate the free pool - // so just gather what needs to be freed - // to avoid invalidating the iterator - to_unmap.push_back(block); - } else if (!block->prev && !block->next) { - release_block(block, context); - } - } - for (Block* block : to_unmap) { - unmap_block(block, context); - if (!block->prev && !block->next) { - release_expandable_segment(block); - } - } - } - - EventPool::Event create_event_internal(int idx) { - // Leak the event pool to avoid shutdown issues. - static auto* event_pool = new EventPool(); - return event_pool->get(idx); - } - - void synchronize_and_free_events(bool check_error, const std::shared_ptr& context) - { - // This function syncs, so capture should not be underway. Might as well - // make sure capture-deferred end of life events get processed too. - TORCH_INTERNAL_ASSERT(captures_underway.empty()); - insert_events_deferred_until_no_capture(context); - - // Synchronize on outstanding events and then free associated blocks. - for (auto& st : npu_events) { - for (auto& e : st.second) { - EventPool::Event event = std::move(e.first); - Block* block = e.second; - if (check_error) { - NPU_CHECK_ERROR(aclrtSynchronizeEvent(*event)); - } else { - NPU_CHECK_WARN(aclrtSynchronizeEvent(*event)); + // npuSynchronizeDevice must be executed before this function can be called + bool release_cached_blocks(bool check_error, const std::shared_ptr &context) + { + // First ensure that all blocks that can't currently be allocated due to + // outstanding events are returned to the pool. 
+ synchronize_and_free_events(check_error, context); + + // Free all non-split cached blocks + release_blocks(large_blocks, context); + release_blocks(small_blocks, context); + + for (auto it = graph_pools_freeable.begin(); it != graph_pools_freeable.end();) { + // See notifyCaptureDestroy for the strategy here. + TORCH_INTERNAL_ASSERT(it->second->use_count == 0); + release_blocks(it->second->small_blocks, context); + release_blocks(it->second->large_blocks, context); + if (it->second->npuMalloc_count == 0) { + auto erase_count = graph_pools.erase(it->first); + TORCH_INTERNAL_ASSERT(erase_count == 1); + it = graph_pools_freeable.erase(it); + } else { + ++it; + } } -#ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventSynchronization(reinterpret_cast(event.get())); + + return true; + } + + void release_expandable_segment(Block *block) + { + TORCH_INTERNAL_ASSERT(block->size == block->expandable_segment_->size(), "block disagrees with segment", + PTA_ERROR(ErrCode::INTERNAL)); + TORCH_INTERNAL_ASSERT(!block->mapped, PTA_ERROR(ErrCode::INTERNAL)); + auto it = std::find(expandable_segments_.begin(), expandable_segments_.end(), block->expandable_segment_); + TORCH_INTERNAL_ASSERT(it != expandable_segments_.end(), PTA_ERROR(ErrCode::INTERNAL)); + expandable_segments_.erase(it); + block->pool->unmapped.erase(block); + delete block->expandable_segment_; + block->expandable_segment_ = nullptr; + delete block; + block = nullptr; + } + + void release_block(Block *block, const std::shared_ptr &context) + { + TORCH_INTERNAL_ASSERT(!block->expandable_segment_, PTA_ERROR(ErrCode::INTERNAL)); + + record_trace(TraceEntry::SEGMENT_FREE, int64_t(block->ptr), block->size, block->stream, block->device, + context ? context : block->context_when_segment_allocated); + + ASCEND_LOGI("NPUCachingAllocator free by aclrtFree: size=%zu", block->size); + aclrtFree((void *)block->ptr); + total_allocated_memory -= block->size; + + auto *pool = block->pool; + if (pool->owner_PrivatePool) { + // The npuFreed block belonged to a NPU graph's PrivatePool. + TORCH_INTERNAL_ASSERT(pool->owner_PrivatePool->npuMalloc_count > 0); + pool->owner_PrivatePool->npuMalloc_count--; } -#endif - ASCEND_LOGI("Event: aclrtSynchronizeEvent is successfully executed, event=%p", event.get()); - block->event_count--; - if (block->event_count == 0) { - free_block(block, context); + StatTypes stat_types = get_stat_types_for_pool(*pool); + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + update_stat(stats.segment[stat_type], -1); + update_stat(stats.reserved_bytes[stat_type], -block->size); + }); + + if (block->size >= CachingAllocatorConfig::max_split_size()) { + update_stat(stats.oversize_segments, -1); + } + ASCEND_LOGD("pta_memory acl_free: free_size = %zu", block->size); + + pool->blocks.erase(block); + delete block; + block = nullptr; + } + + void unmap_block(Block *block, const std::shared_ptr &context) + { + auto unmapped = block->expandable_segment_->unmap(SegmentRange{ block->ptr, block->size }); + if (unmapped.size == 0) { + return; + } + block->pool->blocks.erase(block); + + ptrdiff_t before_size = static_cast(unmapped.ptr) - static_cast(block->ptr); + if (before_size > 0) { + // prev? 
-> before_free -> block + Block *before_free = new Block(block->device, block->stream, before_size, block->pool, block->ptr); + before_free->expandable_segment_ = block->expandable_segment_; + before_free->splice(block->prev, block); + block->pool->blocks.insert(before_free); + } + + auto after_size = block->size - (before_size + unmapped.size); + if (after_size > 0) { + // block -> after_free -> next? + Block *after_free = new Block(block->device, block->stream, after_size, block->pool, + static_cast(unmapped.ptr) + unmapped.size); + after_free->expandable_segment_ = block->expandable_segment_; + after_free->splice(block, block->next); + block->pool->blocks.insert(after_free); + } + + block->ptr = unmapped.ptr; + block->size = unmapped.size; + block->mapped = false; + + try_merge_blocks(block, block->prev, *block->pool); + try_merge_blocks(block, block->next, *block->pool); + block->pool->unmapped.insert(block); + + // update statistics + total_allocated_memory -= unmapped.size; + StatTypes stat_types = get_stat_types_for_pool(*block->pool); + for_each_selected_stat_type(stat_types, + [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], -unmapped.size); }); + + if (block->pool->owner_PrivatePool) { + // The npuFreed block belonged to a NPU graph's PrivatePool. + TORCH_INTERNAL_ASSERT(block->pool->owner_PrivatePool->npuMalloc_count > 0); + block->pool->owner_PrivatePool->npuMalloc_count--; } - } + + record_trace(TraceEntry::SEGMENT_UNMAP, int64_t(unmapped.ptr), unmapped.size, block->stream, block->device, + context ? context : block->context_when_segment_allocated); } - npu_events.clear(); - } + void release_blocks(BlockPool &pool, const std::shared_ptr &context) + { + std::vector to_unmap; + // Frees all non-split blocks + auto it = pool.blocks.begin(); + while (it != pool.blocks.end()) { + Block *block = *it; + ++it; + if (block->expandable_segment_) { + // unmapping will mutate the free pool + // so just gather what needs to be freed + // to avoid invalidating the iterator + to_unmap.push_back(block); + } else if (!block->prev && !block->next) { + release_block(block, context); + } + } + for (Block *block : to_unmap) { + unmap_block(block, context); + if (!block->prev && !block->next) { + release_expandable_segment(block); + } + } + } + + EventPool::Event create_event_internal(int idx) + { + // Leak the event pool to avoid shutdown issues. + static auto *event_pool = new EventPool(); + return event_pool->get(idx); + } + + void synchronize_and_free_events(bool check_error, const std::shared_ptr &context) + { + // This function syncs, so capture should not be underway. Might as well + // make sure capture-deferred end of life events get processed too. + TORCH_INTERNAL_ASSERT(captures_underway.empty()); + insert_events_deferred_until_no_capture(context); + + // Synchronize on outstanding events and then free associated blocks. 
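The release_blocks() routine above first gathers expandable-segment blocks into a separate to_unmap list and only unmaps them in a second pass, because unmap_block() mutates the very pool being iterated. A minimal standalone sketch of that gather-then-release pattern, using hypothetical simplified types in place of the real Block and BlockPool:

```cpp
#include <set>
#include <vector>

// Hypothetical stand-in for Block; only the field the pattern needs.
struct SketchBlock {
    bool expandable = false;
};

void release_all(std::set<SketchBlock*>& pool)
{
    std::vector<SketchBlock*> to_unmap;
    for (auto it = pool.begin(); it != pool.end();) {
        SketchBlock* block = *it;
        ++it;                          // advance before any mutation
        if (block->expandable) {
            to_unmap.push_back(block); // defer: "unmapping" would mutate the pool
        } else {
            pool.erase(block);         // safe: the iterator has already moved on
            delete block;
        }
    }
    for (SketchBlock* block : to_unmap) {
        pool.erase(block);
        delete block;
    }
}
```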
+ for (auto &st : npu_events) { + for (auto &e : st.second) { + EventPool::Event event = std::move(e.first); + Block *block = e.second; + auto err = aclrtSynchronizeEvent(*event); + if (err != ACL_ERROR_NONE) { + if (check_error) { + NPU_CHECK_ERROR(err); + } else { + ASCEND_LOGE("Event: aclrtSynchronizeEvent failed, event = %p", event.get()); + } + } else { + ASCEND_LOGI("Event: aclrtSynchronizeEvent is successfully executed, event=%p", event.get()); + } +#ifndef BUILD_LIBTORCH + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuEventSynchronization(reinterpret_cast(event.get())); + } +#endif + + block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + } + } + } - void remove_npugraph_stream_uses(Block* block) + npu_events.clear(); + } + + void remove_npugraph_stream_uses(Block *block) { // remove stream uses added during npugraph capture // (i.e., block->stream_uses - block->npugraph_stream_uses) - if (C10_UNLIKELY( - block_to_npugraph_stream_uses.find(block) != block_to_npugraph_stream_uses.end())) { + if (C10_UNLIKELY(block_to_npugraph_stream_uses.find(block) != block_to_npugraph_stream_uses.end())) { stream_set streams(std::move(block->stream_uses)); AT_ASSERT(block->stream_uses.empty()); - for (auto& stream : streams) { - if (block_to_npugraph_stream_uses[block].find(stream) == - block_to_npugraph_stream_uses[block].end()) { + for (auto &stream : streams) { + if (block_to_npugraph_stream_uses[block].find(stream) == block_to_npugraph_stream_uses[block].end()) { block->stream_uses.insert(stream); } } @@ -3022,7 +2841,7 @@ class DeviceCachingAllocator { } } - void insert_events(Block* block) + void insert_events(Block *block) { aclrtContext compiler_ctx = aclrtContext(); aclError ret_ctx = aclrtGetCurrentContext(&compiler_ctx); @@ -3030,7 +2849,7 @@ class DeviceCachingAllocator { stream_set streams(std::move(block->stream_uses)); AT_ASSERT(block->stream_uses.empty(), PTA_ERROR(ErrCode::VALUE)); - for (auto& stream : streams) { + for (auto &stream : streams) { NPU_CHECK_ERROR(c10_npu::SetDevice(stream.device_index())); EventPool::Event event = create_event_internal(stream.device_index()); @@ -3045,11 +2864,10 @@ class DeviceCachingAllocator { } } - void insert_events_deferred_until_no_capture( - const std::shared_ptr& context) + void insert_events_deferred_until_no_capture(const std::shared_ptr &context) { if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) { - for (auto* block : needs_events_deferred_until_no_capture) { + for (auto *block : needs_events_deferred_until_no_capture) { TORCH_INTERNAL_ASSERT(!block->stream_uses.empty()); // only streams recorded before npugraph will be used to insert events // since we know all streams recorded during npugraph must have @@ -3065,85 +2883,77 @@ class DeviceCachingAllocator { } } - void process_events(const std::shared_ptr& context) - { - insert_events_deferred_until_no_capture(context); - - // Process outstanding npuEvents. Events that are completed are removed - // from the queue, and the 'event_count' for the corresponding allocation - // is decremented. Stops at the first event which has not been completed. - // Since events on different devices or streams may occur out of order, - // the processing of some events may be delayed. 
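insert_events() above attaches one event per stream that used a block and increments block->event_count; the draining loops (synchronize_and_free_events() above, process_events() below) decrement that count and only hand the block back to free_block() once it reaches zero. A compact sketch of this counting protocol with hypothetical event and block types:

```cpp
#include <deque>
#include <utility>

// Hypothetical stand-ins: a pollable event and a block with a pending-event count.
struct SketchEvent {
    bool completed = false;
    bool query() const { return completed; }
};

struct PendingBlock {
    int event_count = 0;
};

// Walk one per-stream queue: stop at the first incomplete event (later events on
// the same stream cannot have completed either), otherwise retire the event and
// release the block once nothing is pending on it.
void drain(std::deque<std::pair<SketchEvent, PendingBlock*>>& queue)
{
    while (!queue.empty()) {
        auto& entry = queue.front();
        if (!entry.first.query()) {
            break;
        }
        PendingBlock* block = entry.second;
        if (--block->event_count == 0) {
            // the real allocator calls free_block(block, context) here
        }
        queue.pop_front();
    }
}
```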
- for (auto it = npu_events.begin(); it != npu_events.end();) { - while (!it->second.empty()) { - auto& e = it->second.front(); - EventPool::Event event = std::move(e.first); - Block* block = e.second; - - if (!event->query()) { - e.first = std::move(event); - break; - } - - block->event_count--; - if (block->event_count == 0) { - free_block(block, context); - } - it->second.pop_front(); - } - - if (it->second.empty()) { - it = npu_events.erase(it); - } else { - it++; - } - } - } - - // Accumulates sizes of all memory blocks for given device in given pool - void cache_info_aux(BlockPool& blocks, size_t* total, size_t* largest) { - for (auto it = blocks.blocks.begin(); it != blocks.blocks.end(); ++it) { - size_t blocksize = (*it)->size; - *total += blocksize; - if (blocksize > *largest) { - *largest = blocksize; - } - } - } - - void record_trace( - TraceEntry::Action action, - int64_t addr, - size_t size, - aclrtStream stream, - int device, - std::shared_ptr context) - { - if (!record_history) {return;} - - auto te = TraceEntry( - action, - device, - addr, - size, - stream, - record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr); - - if (record_history) { - if (alloc_trace->size() < alloc_trace_max_entries_) { - alloc_trace->emplace_back(te); - } else { - (*alloc_trace)[alloc_trace_next++] = te; - if (alloc_trace_next == alloc_trace_max_entries_) { - alloc_trace_next = 0; - } - } - } - } + void process_events(const std::shared_ptr &context) + { + insert_events_deferred_until_no_capture(context); + + // Process outstanding npuEvents. Events that are completed are removed + // from the queue, and the 'event_count' for the corresponding allocation + // is decremented. Stops at the first event which has not been completed. + // Since events on different devices or streams may occur out of order, + // the processing of some events may be delayed. + for (auto it = npu_events.begin(); it != npu_events.end();) { + while (!it->second.empty()) { + auto &e = it->second.front(); + EventPool::Event event = std::move(e.first); + Block *block = e.second; + + if (!event->query()) { + e.first = std::move(event); + break; + } + + block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + } + it->second.pop_front(); + } + + if (it->second.empty()) { + it = npu_events.erase(it); + } else { + it++; + } + } + } + + // Accumulates sizes of all memory blocks for given device in given pool + void cache_info_aux(BlockPool &blocks, size_t *total, size_t *largest) + { + for (auto it = blocks.blocks.begin(); it != blocks.blocks.end(); ++it) { + size_t blocksize = (*it)->size; + *total += blocksize; + if (blocksize > *largest) { + *largest = blocksize; + } + } + } + void record_trace(TraceEntry::Action action, int64_t addr, size_t size, aclrtStream stream, int device, + std::shared_ptr context) + { + if (!record_history) { + return; + } + + auto te = TraceEntry(action, device, addr, size, stream, + record_context_ >= RecordContext::ALLOC ? 
std::move(context) : nullptr); + + if (record_history) { + if (alloc_trace->size() < alloc_trace_max_entries_) { + alloc_trace->emplace_back(te); + } else { + (*alloc_trace)[alloc_trace_next++] = te; + if (alloc_trace_next == alloc_trace_max_entries_) { + alloc_trace_next = 0; + } + } + } + } }; -static void uncached_delete(void* ptr) +static void uncached_delete(void *ptr) { if (c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { c10_npu::npuSynchronizeDevice(false); @@ -3152,278 +2962,267 @@ static void uncached_delete(void* ptr) NPU_CHECK_ERROR(aclrtFree(ptr)); } -void local_raw_delete(void* ptr); +void local_raw_delete(void *ptr); class NpuCachingAllocator : public NPUAllocator { - private: - - std::mutex mutex; - - // allocated blocks by device pointer - ska::flat_hash_map allocated_blocks; +private: + std::mutex mutex; - void add_allocated_block(Block* block) { - std::lock_guard lock(mutex); - allocated_blocks[block->ptr] = block; - } + // allocated blocks by device pointer + ska::flat_hash_map allocated_blocks; - public: + void add_allocated_block(Block *block) + { + std::lock_guard lock(mutex); + allocated_blocks[block->ptr] = block; + } - std::vector> device_allocator; +public: + std::vector> device_allocator; - Block* get_allocated_block(void* ptr, bool remove = false) { - std::lock_guard lock(mutex); - auto it = allocated_blocks.find(ptr); - if (it == allocated_blocks.end()) { - return nullptr; - } - Block* block = it->second; - if (remove) { - allocated_blocks.erase(it); + Block *get_allocated_block(void *ptr, bool remove = false) + { + std::lock_guard lock(mutex); + auto it = allocated_blocks.find(ptr); + if (it == allocated_blocks.end()) { + return nullptr; + } + Block *block = it->second; + if (remove) { + allocated_blocks.erase(it); + } + return block; } - return block; - } - void init(int device_count) override + void init(int device_count) override { - int size = static_cast(device_allocator.size()); - if (size < device_count) { - device_allocator.resize(device_count); - for (const auto i : c10::irange(size, device_count)) { - device_allocator[i] = std::make_unique(); - } + int size = static_cast(device_allocator.size()); + if (size < device_count) { + device_allocator.resize(device_count); + for (const auto i : c10::irange(size, device_count)) { + device_allocator[i] = std::make_unique(); + } + } } - } - bool initialized() override + bool initialized() override { return !device_allocator.empty(); } - /** allocates a block which is safe to use from the provided stream */ - void malloc(void** devPtr, int device, size_t size, aclrtStream stream) { - TORCH_INTERNAL_ASSERT(0 <= device && static_cast(device) < device_allocator.size(), - "Allocator not initialized for device ", device, ": did you call init?", - PTA_ERROR(ErrCode::PARAM)); - Block* block = device_allocator[device]->malloc(device, size, stream); + /* * allocates a block which is safe to use from the provided stream */ + void malloc(void **devPtr, int device, size_t size, aclrtStream stream) + { + TORCH_INTERNAL_ASSERT(0 <= device && static_cast(device) < device_allocator.size(), + "Allocator not initialized for device ", device, ": did you call init?", PTA_ERROR(ErrCode::PARAM)); + Block *block = device_allocator[device]->malloc(device, size, stream); - add_allocated_block(block); - *devPtr = static_cast(block->ptr); + add_allocated_block(block); + *devPtr = static_cast(block->ptr); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if 
(C10_UNLIKELY(trigger)) { - trigger->traceNpuMemoryAllocation( - reinterpret_cast(*devPtr)); + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuMemoryAllocation(reinterpret_cast(*devPtr)); + } +#endif } + + void free(void *ptr) + { + if (!ptr) { + return; + } + Block *block = get_allocated_block(ptr, true); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } +#ifndef BUILD_LIBTORCH + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuMemoryDeallocation(reinterpret_cast(block->ptr)); + } #endif - } + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; + device_allocator[block->device]->free(block); + } + + void setMemoryFraction(double fraction, int device) override + { + TORCH_INTERNAL_ASSERT(0 <= device && device < device_allocator.size(), "Allocator not initialized for device ", + device, ": did you call init?", PTA_ERROR(ErrCode::PARAM)); + TORCH_INTERNAL_ASSERT(0 <= fraction && fraction <= 1, "invalid fraction:", fraction, + ". Please set within (0, 1).", PTA_ERROR(ErrCode::PARAM)); - void free(void* ptr) { - if (!ptr) { - return; + c10_npu::SetDevice(device); + + device_allocator[device]->setMemoryFraction(fraction); } - Block* block = get_allocated_block(ptr, true); - if (!block) { - AT_ERROR("invalid device pointer: ", ptr); + + void recordHistory(bool enabled, CreateContextFn context_recorder, size_t alloc_trace_max_entries, + RecordContext when) override + { + for (auto &allocator : device_allocator) { + allocator->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when); + } } -#ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuMemoryDeallocation( - reinterpret_cast(block->ptr)); + + bool isHistoryEnabled() override + { + int device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + return device_allocator[device]->isHistoryEnabled(); } -#endif - auto orig_block_ptr = block->ptr; - auto orig_block_size = block->size; - device_allocator[block->device]->free(block); - } - - void setMemoryFraction(double fraction, int device) override - { - TORCH_INTERNAL_ASSERT( - 0 <= device && device < device_allocator.size(), - "Allocator not initialized for device ", - device, - ": did you call init?", PTA_ERROR(ErrCode::PARAM)); - TORCH_INTERNAL_ASSERT( - 0 <= fraction && fraction <= 1, - "invalid fraction:", - fraction, - ". 
Please set within (0, 1).", PTA_ERROR(ErrCode::PARAM)); - - c10_npu::SetDevice(device); - - device_allocator[device]->setMemoryFraction(fraction); - } - - void recordHistory(bool enabled, CreateContextFn context_recorder, - size_t alloc_trace_max_entries, - RecordContext when) override - { - for (auto& allocator : device_allocator) { - allocator->recordHistory(enabled, context_recorder, - alloc_trace_max_entries, when); - } - } - - bool isHistoryEnabled() override - { - int device = 0; - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - return device_allocator[device]->isHistoryEnabled(); - } - - bool checkPoolLiveAllocations( - c10::DeviceIndex device, - MempoolId_t mempool_id, - const std::unordered_set& expected_live_allocations) override - { - return device_allocator[device]->checkPoolLiveAllocations(mempool_id, expected_live_allocations); - } - - void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override - { - for (auto& allocator : device_allocator) { - allocator->attachOutOfMemoryObserver(observer); - } - } - - bool checkUceInMemPool(int device) override - { - return device_allocator[device]-> checkUceInMemPool(); - } - - bool checkBlockIsSafe(const c10::DataPtr& ptr) override - { - if (!ptr.get()) { - return true; - } - if (ptr.get_deleter() != &local_raw_delete) { - return true; - } - Block* block = get_allocated_block(ptr.get()); - TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); - return block->is_safe; - } - - void markAllBlockUnsafe(int device) override - { - return device_allocator[device]-> markAllBlockUnsafe(); - } - - void updateBlockToSafe(const c10::DataPtr &ptr) override - { - if (!ptr.get()) { - return; - } - if (ptr.get_deleter() != &local_raw_delete) { - return; - } - Block* block = get_allocated_block(ptr.get()); - TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); - if (block->is_safe == false) { - ASCEND_LOGI("Triggers to refresh the data of the unsafe memory block and remove the unsafe flag"); - } - block->is_safe = true; - } - - void cleanEvent() override - { - int count = static_cast(device_allocator.size()); - for (int i = 0; i < count; i++) - device_allocator[i]->release_and_free_events(); - } - - void emptyCache(bool check_error) override - { - int count = static_cast(device_allocator.size()); - for (int i = 0; i < count; i++) - device_allocator[i]->emptyCache(i, check_error); - } - - void* getBaseAllocation(void* ptr, size_t* outSize) override - { - Block* block = get_allocated_block(ptr); - if (!block) { - AT_ERROR("invalid device pointer: ", ptr); - } - return device_allocator[block->device]->getBaseAllocation(block, outSize); - } - - void recordStream(const c10::DataPtr& ptr, c10_npu::NPUStream stream) override - { - // Empty tensor's storage().data() might be a null ptr. As there is no - // blocks associated with those tensors, it is fine to do nothing here. 
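The DataPtr-facing entry points of this class (recordStream(), checkBlockIsSafe(), updateBlockToSafe(), eraseStream()) all identify "their" allocations purely by the deleter stored in the c10::DataPtr: anything not carrying local_raw_delete, such as the uncached_delete path or memory shared in from another process, is skipped. A small illustrative helper expressing that test, assuming the declarations of this file are in scope:

```cpp
// Illustrative predicate: true only for pointers handed out by this caching
// allocator. Uncached allocations carry &uncached_delete and cross-process
// shared memory carries its own deleter, so both are (correctly) rejected.
static bool owned_by_caching_allocator(const c10::DataPtr& ptr)
{
    return ptr.get() != nullptr && ptr.get_deleter() == &local_raw_delete;
}
```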
- if (!ptr.get()) { - return; - } - - // If a tensor is not allocated by this instance, simply skip - // This usually happens when NPU tensors are shared across processes, - // we have implemented reference counting based sharing mechanism to - // guarantee tensors won't be accidentally freed by one process while - // they are still being used in another - if (ptr.get_deleter() != &local_raw_delete) { - return; - } - - Block* block = get_allocated_block(ptr.get()); - // block must not be null reaching here - TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); - device_allocator[block->device]->recordStream(block, stream); - } - - void eraseStream(const c10::DataPtr& ptr, c10_npu::NPUStream stream) - { - if (!ptr.get()) { - return; - } - - // If a tensor is not allocated by this instance, simply skip - // This usually happens when NPU tensors are shared across processes, - // we have implemented reference counting based sharing mechanism to - // guarantee tensors won't be accidentally freed by one process while - // they are still being used in another - if (ptr.get_deleter() != &local_raw_delete) { - TORCH_NPU_WARN_ONCE("Tensor not is not allocated by NPUCachingAllocator, skip eraseStream."); - return; - } - - Block* block = get_allocated_block(ptr.get()); - if (!block) { - AT_ERROR("invalid device pointer: ", ptr.get()); - } - - if (block->stream != c10_npu::getCurrentNPUStream(block->device).stream(false)) { - // If the Stream applying for tensor block different from - // the stream of submiting event wait task in HCCL synchronize() - // method, the recordSteam can not be erased. - // New tensor creation may use the block before HCCL op is complete. - return; - } - - device_allocator[block->device]->eraseStream(block, stream); - } - - SnapshotInfo snapshot() override - { - SnapshotInfo result; - int count = static_cast(device_allocator.size()); - for (int i = 0; i < count; i++) { - result.device_traces.emplace_back(device_allocator[i]->trace()); - auto snap = device_allocator[i]->snapshot(); - result.segments.insert(result.segments.end(), snap.begin(), snap.end()); - } - return result; - } - - std::shared_ptr getCheckpointState( - c10::DeviceIndex device, - MempoolId_t id) override + + bool checkPoolLiveAllocations(c10::DeviceIndex device, MempoolId_t mempool_id, + const std::unordered_set &expected_live_allocations) override + { + return device_allocator[device]->checkPoolLiveAllocations(mempool_id, expected_live_allocations); + } + + void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override + { + for (auto &allocator : device_allocator) { + allocator->attachOutOfMemoryObserver(observer); + } + } + + bool checkUceInMemPool(int device) override + { + return device_allocator[device]->checkUceInMemPool(); + } + + bool checkBlockIsSafe(const c10::DataPtr &ptr) override + { + if (!ptr.get()) { + return true; + } + if (ptr.get_deleter() != &local_raw_delete) { + return true; + } + Block *block = get_allocated_block(ptr.get()); + TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); + return block->is_safe; + } + + void markAllBlockUnsafe(int device) override + { + return device_allocator[device]->markAllBlockUnsafe(); + } + + void updateBlockToSafe(const c10::DataPtr &ptr) override + { + if (!ptr.get()) { + return; + } + if (ptr.get_deleter() != &local_raw_delete) { + return; + } + Block *block = get_allocated_block(ptr.get()); + TORCH_INTERNAL_ASSERT(block != nullptr, "No 
allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); + if (block->is_safe == false) { + ASCEND_LOGI("Triggers to refresh the data of the unsafe memory block and remove the unsafe flag"); + } + block->is_safe = true; + } + + void cleanEvent() override + { + int count = static_cast(device_allocator.size()); + for (int i = 0; i < count; i++) { + device_allocator[i]->release_and_free_events(); + } + } + + void emptyCache(bool check_error) override + { + int count = static_cast(device_allocator.size()); + for (int i = 0; i < count; i++) { + device_allocator[i]->emptyCache(i, check_error); + } + } + + void *getBaseAllocation(void *ptr, size_t *outSize) override + { + Block *block = get_allocated_block(ptr); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + return device_allocator[block->device]->getBaseAllocation(block, outSize); + } + + void recordStream(const c10::DataPtr &ptr, c10_npu::NPUStream stream) override + { + // Empty tensor's storage().data() might be a null ptr. As there is no + // blocks associated with those tensors, it is fine to do nothing here. + if (!ptr.get()) { + return; + } + + // If a tensor is not allocated by this instance, simply skip + // This usually happens when NPU tensors are shared across processes, + // we have implemented reference counting based sharing mechanism to + // guarantee tensors won't be accidentally freed by one process while + // they are still being used in another + if (ptr.get_deleter() != &local_raw_delete) { + return; + } + + Block *block = get_allocated_block(ptr.get()); + // block must not be null reaching here + TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); + device_allocator[block->device]->recordStream(block, stream); + } + + void eraseStream(const c10::DataPtr &ptr, c10_npu::NPUStream stream) + { + if (!ptr.get()) { + return; + } + + // If a tensor is not allocated by this instance, simply skip + // This usually happens when NPU tensors are shared across processes, + // we have implemented reference counting based sharing mechanism to + // guarantee tensors won't be accidentally freed by one process while + // they are still being used in another + if (ptr.get_deleter() != &local_raw_delete) { + TORCH_NPU_WARN_ONCE("Tensor not is not allocated by NPUCachingAllocator, skip eraseStream."); + return; + } + + Block *block = get_allocated_block(ptr.get()); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr.get()); + } + + if (block->stream != c10_npu::getCurrentNPUStream(block->device).stream(false)) { + // If the Stream applying for tensor block different from + // the stream of submiting event wait task in HCCL synchronize() + // method, the recordSteam can not be erased. + // New tensor creation may use the block before HCCL op is complete. 
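+            // Put differently: the record may only be erased while the block is still bound
+            // to this device's current default stream. If it was recorded for a different
+            // stream, keeping the record is exactly what stops the allocator from reusing
+            // the block before the in-flight HCCL kernel has finished with it.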
+ return; + } + + device_allocator[block->device]->eraseStream(block, stream); + } + + SnapshotInfo snapshot() override + { + SnapshotInfo result; + int count = static_cast(device_allocator.size()); + for (int i = 0; i < count; i++) { + result.device_traces.emplace_back(device_allocator[i]->trace()); + auto snap = device_allocator[i]->snapshot(); + result.segments.insert(result.segments.end(), snap.begin(), snap.end()); + } + return result; + } + + std::shared_ptr getCheckpointState(c10::DeviceIndex device, MempoolId_t id) override { return device_allocator[device]->getCheckpointState(id); } - /** + /* * * @brief Checkpoint the private pool state identified in `as` to its prior * state * @@ -3435,9 +3234,7 @@ class NpuCachingAllocator : public NPUAllocator { * @return CheckpointDelta - Freed Pointers and DataPtrs that contain deleter * functions for all allocated blocks in the new checkpoint state. */ - CheckpointDelta setCheckpointPoolState( - c10::DeviceIndex device, - std::shared_ptr as) override + CheckpointDelta setCheckpointPoolState(c10::DeviceIndex device, std::shared_ptr as) override { std::shared_ptr pps = std::dynamic_pointer_cast(as); @@ -3446,26 +3243,21 @@ class NpuCachingAllocator : public NPUAllocator { auto rr = device_allocator[device]->setCheckpointPoolState(*pps); CheckpointDelta cpd; - for (void* ptr : rr.allocations_freed) { + for (void *ptr : rr.allocations_freed) { // remove block get_allocated_block(ptr, true); cpd.ptrs_freed.push_back(ptr); } - for (Block* block : rr.allocations_created) { + for (Block *block : rr.allocations_created) { add_allocated_block(block); - cpd.dataptrs_allocd.emplace_back( - block->ptr, - block->ptr, - &local_raw_delete, + cpd.dataptrs_allocd.emplace_back(block->ptr, block->ptr, &local_raw_delete, c10::Device(c10::DeviceType::PrivateUse1, device)); } return cpd; } - void beginAllocateToPool( - c10::DeviceIndex device, - MempoolId_t mempool_id, + void beginAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id, std::function filter) override { assertValidDevice(device); @@ -3484,111 +3276,140 @@ class NpuCachingAllocator : public NPUAllocator { device_allocator[device]->releasePool(std::move(mempool_id)); } - c10::DataPtr allocate(size_t size) const override - { - constexpr size_t one_exa_bytes = 1152921504606846976ULL; - if (size >= one_exa_bytes) { - AT_ERROR("NPU out of memory. 
Tried to allocate more than 1EB memory."); - } - int device = 0; - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - void* devPtr = nullptr; - void (*deleteFunc)(void*) = &local_raw_delete; - - if (size != 0) { - if (c10_npu::option::OptionsManager::CheckForceUncached()) { - deleteFunc = &uncached_delete; - size_t alloc_size = size + 32; - NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&devPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); - ASCEND_LOGD("Without NPUCachingAllocator, malloc by " - "AclrtMallocAlign32: size=%zu", - alloc_size); - } else { - const_cast(this)->malloc(&devPtr, device, size, - c10_npu::getCurrentNPUStreamNoWait(device)); - } - } - return {devPtr, devPtr, deleteFunc, c10::Device(c10::DeviceType::PrivateUse1, device)}; - } - - c10::DeleterFnPtr raw_deleter() const override - { - if (c10_npu::option::OptionsManager::CheckForceUncached()) { - return &uncached_delete; - } else { - return &local_raw_delete; - } - } - - void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) override - { - device_allocator[dev_id]->cacheInfo(cachedAndFree, largestBlock); - } - - void assertValidDevice(int device) - { - const auto device_num = device_allocator.size(); - TORCH_CHECK(0 <= device && device < static_cast(device_num), "Invalid device argument ", device, - ": did you call init?", PTA_ERROR(ErrCode::PARAM)); - } - - DeviceStats getDeviceStats(int device) override - { - assertValidDevice(device); - return device_allocator[device]->getStats(); - } - - void resetAccumulatedStats(int device) override - { - assertValidDevice(device); - device_allocator[device]->resetAccumulatedStats(); - } - - void resetPeakStats(int device) override - { - assertValidDevice(device); - device_allocator[device]->resetPeakStats(); - } - - void* raw_alloc(size_t nbytes) override - { - if (nbytes == 0) { - return nullptr; - } - int device = 0; - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - void* r = nullptr; - malloc(&r, device, nbytes, c10_npu::getCurrentNPUStreamNoWait(device)); - return r; - } - - void* raw_alloc_with_stream(size_t nbytes, aclrtStream stream) override - { - if (nbytes == 0) { - return nullptr; - } - int device; - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - void* r = nullptr; - malloc(&r, device, nbytes, stream); - return r; - } - - void raw_delete(void* ptr) override - { - this->free(ptr); - } - - void FreeDeviceCachedMemory(int device) override - { - device_allocator[device]->emptyCache(device, true); - } - - std::string name() override - { - return "native"; - } + c10::DataPtr allocate(size_t size) const override + { + constexpr size_t one_exa_bytes = 1152921504606846976ULL; + if (size >= one_exa_bytes) { + AT_ERROR("NPU out of memory. 
Tried to allocate more than 1EB memory."); + } + int device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + void *devPtr = nullptr; + void (*deleteFunc)(void *) = &local_raw_delete; + + if (size != 0) { + if (c10_npu::option::OptionsManager::CheckForceUncached()) { + deleteFunc = &uncached_delete; + size_t alloc_size = size + 32; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&devPtr, alloc_size, + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + ASCEND_LOGD("Without NPUCachingAllocator, malloc by " + "AclrtMallocAlign32: size=%zu", + alloc_size); + } else { + const_cast(this)->malloc(&devPtr, device, size, + c10_npu::getCurrentNPUStreamNoWait(device)); + } + } + return { devPtr, devPtr, deleteFunc, c10::Device(c10::DeviceType::PrivateUse1, device) }; + } + + c10::DataPtr allocate_with_aligned(size_t size, size_t base_addr_aligned_kb) const override + { + constexpr size_t one_exa_bytes = 1152921504606846976ULL; + if (C10_UNLIKELY(size >= one_exa_bytes)) { + AT_ERROR("NPU out of memory. Tried to allocate more than 1EB memory."); + } + int device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + void *realPtr = nullptr; + void (*deleteFunc)(void *) = &local_raw_delete; + + size_t aligned = base_addr_aligned_kb * 1024; + if (size != 0) { + if (c10_npu::option::OptionsManager::CheckForceUncached()) { + deleteFunc = &uncached_delete; + size_t alloc_size = size + 32 + aligned; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&realPtr, alloc_size, + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + ASCEND_LOGD("Without NPUCachingAllocator, malloc by " + "AclrtMallocAlign32: size=%zu", alloc_size); + } else { + const_cast(this)->malloc(&realPtr, device, size + aligned, + c10_npu::getCurrentNPUStreamNoWait(device)); + } + } + void *devPtr = reinterpret_cast(aligned * ((reinterpret_cast(realPtr) + aligned - 1) / aligned)); + return { devPtr, realPtr, deleteFunc, c10::Device(c10::DeviceType::PrivateUse1, device) }; + } + + c10::DeleterFnPtr raw_deleter() const override + { + if (c10_npu::option::OptionsManager::CheckForceUncached()) { + return &uncached_delete; + } else { + return &local_raw_delete; + } + } + + void cacheInfo(int dev_id, size_t *cachedAndFree, size_t *largestBlock) override + { + device_allocator[dev_id]->cacheInfo(cachedAndFree, largestBlock); + } + + void assertValidDevice(int device) + { + const auto device_num = device_allocator.size(); + TORCH_CHECK(0 <= device && device < static_cast(device_num), "Invalid device argument ", device, + ": did you call init?", PTA_ERROR(ErrCode::PARAM)); + } + + DeviceStats getDeviceStats(int device) override + { + assertValidDevice(device); + return device_allocator[device]->getStats(); + } + + void resetAccumulatedStats(int device) override + { + assertValidDevice(device); + device_allocator[device]->resetAccumulatedStats(); + } + + void resetPeakStats(int device) override + { + assertValidDevice(device); + device_allocator[device]->resetPeakStats(); + } + + void *raw_alloc(size_t nbytes) override + { + if (nbytes == 0) { + return nullptr; + } + int device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + void *r = nullptr; + malloc(&r, device, nbytes, c10_npu::getCurrentNPUStreamNoWait(device)); + return r; + } + + void *raw_alloc_with_stream(size_t nbytes, aclrtStream stream) override + { + if (nbytes == 0) { + return nullptr; + } + int device; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + void *r = nullptr; + malloc(&r, device, nbytes, stream); + return r; + } + + void raw_delete(void *ptr) override + { + 
this->free(ptr); + } + + void FreeDeviceCachedMemory(int device) override + { + device_allocator[device]->emptyCache(device, true); + } + + std::string name() override + { + return "native"; + } void buildServerMemMapForHccl(int device, std::shared_ptr hcclComm) { @@ -3601,69 +3422,72 @@ NpuCachingAllocator caching_allocator; REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &caching_allocator); -void local_raw_delete(void* ptr) +void local_raw_delete(void *ptr) { - caching_allocator.free(ptr); + caching_allocator.free(ptr); } -void* MallocBlock(size_t size, void *stream, int device) { - if (device == -1) { - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - } - if ((device < 0) || (device > static_cast(caching_allocator.device_allocator.size()))) { - return nullptr; - } - AT_ASSERT(caching_allocator.device_allocator[device], PTA_ERROR(ErrCode::NOT_FOUND)); - AT_ASSERT(stream, PTA_ERROR(ErrCode::NOT_FOUND)); - auto block = caching_allocator.device_allocator[device]->malloc(device, size, stream, - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL)); - AT_ASSERT(block, PTA_ERROR(ErrCode::NOT_FOUND)); - return reinterpret_cast(block); +void *MallocBlock(size_t size, void *stream, int device) +{ + if (device == -1) { + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + } + if ((device < 0) || (device > static_cast(caching_allocator.device_allocator.size()))) { + return nullptr; + } + AT_ASSERT(caching_allocator.device_allocator[device], PTA_ERROR(ErrCode::NOT_FOUND)); + AT_ASSERT(stream, PTA_ERROR(ErrCode::NOT_FOUND)); + auto block = caching_allocator.device_allocator[device]->malloc(device, size, stream, + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL)); + AT_ASSERT(block, PTA_ERROR(ErrCode::NOT_FOUND)); + return reinterpret_cast(block); } -void FreeBlock(void *handle) { - Block* block = reinterpret_cast(handle); - AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); - caching_allocator.assertValidDevice(block->device); - AT_ASSERT(caching_allocator.device_allocator[block->device], PTA_ERROR(ErrCode::NOT_FOUND)); - auto orig_block_ptr = block->ptr; - auto orig_block_size = block->size; - caching_allocator.device_allocator[block->device]->free(block, - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL)); +void FreeBlock(void *handle) +{ + Block *block = reinterpret_cast(handle); + AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); + caching_allocator.assertValidDevice(block->device); + AT_ASSERT(caching_allocator.device_allocator[block->device], PTA_ERROR(ErrCode::NOT_FOUND)); + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; + caching_allocator.device_allocator[block->device]->free(block, + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL)); } -void* GetBlockPtr(const void *handle) { - const Block* block = reinterpret_cast(handle); - AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); - return block->ptr; +void *GetBlockPtr(const void *handle) +{ + const Block *block = reinterpret_cast(handle); + AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); + return block->ptr; } -size_t GetBlockSize(const void *handle) { - const Block* block = reinterpret_cast(handle); - AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); - return block->size; +size_t GetBlockSize(const void *handle) +{ + const Block *block = reinterpret_cast(handle); + AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); + return block->size; } struct BackendStaticInitializer { BackendStaticInitializer() { - allocator.store(&caching_allocator); + 
allocator.store(&caching_allocator); } }; -std::atomic allocator; +std::atomic allocator; BackendStaticInitializer backend_static_initializer; -std::mutex* getFreeMutex() { - static std::mutex npu_free_mutex; - return &npu_free_mutex; +std::mutex *getFreeMutex() +{ + static std::mutex npu_free_mutex; + return &npu_free_mutex; } - } // namespace NPUCachingAllocator } // namespace c10_npu namespace c10_npu { - // uid_ is incremented when a user creates a MemPool, // for example: using graph_pool_handle() or c10_npu::MemPool(). // @@ -3674,16 +3498,16 @@ namespace c10_npu { // passed to a function, either by user or NPUGraphs. For example, // default value of MempoolId_t for capture_begin function is {0, 0}. // That's why uid_ and uuid_ start at 1. -std::atomic MemPool::uid_{1}; -std::atomic MemPool::uuid_{1}; +std::atomic MemPool::uid_{ 1 }; +std::atomic MemPool::uuid_{ 1 }; -MemPool::MemPool(NPUCachingAllocator::NPUAllocator* allocator, bool is_user_created) +MemPool::MemPool(NPUCachingAllocator::NPUAllocator *allocator, bool is_user_created) : allocator_(allocator), is_user_created_(is_user_created) { if (is_user_created_) { - id_ = {0, uid_++}; + id_ = { 0, uid_++ }; } else { - id_ = {uuid_++, 0}; + id_ = { uuid_++, 0 }; } } @@ -3692,7 +3516,7 @@ MempoolId_t MemPool::id() return id_; } -NPUCachingAllocator::NPUAllocator* MemPool::allocator() +NPUCachingAllocator::NPUAllocator *MemPool::allocator() { return allocator_; } @@ -3700,10 +3524,9 @@ NPUCachingAllocator::NPUAllocator* MemPool::allocator() // Note that active_mempool_ is a global variable here // and not inside MemPoolContext class, because in windows we // can't use __declspec(dllexport) and __declspec(thread) -static thread_local MemPool* active_mempool_ = nullptr; +static thread_local MemPool *active_mempool_ = nullptr; -MemPoolContext::MemPoolContext(MemPool* mempool) - : prev_mempool_(active_mempool_) +MemPoolContext::MemPoolContext(MemPool *mempool) : prev_mempool_(active_mempool_) { active_mempool_ = mempool; } @@ -3713,9 +3536,8 @@ MemPoolContext::~MemPoolContext() active_mempool_ = prev_mempool_; } -MemPool* MemPoolContext::getActiveMemPool() +MemPool *MemPoolContext::getActiveMemPool() { return active_mempool_; } - } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index 861a140aca76b47b91318b47456991f6267ef1df..a4e14d2232ab30f7a3cd4e991c904f404b18f6a5 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -23,8 +23,8 @@ C10_NPU_API std::mutex* getFreeMutex(); // block inside of already allocated area. class FreeMemoryCallback { public: - virtual ~FreeMemoryCallback(){}; - virtual bool Execute() = 0; + virtual ~FreeMemoryCallback(){}; + virtual bool Execute() = 0; }; C10_DECLARE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); @@ -45,75 +45,75 @@ C10_DECLARE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); // not counted as a word boundary, so you would otherwise have to list each // of these functions. 
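The allocator code above adjusts these records through update_stat() calls (for example when a segment is released or unmapped). The helper itself is not part of this hunk; the following is only a plausible sketch, consistent with how the fields of Stat below are named and how the call sites use them, and the real implementation may differ:

```cpp
#include <algorithm>
#include <cstdint>

// Sketch only: `current` tracks the live value, `peak` its high-water mark,
// and `allocated`/`freed` the running totals of positive and negative updates.
struct SketchStat {
    int64_t current = 0;
    int64_t peak = 0;
    int64_t allocated = 0;
    int64_t freed = 0;
};

void sketch_update_stat(SketchStat& stat, int64_t amount)
{
    stat.current += amount;
    stat.peak = std::max(stat.current, stat.peak);
    if (amount > 0) {
        stat.allocated += amount;
    }
    if (amount < 0) {
        stat.freed += -amount;
    }
}
```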
struct Stat { - int64_t current = 0; - int64_t peak = 0; - int64_t allocated = 0; - int64_t freed = 0; + int64_t current = 0; + int64_t peak = 0; + int64_t allocated = 0; + int64_t freed = 0; }; enum struct StatType : uint64_t { - AGGREGATE = 0, - SMALL_POOL = 1, - LARGE_POOL = 2, - NUM_TYPES = 3 // remember to update this whenever a new stat type is added + AGGREGATE = 0, + SMALL_POOL = 1, + LARGE_POOL = 2, + NUM_TYPES = 3 // remember to update this whenever a new stat type is added }; typedef std::array(StatType::NUM_TYPES)> StatArray; // Struct containing memory allocator summary statistics for a device. struct DeviceStats { - // COUNT: allocations requested by client code - StatArray allocation; - // COUNT: number of allocated segments from npuMalloc(). - StatArray segment; - // COUNT: number of active memory blocks (allocated or used by stream) - StatArray active; - // COUNT: number of inactive, split memory blocks (unallocated but can't be released via npuFree) - StatArray inactive_split; - - // SUM: bytes requested by client code - StatArray allocated_bytes; - // SUM: bytes reserved by this memory allocator (both free and used) - StatArray reserved_bytes; - // SUM: bytes within active memory blocks - StatArray active_bytes; - // SUM: bytes within inactive, split memory blocks - StatArray inactive_split_bytes; - // SUM: bytes requested by client code - StatArray requested_bytes; - - // COUNT: total number of failed calls to NPU malloc necessitating cache flushes. - int64_t num_alloc_retries = 0; - - // COUNT: total number of OOMs (i.e. failed calls to NPU after cache flush) - int64_t num_ooms = 0; - - // COUNT: total number of oversize blocks allocated from pool - Stat oversize_allocations; - - // COUNT: total number of oversize blocks requiring malloc - Stat oversize_segments; - - // SIZE: maximum block size that is allowed to be split. - int64_t max_split_size = 0; + // COUNT: allocations requested by client code + StatArray allocation; + // COUNT: number of allocated segments from npuMalloc(). + StatArray segment; + // COUNT: number of active memory blocks (allocated or used by stream) + StatArray active; + // COUNT: number of inactive, split memory blocks (unallocated but can't be released via npuFree) + StatArray inactive_split; + + // SUM: bytes requested by client code + StatArray allocated_bytes; + // SUM: bytes reserved by this memory allocator (both free and used) + StatArray reserved_bytes; + // SUM: bytes within active memory blocks + StatArray active_bytes; + // SUM: bytes within inactive, split memory blocks + StatArray inactive_split_bytes; + // SUM: bytes requested by client code + StatArray requested_bytes; + + // COUNT: total number of failed calls to NPU malloc necessitating cache flushes. + int64_t num_alloc_retries = 0; + + // COUNT: total number of OOMs (i.e. failed calls to NPU after cache flush) + int64_t num_ooms = 0; + + // COUNT: total number of oversize blocks allocated from pool + Stat oversize_allocations; + + // COUNT: total number of oversize blocks requiring malloc + Stat oversize_segments; + + // SIZE: maximum block size that is allowed to be split. + int64_t max_split_size = 0; }; typedef std::shared_ptr (*CreateContextFn)(void); // Struct containing info of an allocation block (i.e. a fractional part of a cudaMalloc).. 
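DeviceStats above is indexed per StatType: each StatArray holds one Stat per pool class, and the AGGREGATE slot sums the small- and large-pool slots. A short sketch of how a consumer reads it, assuming it is compiled in the same namespace as the declarations above:

```cpp
// Reserved = memory obtained from the device in whole segments; allocated =
// memory currently handed out to clients. The difference is what a cache
// flush can, at best, give back to the device.
inline int64_t currently_cached_but_unused(const DeviceStats& stats)
{
    const auto agg = static_cast<size_t>(StatType::AGGREGATE);
    return stats.reserved_bytes[agg].current - stats.allocated_bytes[agg].current;
}
```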
struct BlockInfo { - int64_t size = 0; - int64_t requested_size = 0; - int32_t gc_counter = 0; - bool allocated = false; - bool active = false; - std::shared_ptr context_when_allocated; + int64_t size = 0; + int64_t requested_size = 0; + int32_t gc_counter = 0; + bool allocated = false; + bool active = false; + std::shared_ptr context_when_allocated; }; // Struct containing info of a memory segment (i.e. one contiguous cudaMalloc). struct SegmentInfo { int64_t device = 0; int64_t address = 0; - aclrtStream stream = 0; + aclrtStream stream = nullptr; int64_t total_size = 0; int64_t requested_size = 0; int64_t allocated_size = 0; @@ -145,8 +145,9 @@ struct TraceEntry { // segments) SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to // trace events - OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of + OOM, // the allocator threw an OutOfMemoryError (addr_ is the amount of // free bytes reported by cuda) + WORKSPACE_SNAPSHOT }; TraceEntry(Action action, int device, int64_t addr, size_t size, aclrtStream stream, @@ -189,6 +190,7 @@ using OutOfMemoryObserver = class NPUAllocator : public c10::Allocator { public: + virtual c10::DataPtr allocate_with_aligned(size_t size, size_t aligned) const = 0; virtual void* raw_alloc(size_t nbytes) = 0; virtual void* raw_alloc_with_stream(size_t nbytes, aclrtStream stream) = 0; virtual void raw_delete(void* ptr) = 0; @@ -263,6 +265,11 @@ inline NPUAllocator* get() return allocator.load(); } +inline c10::DataPtr allocate_with_aligned(size_t size, size_t base_addr_aligned_kb) +{ + return get()->allocate_with_aligned(size, base_addr_aligned_kb); +} + // Called directly by clients. inline void* raw_alloc(size_t nbytes) { diff --git a/torch_npu/csrc/core/npu/NPUEvent.cpp b/torch_npu/csrc/core/npu/NPUEvent.cpp index c051d50d287989bdcad7cd2dc14bb9311f122f62..21b19320154d62d3e37d48a95ae2df09ef18b605 100644 --- a/torch_npu/csrc/core/npu/NPUEvent.cpp +++ b/torch_npu/csrc/core/npu/NPUEvent.cpp @@ -70,7 +70,9 @@ void NPUEvent::record() void NPUEvent::recordOnce(const NPUStream& stream) { - if (!was_recorded_) record(stream); + if (!was_recorded_) { + record(stream); + } } void NPUEvent::record(const NPUStream& stream) @@ -89,6 +91,9 @@ void NPUEvent::record(const NPUStream& stream) void NPUEvent::block(const NPUStream& stream) { + if (!is_created_ && (flags_ == ACL_EVENT_EXTERNAL)) { + createEvent(stream.device_index()); + } if (is_created_) { NPUGuard guard(stream.device_index()); c10_npu::queue::LaunchWaitEventTask(event_, stream); @@ -159,6 +164,16 @@ void NPUEvent::synchronize() const } } +void NPUEvent::reset(const NPUStream& stream) const +{ + if (is_created_) { + TORCH_CHECK(flags_ == ACL_EVENT_EXTERNAL, + "API reset() only support ACL_EVENT_EXTERNAL flag event.", PTA_ERROR(ErrCode::INTERNAL)); + NPUGuard guard(stream.device_index()); + NPU_CHECK_ERROR_WITHOUT_UCE(aclrtResetEvent(event_, stream.stream())); + } +} + void NPUEvent::createEvent(c10::DeviceIndex device_index) { device_index_ = device_index; diff --git a/torch_npu/csrc/core/npu/NPUEvent.h b/torch_npu/csrc/core/npu/NPUEvent.h index 5eba816db69496545410a3bae53a1e3649185772..cf6e34ee9c73b9e544ca12adf625d9d27fa21f23 100644 --- a/torch_npu/csrc/core/npu/NPUEvent.h +++ b/torch_npu/csrc/core/npu/NPUEvent.h @@ -49,6 +49,7 @@ struct C10_NPU_API NPUEvent { float elapsed_time(const NPUEvent& other) const; uint64_t recorded_time() const; void synchronize() const; + void reset(const NPUStream& stream) const; // npu do not support IpcEventHandle until now diff --git 
a/torch_npu/csrc/core/npu/NPUEventManager.cpp b/torch_npu/csrc/core/npu/NPUEventManager.cpp index cbea3be79c2b5f97d830fce5d67de6f4790cd928..2371b9bc794579ebd63c5eae72b3471269fe7380 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.cpp +++ b/torch_npu/csrc/core/npu/NPUEventManager.cpp @@ -105,10 +105,10 @@ void NPUEventManager::IncreaseUnrecordedCount(aclrtEvent event) auto it = event_unrecorded_count_.find(event); if (it != event_unrecorded_count_.end()) { it->second++; - ASCEND_LOGI("Event: unrecorded count increase, now=%d.", it->second); + ASCEND_LOGD("Event: unrecorded count increase, now=%d.", it->second); } else { event_unrecorded_count_.insert(std::pair(event, 1)); - ASCEND_LOGI("Event: unrecorded count increase, now=%d.", 1); + ASCEND_LOGD("Event: unrecorded count increase, now=%d.", 1); } } @@ -123,10 +123,10 @@ void NPUEventManager::DecreaseUnrecordedCount(aclrtEvent event) (void *) event, PTA_ERROR(ErrCode::INTERNAL)); if (it->second == 1) { event_unrecorded_count_.erase(event); - ASCEND_LOGI("Event: unrecorded count decrease, now=%d.", 0); + ASCEND_LOGD("Event: unrecorded count decrease, now=%d.", 0); } else { it->second--; - ASCEND_LOGI("Event: unrecorded count decrease, now=%d.", it->second); + ASCEND_LOGD("Event: unrecorded count decrease, now=%d.", it->second); } } diff --git a/torch_npu/csrc/core/npu/NPUEventManager.h b/torch_npu/csrc/core/npu/NPUEventManager.h index 431ddc9f0048dc76972c6feb0f21744b04f75f7c..ac7f0176e0f52daf9f88fdd39bdb2f5b0d546f5b 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.h +++ b/torch_npu/csrc/core/npu/NPUEventManager.h @@ -14,27 +14,27 @@ namespace c10_npu { class NPUEventManager { public: - static NPUEventManager& GetInstance(); - aclError QueryAndDestroyEvent(); - aclError LazyDestroy(aclrtEvent npu_event); - void ClearEvent(); - void IncreaseUnrecordedCount(aclrtEvent event); - void DecreaseUnrecordedCount(aclrtEvent event); - bool IsEventRecorded(aclrtEvent event); - void ClearUnrecordedCount(); - ~NPUEventManager() {} + static NPUEventManager& GetInstance(); + aclError QueryAndDestroyEvent(); + aclError LazyDestroy(aclrtEvent npu_event); + void ClearEvent(); + void IncreaseUnrecordedCount(aclrtEvent event); + void DecreaseUnrecordedCount(aclrtEvent event); + bool IsEventRecorded(aclrtEvent event); + void ClearUnrecordedCount(); + ~NPUEventManager() {} private: - void run(aclrtEvent event); + void run(aclrtEvent event); private: - std::mutex event_queue_mutex_; - NPUEventManager(); - std::deque npu_events_; - std::shared_ptr thread_pool_; + std::mutex event_queue_mutex_; + NPUEventManager(); + std::deque npu_events_; + std::shared_ptr thread_pool_; - std::mutex event_unrecorded_count_mutex_; - ska::flat_hash_map event_unrecorded_count_; + std::mutex event_unrecorded_count_mutex_; + ska::flat_hash_map event_unrecorded_count_; }; } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 97ce750b50ae0834af4370764b3441dbc9aff57f..9c667f1fdb120c30696c49c1a911702332ffb54b 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -51,7 +51,9 @@ std::string formatErrorCode(SubModule submodule, ErrCode errorCode) int deviceIndex = -1; c10_npu::GetDevice(&deviceIndex); auto rank_id = c10_npu::option::OptionsManager::GetRankId(); + if (!(c10_npu::option::OptionsManager::ShouldPrintLessError())) { oss << "\n[ERROR] " << getCurrentTimestamp() << " (PID:" << getpid() << ", Device:" << deviceIndex << ", 
RankID:" << rank_id << ") "; + } oss << "ERR" << std::setw(2) << std::setfill('0') << static_cast(submodule); oss << std::setw(3) << std::setfill('0') << static_cast(errorCode); oss << " " << submoduleMap[submodule] << " " << errCodeMap[errorCode]; @@ -77,6 +79,15 @@ static std::string getCurrentTimestamp() namespace c10_npu { +std::unordered_map> errCodeHandlerMap = { + {ACL_ERROR_RT_DEVICE_TASK_ABORT, std::bind(&handleDeviceTaskAbort, std::placeholders::_1)}, + {ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR, std::bind(&handleHbmMultiBitEccError, std::placeholders::_1)}, + {ACL_ERROR_RT_DEVICE_MEM_ERROR, std::bind(&handleDeviceMemError, std::placeholders::_1)}, + {ACL_ERROR_RT_SUSPECT_DEVICE_MEM_ERROR, std::bind(&handleSuspectDeviceMemError, std::placeholders::_1)}, + {ACL_ERROR_RT_LINK_ERROR, std::bind(&handleLinkError, std::placeholders::_1)}, + {ACL_ERROR_RT_COMM_OP_RETRY_FAIL, std::bind(&handleHcclOpRetryFailed, std::placeholders::_1)} +}; + MemUceInfo memUceInfo; std::mutex memUceInfoMutex; @@ -99,11 +110,53 @@ void clear_mem_uce_info() memUceInfo.clear(); } +const std::string c10_npu_check_error_message(std::string& errmsg) +{ + static const std::regex errorRegex(R"(^E[1-9A-Z]9999)"); + if (std::regex_search(errmsg, errorRegex)) { + return "CANN Inner Error. Please rectify the fault based on the error information in the ascend log."; + } + + std::regex dateRegex(R"(\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d{3}\.\d{3})"); + std::smatch match; + + if (std::regex_search(errmsg, match, dateRegex)) { + size_t dateEndPos = match.position(0) + match.length(0); + size_t tracePos = errmsg.find("TraceBack (most recent call last):\n", dateEndPos); + std::string content; + if (tracePos != std::string::npos) { + content = errmsg.substr(dateEndPos, tracePos - dateEndPos); + } else { + content = errmsg.substr(dateEndPos); + } + + std::regex ws_regex("[\\s\\t\\n\\r]+"); + content = std::regex_replace(content, ws_regex, " "); + if (!content.empty() && content.front() == ' ') + content.erase(0, 1); + if (!content.empty() && content.back() == ' ') + content.pop_back(); + + return content; + } + + return ""; +} + + const char *c10_npu_get_error_message() { auto errmsg = c10_npu::acl::AclGetErrMsg(); - c10_npu::setRepoErrMsg(errmsg); - return errmsg; + if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { + std::string log(errmsg); + std::string errmsg_ = c10_npu::c10_npu_check_error_message(log); + thread_local std::string processedErrMsg = errmsg_; + c10_npu::setRepoErrMsg(processedErrMsg.c_str()); + return processedErrMsg.c_str(); + } else { + c10_npu::setRepoErrMsg(errmsg); + return errmsg; + } } void record_mem_hbm_ecc_error() @@ -153,4 +206,66 @@ bool checkUceErrAndRepair(bool check_error, std::string& err_msg) return false; } +std::string handleDeviceTaskAbort(int errorCode) +{ + ASCEND_LOGE("getRepoStopFlag in Run, throw FORCE STOP."); + return "FORCE STOP"; +} + +std::string handleHbmMultiBitEccError(int errorCode) +{ + ASCEND_LOGE("getRepoStopFlag in Run, throw ECC ERROR."); + std::string error_msg(c10_npu::c10_npu_get_error_message()); + std::regex pattern(R"(time us= (\d+)\.)"); + std::smatch match; + std::string time_msg = ""; + if (std::regex_search(error_msg, match, pattern)) { + if (match.size() > 1) { + time_msg = match[1].str(); + } + } + c10_npu::record_mem_hbm_ecc_error(); + return "HBM MULTI BIT ECC ERROR." 
+ error_msg + "time is " + time_msg; +} + +std::string handleDeviceMemError(int errorCode) +{ + std::string error_msg = ""; + if (c10_npu::checkUceErrAndRepair(true, error_msg)) { + ASCEND_LOGE("getRepoStopFlag in Run, throw UCE ERROR."); + return "UCE ERROR"; + } + return ""; +} + +std::string handleSuspectDeviceMemError(int errorCode) +{ + ASCEND_LOGE("getRepoStopFlag in Run, throw SUSPECT MEM ERROR."); + return "SUSPECT MEM ERROR"; +} + +std::string handleLinkError(int errorCode) +{ + ASCEND_LOGE("getRepoStopFlag in Run, throw HCCS LINK ERROR."); + return "HCCS LINK ERROR"; +} + +std::string handleHcclOpRetryFailed(int errorCode) +{ + ASCEND_LOGE("getRepoStopFlag in Run, throw HCCL OP RETRY FAILED."); + return "HCCL OP RETRY FAILED"; +} + +std::string handleDeviceError(int errorCode) +{ + auto handlerIter = errCodeHandlerMap.find(errorCode); + if (handlerIter != errCodeHandlerMap.end()) { + std::function handler = handlerIter->second; + if (handler != nullptr) { + return handler(errorCode); + } + } + return ""; +} + } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index 0005b4046d9c259048853bca538a2a3d4a2c3c6a..a82f8f15688c9da828cc977954527869c99708d3 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -94,6 +94,9 @@ std::string formatErrorCode(SubModule submodule, ErrCode errorCode); #define DEVICE_TASK_ABORT "reason=[device task abort]" #define DEVICE_MEM_ERROR "reason=[device mem error]" #define DEVICE_HBM_ECC_ERROR "reason=[hbm Multi-bit ECC error]" +#define SUSPECT_DEVICE_MEM_ERROR "reason=[suspect device mem error]" +#define HCCS_LINK_ERROR "reason=[link error]" +#define HCCL_OP_RETRY_FAILED "reason=[hccl op retry failed]" inline const char* getErrorFunction(const char* msg) { @@ -112,68 +115,74 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) if ((error_code_peek) != ACL_ERROR_NONE) { \ error_code = error_code_peek; \ } \ - switch (error_code) { \ - case ACL_ERROR_RT_DEVICE_TASK_ABORT: { \ - ASCEND_LOGE("getRepoStopFlag in Run, throw FORCE STOP."); \ - TORCH_CHECK(false, __func__, ":", __FILE__, ":", __LINE__, \ - " NPU function error: FORCE STOP.", \ - ", error code is ", error_code, PTA_ERROR(ErrCode::ACL)); \ - break; \ - } \ - case ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR: { \ - ASCEND_LOGE("getRepoStopFlag in Run, throw ECC ERROR."); \ - std::string error_msg(c10_npu::c10_npu_get_error_message()); \ - std::regex pattern(R"(time us= (\d+)\.)"); \ - std::smatch match; \ - std::string time_msg = ""; \ - if (std::regex_search(error_msg, match, pattern)) { \ - if (match.size() > 1) { \ - time_msg = match[1].str(); \ - } \ - } \ - c10_npu::record_mem_hbm_ecc_error(); \ - TORCH_CHECK(false, __func__, ":", __FILE__, ":", __LINE__, \ - " NPU function error: HBM MULTI BIT ECC ERROR.", error_msg, \ - "time is ", time_msg, ", error code is ", error_code, \ - PTA_ERROR(ErrCode::ACL)); \ - break; \ - } \ - case ACL_ERROR_RT_DEVICE_MEM_ERROR: { \ - std::string error_msg = ""; \ - if (c10_npu::checkUceErrAndRepair(true, error_msg)) { \ - ASCEND_LOGE("getRepoStopFlag in Run, throw UCE ERROR."); \ - TORCH_CHECK(false, __func__, ":", __FILE__, ":", __LINE__, \ - " NPU function error: UCE ERROR.", \ - ", error code is ", error_code, PTA_ERROR(ErrCode::ACL)); \ - } \ - break; \ - } \ - default: \ - break; \ + std::string device_error_msg = c10_npu::handleDeviceError(error_code); \ + if (!device_error_msg.empty()) { \ + TORCH_CHECK( \ + false, \ + __func__, 
\ + ":", \ + __FILE__, \ + ":", \ + __LINE__, \ + " NPU function error: ", device_error_msg, \ + ", error code is ", error_code, \ + PTA_ERROR(ErrCode::ACL)); \ } \ + #define NPU_CHECK_ERROR_CHECK_UCE(err_code, check_uce, ...) \ do { \ int error_code = err_code; \ static c10_npu::acl::AclErrorCode err_map; \ if ((error_code) != ACL_ERROR_NONE) { \ + std::string device_error_msg = ""; \ if (check_uce) { \ - CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(error_code); \ + auto error_code_peek = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); \ + if ((error_code_peek) != ACL_ERROR_NONE) { \ + error_code = error_code_peek; \ + } \ + device_error_msg = c10_npu::handleDeviceError(error_code); \ } \ - TORCH_CHECK( \ + if (((error_code) == ACL_ERROR_RT_FEATURE_NOT_SUPPORT) && (device_error_msg.empty())) { \ + static auto feature_not_support_warn_once = []() { \ + printf("[WARN]%s,%s:%u:%s\n", \ + __FUNCTION__, __FILE__, __LINE__, \ + "Feature is not supportted and the possible cause is" \ + " that driver and firmware packages do not match."); \ + return true; \ + }(); \ + } else if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { \ + std::ostringstream oss; \ + oss << " NPU function error: " \ + << (device_error_msg.empty() ? getErrorFunction(#err_code, ##__VA_ARGS__) : device_error_msg) \ + << ", error code is " << error_code << " " \ + << PTA_ERROR(ErrCode::ACL) \ + << (err_map.error_code_map.find(error_code) != err_map.error_code_map.end() ? \ + err_map.error_code_map[error_code] : ".") \ + << "\n"; \ + std::string err_msg = oss.str(); \ + ASCEND_LOGE("%s", err_msg.c_str()); \ + TORCH_CHECK( \ + false, \ + (device_error_msg.empty() ? "" : device_error_msg), \ + c10_npu::c10_npu_get_error_message()); \ + } else { \ + TORCH_CHECK( \ false, \ __func__, \ ":", \ __FILE__, \ ":", \ __LINE__, \ - " NPU function error: ", getErrorFunction(#err_code, ##__VA_ARGS__), \ + " NPU function error: ", (device_error_msg.empty() ? \ + getErrorFunction(#err_code, ##__VA_ARGS__) : device_error_msg), \ ", error code is ", error_code, \ PTA_ERROR(ErrCode::ACL), \ (err_map.error_code_map.find(error_code) != \ err_map.error_code_map.end() ? \ "\n[Error]: " + err_map.error_code_map[error_code] : "."), \ "\n", c10_npu::c10_npu_get_error_message()); \ + } \ } \ } while (0) @@ -185,7 +194,21 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) auto Error = err_code; \ static c10_npu::acl::AclErrorCode err_map; \ if ((Error) != ACL_ERROR_NONE) { \ - CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ + CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ + if (c10_npu::option::OptionsManager::ShouldPrintLessError()) \ + { \ + std::ostringstream oss; \ + oss << " OPS function error: " << getErrorFunction(#err_code, ##__VA_ARGS__) \ + << ", error code is " << Error << " " \ + << OPS_ERROR(ErrCode::ACL) \ + << (err_map.error_code_map.find(Error) != err_map.error_code_map.end() ? \ + err_map.error_code_map[Error] : ".") + "\n"; \ + std::string err_msg = oss.str(); \ + ASCEND_LOGE("%s", err_msg.c_str()); \ + TORCH_CHECK( \ + false, \ + c10_npu::c10_npu_get_error_message()); \ + } else { \ TORCH_CHECK( \ false, \ __func__, \ @@ -201,40 +224,9 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) "\n[Error]: " + err_map.error_code_map[Error] : "."), \ "\n", c10_npu::c10_npu_get_error_message()); \ } \ + } \ } while (0) -#define NPU_CHECK_SUPPORTED_OR_ERROR(err_code, ...) 
\ - do { \ - auto Error = err_code; \ - static c10_npu::acl::AclErrorCode err_map; \ - if ((Error) != ACL_ERROR_NONE) { \ - CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ - if ((Error) == ACL_ERROR_RT_FEATURE_NOT_SUPPORT) { \ - static auto feature_not_support_warn_once = []() { \ - printf("[WARN]%s,%s:%u:%s\n", \ - __FUNCTION__, __FILENAME__, __LINE__, \ - "Feature is not supportted and the possible cause is" \ - " that driver and firmware packages do not match."); \ - return true; \ - }(); \ - } else { \ - TORCH_CHECK( \ - false, \ - __func__, \ - ":", \ - __FILE__, \ - ":", \ - __LINE__, \ - " NPU function error: ", getErrorFunction(#err_code, ##__VA_ARGS__), \ - ", error code is ", Error, \ - PTA_ERROR(ErrCode::ACL), \ - (err_map.error_code_map.find(Error) != \ - err_map.error_code_map.end() ? \ - "\n[Error]: " + err_map.error_code_map[Error] : "."), \ - "\n", c10_npu::c10_npu_get_error_message()); \ - } \ - } \ - } while (0) namespace c10_npu { @@ -262,6 +254,8 @@ struct MemUceInfo { C10_NPU_API const char *c10_npu_get_error_message(); +C10_NPU_API const std::string c10_npu_check_error_message(std::string& errmsg); + bool checkUceErrAndRepair(bool check_error, std::string& err_msg); void record_mem_hbm_ecc_error(); @@ -272,4 +266,18 @@ MemUceInfo get_mem_uce_info(); void clear_mem_uce_info(); +std::string handleDeviceTaskAbort(int errorCode); + +std::string handleHbmMultiBitEccError(int errorCode); + +std::string handleDeviceMemError(int errorCode); + +std::string handleSuspectDeviceMemError(int errorCode); + +std::string handleLinkError(int errorCode); + +std::string handleHcclOpRetryFailed(int errorCode); + +std::string handleDeviceError(int errorCode); + } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUFormat.cpp b/torch_npu/csrc/core/npu/NPUFormat.cpp index ffa4df9b35472413b84f04556edeb320c3901266..b087842cc3cda065b2fefc609d539c5a8bc96cb8 100644 --- a/torch_npu/csrc/core/npu/NPUFormat.cpp +++ b/torch_npu/csrc/core/npu/NPUFormat.cpp @@ -46,5 +46,12 @@ at::Tensor empty_with_format(c10::IntArrayRef sizes, const c10::TensorOptions& o return OpPreparation::ApplyTensorWithFormat(sizes, options, format, keep_format); } +at::Tensor empty_with_swapped_memory(c10::IntArrayRef size, + c10::optional dtype_opt, + c10::optional device_opt) +{ + return NPUNativeFunctions::empty_with_swapped_memory(size, dtype_opt, device_opt); +} + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/core/npu/NPUFormat.h b/torch_npu/csrc/core/npu/NPUFormat.h index 83113beca005456dad3f7fd185848c85a56202e2..588b54e6e6f2b87f73c56abadc3197520eb71bc9 100644 --- a/torch_npu/csrc/core/npu/NPUFormat.h +++ b/torch_npu/csrc/core/npu/NPUFormat.h @@ -29,5 +29,9 @@ TORCH_NPU_API at::Tensor npu_format_cast(const at::Tensor& self, int64_t acl_for TORCH_NPU_API at::Tensor empty_with_format(c10::IntArrayRef sizes, const c10::TensorOptions& options, int64_t format, bool keep_format = false); +TORCH_NPU_API at::Tensor empty_with_swapped_memory(c10::IntArrayRef size, + c10::optional dtype_opt, + c10::optional device_opt); + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 5132fc84a479e502b097f5ad97d8f4a902ca8410..e273d2a6d3802a634315cd8cd8bbe79eed8284de 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -3,6 +3,7 @@ #include #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/npu/NPUStream.h" +#include 
"torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" @@ -14,6 +15,7 @@ static uint32_t dev_count = 0; static thread_local int local_device = -1; static std::unordered_map used_devices; std::recursive_mutex mtx; +thread_local int targetDeviceIndex = -1; c10::DeviceIndex device_count() noexcept { @@ -42,6 +44,12 @@ c10::DeviceIndex device_count_ensure_non_zero() aclError GetDevice(int32_t *device) { + if (targetDeviceIndex >= 0) { + *device = targetDeviceIndex; + NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(targetDeviceIndex)); + return ACL_ERROR_NONE; + } + if (local_device >= 0) { *device = local_device; return ACL_ERROR_NONE; @@ -64,14 +72,42 @@ aclError GetDevice(int32_t *device) return err; } +aclError GetDeviceWithoutSet(int32_t *device) +{ + if (targetDeviceIndex >= 0) { + *device = targetDeviceIndex; + return ACL_ERROR_NONE; + } + + if (local_device >= 0) { + *device = local_device; + return ACL_ERROR_NONE; + } + aclError err = aclrtGetDevice(device); + if (err != ACL_ERROR_NONE) { + CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); + } + if (err == ACL_ERROR_NONE) { + local_device = *device; + } else if (err == ACL_ERROR_RT_CONTEXT_NULL) { + *device = -1; + return ACL_ERROR_NONE; + } + return err; +} + aclError SetDevice(c10::DeviceIndex device) { TORCH_CHECK(device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); - + targetDeviceIndex = -1; if (local_device == device) { return ACL_ERROR_NONE; } + if (c10_npu::IsOpDispatch()) { + c10_npu::SetThreadAffinity(device); + } + aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { local_device = device; @@ -104,7 +140,7 @@ aclError DestroyUsedStreams() for (const auto it : used_devices) { NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(it.first)); NPUStream stream = getCurrentNPUStream(it.first); - aclError acl_ret = acl::AclrtDestroyStreamForce(stream); + aclError acl_ret = acl::AclrtDestroyStreamForce(stream.stream(false)); if (acl_ret != ACL_ERROR_NONE) { return acl_ret; } @@ -140,7 +176,7 @@ aclrtContext GetDeviceContext(int32_t device) { std::lock_guard lock(mtx); if (used_devices.find(device) == used_devices.end()) { - ASCEND_LOGE("NPU device %d has been initialized! Can not get context", device); + ASCEND_LOGE("NPU device %d has not been initialized! Can not get context", device); return nullptr; } return used_devices[device]; @@ -171,10 +207,35 @@ void device_synchronize() int ExchangeDevice(int device) { + targetDeviceIndex = -1; NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(device)); return device; } +int MaybeExchangeDevice(int to_device) +{ + int cur_device = -1; + NPU_CHECK_ERROR_WITHOUT_UCE(GetDeviceWithoutSet(&cur_device)); + if (to_device == cur_device) { + return cur_device; + } + std::lock_guard lock(mtx); + if (used_devices.find(to_device) == used_devices.end()) { + ASCEND_LOGI("NPU device %d has not been initialized! 
We will set targetDeviceIndex.", to_device); + targetDeviceIndex = to_device; + } else { + NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(to_device)); + } + return cur_device; +} + +void SetTargetDevice() +{ + if (targetDeviceIndex >= 0) { + NPU_CHECK_ERROR_WITHOUT_UCE(SetDevice(targetDeviceIndex)); + } +} + int GetLocalDevice() { return local_device; @@ -182,6 +243,10 @@ int GetLocalDevice() bool IsContextInitialized() { + if (local_device >= 0) { + return true; + } + int32_t device = -1; aclError err = aclrtGetDevice(&device); if (err == ACL_ERROR_NONE) { @@ -196,4 +261,27 @@ bool IsContextInitialized() } } +void warn_or_error_on_sync() +{ + if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_ERROR) { + TORCH_CHECK(false, "called a synchronizing NPU operation", PTA_ERROR(ErrCode::ACL)); + } else if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_WARN) { + TORCH_NPU_WARN("called a synchronizing NPU operation"); + } +} + +void stream_synchronize(aclrtStream stream) +{ + if (C10_UNLIKELY(warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) { + warn_or_error_on_sync(); + } +#ifndef BUILD_LIBTORCH + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuStreamSynchronization(reinterpret_cast(stream)); + } +#endif + NPU_CHECK_ERROR(aclrtSynchronizeStream(stream)); } + +} // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index e6925636158b32e5d0e5b30a5b020d216d1db358..9bb715bdb85fc5007e04026865794e9f3a5cc1cd 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -35,6 +35,8 @@ C10_NPU_API c10::DeviceIndex device_count_ensure_non_zero(); */ C10_NPU_API aclError GetDevice(int32_t *device); +aclError GetDeviceWithoutSet(int32_t *device); + /** * @ingroup torch_npu * @brief set device id by ACL interface: aclrtSetDevice, @@ -69,6 +71,10 @@ C10_NPU_API void device_synchronize(); C10_NPU_API int ExchangeDevice(int device); +int MaybeExchangeDevice(int to_device); + +void SetTargetDevice(); + int GetLocalDevice(); enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR }; @@ -99,14 +105,7 @@ C10_NPU_API inline WarningState& warning_state() // this function has to be called from callers performing npu synchronizing // operations, to raise proper error or warning -C10_NPU_API inline void warn_or_error_on_sync() -{ - if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_ERROR) { - TORCH_CHECK(false, "called a synchronizing NPU operation", PTA_ERROR(ErrCode::ACL)); - } else if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_WARN) { - TORCH_NPU_WARN("called a synchronizing NPU operation"); - } -} +C10_NPU_API void warn_or_error_on_sync(); enum class CallStateMode { L_UNKNOW = -1, L_FORWARD = 0, L_BACKWARD }; @@ -148,4 +147,6 @@ C10_NPU_API inline ModelState& model_state() bool IsContextInitialized(); +C10_NPU_API void stream_synchronize(aclrtStream stream); + } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp index be5f367cddb801055da0ef64e8756aa9ee764f5b..fd060dcb086f911f0e2b345a18ba9672ded2c34c 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.cpp +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -20,11 +20,35 @@ constexpr int kSynchronizeBusyWaitMillis = 10; MempoolId_t graph_pool_handle() { // Sets just the second value, to distinguish it from MempoolId_ts created from - // aclmdlCaptureGetInfo id_s in 
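The `NPUFunctions.cpp` hunk above introduces a thread-local `targetDeviceIndex`: `MaybeExchangeDevice` only records the requested device when that device has never been initialized, and the real `aclrtSetDevice` call is deferred to the next `GetDevice`/`SetDevice`/`SetTargetDevice` on the thread. A rough standalone sketch of that control flow; `setDeviceOnRuntime` below is a stand-in for the actual ACL call:

```cpp
#include <iostream>

namespace sketch {

thread_local int localDevice = -1;        // device already bound via the runtime
thread_local int targetDeviceIndex = -1;  // device requested but not yet bound

// Stand-in for aclrtSetDevice(); assumed to always succeed in this sketch.
void setDeviceOnRuntime(int device)
{
    localDevice = device;
    targetDeviceIndex = -1;
    std::cout << "runtime bound to device " << device << "\n";
}

// Mirrors GetDeviceWithoutSet(): report the pending target without binding it.
int getDeviceWithoutSet()
{
    return targetDeviceIndex >= 0 ? targetDeviceIndex : localDevice;
}

// Mirrors MaybeExchangeDevice(): defer binding for devices never initialized before.
int maybeExchangeDevice(int toDevice, bool alreadyInitialized)
{
    int current = getDeviceWithoutSet();
    if (toDevice == current) {
        return current;
    }
    if (!alreadyInitialized) {
        targetDeviceIndex = toDevice;  // bound lazily on the next real get/set
    } else {
        setDeviceOnRuntime(toDevice);
    }
    return current;
}

// Mirrors SetTargetDevice(): flush the deferred request.
void setTargetDevice()
{
    if (targetDeviceIndex >= 0) {
        setDeviceOnRuntime(targetDeviceIndex);
    }
}

}  // namespace sketch
```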
capture_begin. + // aclmdlRICaptureGetInfo id_s in capture_begin. auto new_pool = c10_npu::MemPool(); return new_pool.id(); } +void graph_task_group_begin(c10_npu::NPUStream stream) +{ + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureTaskGrpBegin(stream)); +} + +NPUTaskGroupHandle graph_task_group_end(c10_npu::NPUStream stream) +{ + aclrtTaskGrp group; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureTaskGrpEnd(stream, &group)); + NPUTaskGroupHandle handle; + handle.task_group = group; + return handle; +} + +void graph_task_update_begin(c10_npu::NPUStream stream, NPUTaskGroupHandle handle) +{ + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureTaskUpdateBegin(stream, handle.task_group)); +} + +void graph_task_update_end(c10_npu::NPUStream stream) +{ + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureTaskUpdateEnd(stream)); +} + /** * Note [CUDA Graph Wrapper Class] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -73,8 +97,14 @@ NPUGraph::NPUGraph() : capture_stream_(c10_npu::getCurrentNPUStream()) { } -void NPUGraph::capture_begin(MempoolId_t pool, aclmdlCaptureMode capture_mode) +void NPUGraph::capture_begin(MempoolId_t pool, aclmdlRICaptureMode capture_mode) { + static const auto _task_queue_enable = c10_npu::option::OptionsManager::GetTaskQueueEnable(); + TORCH_CHECK(_task_queue_enable != 2, + "Do not support TASK_QUEUE_ENABLE = 2 during NPU graph capture, please " + "export TASK_QUEUE_ENABLE=1/0.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + TORCH_CHECK(!has_graph_exec_, "This NPUGraph instance already owns a captured graph. " "To capture a new graph, create a new instance."); @@ -108,10 +138,10 @@ void NPUGraph::capture_begin(MempoolId_t pool, aclmdlCaptureMode capture_mode) // autograd thread's free() call triggering an invalid cudaEventRecord in the caching allocator // due to the capture status being updated _after_ a capture had already started. c10_npu::NPUCachingAllocator::beginAllocateToPool(capture_dev_, mempool_id_, [this](aclrtStream stream) { - aclmdlCaptureStatus status; - uint32_t model_id; - NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureGetInfo(stream, &status, &model_id)); - return status == aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE && model_id == model_id_; + aclmdlRICaptureStatus status; + aclmdlRI model_ri; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureGetInfo(stream, &status, &model_ri)); + return status == aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE && model_ri == model_ri_; }); // At this point, any NCCL watchdogs should be aware that we are in capture mode @@ -125,13 +155,11 @@ void NPUGraph::capture_begin(MempoolId_t pool, aclmdlCaptureMode capture_mode) // cudaStreamCaptureModeGlobal is the most conservative option to // prevent potentially unsafe CUDA API calls during capture. 
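The new `graph_task_group_begin/end` and `graph_task_update_begin/end` helpers wrap the `AclmdlRICaptureTaskGrp*` and `TaskUpdate*` calls so that a group of tasks recorded during capture can later be refreshed in place. A hedged usage sketch; the stream setup and the ops launched between the calls are placeholders:

```cpp
// Sketch only: assumes an initialized NPU context and an active graph-capture workflow.
#include "torch_npu/csrc/core/npu/NPUGraph.h"
#include "torch_npu/csrc/core/npu/NPUStream.h"

void capture_updatable_region()
{
    c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream();

    // Mark the tasks enqueued between begin/end as one task group during capture.
    c10_npu::graph_task_group_begin(stream);
    // ... launch the ops whose parameters may change between replays ...
    c10_npu::NPUTaskGroupHandle handle = c10_npu::graph_task_group_end(stream);

    // Later (e.g. before a replay), re-open the same group and refresh its tasks.
    c10_npu::graph_task_update_begin(stream, handle);
    // ... re-launch the ops with their updated arguments ...
    c10_npu::graph_task_update_end(stream);
}
```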
- NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureBegin(capture_stream_, capture_mode)); - - c10_npu::is_stream_capturing.store(true); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureBegin(capture_stream_, capture_mode)); - aclmdlCaptureStatus status; - NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureGetInfo(stream, &status, &model_id_)); - TORCH_INTERNAL_ASSERT(status == aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE); + aclmdlRICaptureStatus status; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureGetInfo(stream, &status, &model_ri_)); + TORCH_INTERNAL_ASSERT(status == aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE); } void NPUGraph::capture_end() @@ -141,14 +169,12 @@ void NPUGraph::capture_end() TORCH_CHECK(stream == capture_stream_, "Capture must end on the same stream it began on."); - uint32_t model_id; - NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureEnd(capture_stream_, &model_id)); - - c10_npu::is_stream_capturing.store(false); + aclmdlRI model_ri; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureEnd(capture_stream_, &model_ri)); c10_npu::NPUCachingAllocator::endAllocateToPool(capture_dev_, mempool_id_); - TORCH_CHECK(model_id == model_id_, "Invalid end capture model id: ", model_id); + TORCH_CHECK(model_ri == model_ri_, "Invalid end capture model id: ", model_ri); // In typical graph usage some tensors (e.g. the tensors used for graph IO) are not freed // between replays. @@ -171,8 +197,8 @@ void NPUGraph::replay() c10::OptionalDeviceGuard device_guard{capture_stream_.device()}; - // model_id_ may be replayed in any stream. - NPU_CHECK_ERROR(c10_npu::acl::AclmdlExecuteAsync(model_id_, c10_npu::getCurrentNPUStream())); + // model_ri_ may be replayed in any stream. + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRIExecuteAsync(model_ri_, c10_npu::getCurrentNPUStream())); } void NPUGraph::enable_debug_mode() @@ -184,8 +210,8 @@ void NPUGraph::debug_dump() { if (_npu_graphs_debug) { if (has_graph_exec_) { - TORCH_WARN("DEBUG: calling NPUGraph::debug_dump() for model id ", model_id_); - NPU_CHECK_ERROR(c10_npu::acl::AclmdlDebugPrint(model_id_)); + TORCH_WARN("DEBUG: calling NPUGraph::debug_dump() for model id ", model_ri_); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRIDebugPrint(model_ri_)); } } else { TORCH_WARN("NPU Graphs debug not enabled, set with NPUGraph::enable_debug_mode()."); @@ -216,7 +242,7 @@ void NPUGraph::reset() if (has_graph_exec_) { // notifyCaptureDestroy may throw. How should we handle this? 
c10_npu::NPUCachingAllocator::releasePool(capture_dev_, mempool_id_); - NPU_CHECK_ERROR(c10_npu::acl::AclmdlUnload(model_id_)); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRIDestroy(model_ri_)); has_graph_exec_ = false; } } diff --git a/torch_npu/csrc/core/npu/NPUGraph.h b/torch_npu/csrc/core/npu/NPUGraph.h index b2833744c16f784271696118b6bf3c73f11fe3be..442ae335ccae15506f4352ef5c204e185917672b 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.h +++ b/torch_npu/csrc/core/npu/NPUGraph.h @@ -14,6 +14,15 @@ namespace c10_npu { // to CUDAGraph::capture_begin TORCH_NPU_API MempoolId_t graph_pool_handle(); +struct TORCH_NPU_API NPUTaskGroupHandle { + aclrtTaskGrp task_group; +}; + +TORCH_NPU_API void graph_task_group_begin(c10_npu::NPUStream stream); +TORCH_NPU_API NPUTaskGroupHandle graph_task_group_end(c10_npu::NPUStream stream); +TORCH_NPU_API void graph_task_update_begin(c10_npu::NPUStream stream, NPUTaskGroupHandle handle); +TORCH_NPU_API void graph_task_update_end(c10_npu::NPUStream stream); + struct TORCH_NPU_API NPUGraph { NPUGraph(); ~NPUGraph(); @@ -24,7 +33,7 @@ struct TORCH_NPU_API NPUGraph { void capture_begin( MempoolId_t pool = {0, 0}, - aclmdlCaptureMode capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_GLOBAL); + aclmdlRICaptureMode capture_mode = aclmdlRICaptureMode::ACL_MODEL_RI_CAPTURE_MODE_GLOBAL); void capture_end(); void replay(); void reset(); @@ -33,7 +42,7 @@ struct TORCH_NPU_API NPUGraph { void debug_dump(); protected: - uint32_t model_id_ = -1; + aclmdlRI model_ri_ = nullptr; static std::atomic pending_event_queries; diff --git a/torch_npu/csrc/core/npu/NPUGraphsUtils.cpp b/torch_npu/csrc/core/npu/NPUGraphsUtils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dbe77324bf5fb2ed1ff15a60184fb73707ed1fcb --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUGraphsUtils.cpp @@ -0,0 +1,18 @@ +#include "NPUGraphsUtils.h" + +namespace c10_npu { +CaptureStatus currentStreamCaptureStatusMayInitCtx() +{ + if (!c10_npu::acl::IsCaptureSupported()) { + return CaptureStatus::None; + } + + aclmdlRICaptureStatus is_capturing{ACL_MODEL_RI_CAPTURE_STATUS_NONE}; + aclmdlRI model_ri; + auto s = c10_npu::getCurrentNPUStream(); + NPU_CHECK_ERROR( + c10_npu::acl::AclmdlRICaptureGetInfo(s.stream(false), &is_capturing, &model_ri)); + return CaptureStatus(is_capturing); +} + +} // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUGraphsUtils.h b/torch_npu/csrc/core/npu/NPUGraphsUtils.h index 395f27a049869394d7e0c401ff5644c4b08c1b9f..e1d89eacd4fc6af2e8bf035c6d793a3b4b290949 100644 --- a/torch_npu/csrc/core/npu/NPUGraphsUtils.h +++ b/torch_npu/csrc/core/npu/NPUGraphsUtils.h @@ -10,41 +10,39 @@ namespace c10_npu { -static std::atomic is_stream_capturing(false); - using CaptureId_t = unsigned long long; // first is set if the instance is created by NPUGraph::capture_begin. // second is set if the instance is created by at::cuda::graph_pool_handle. using MempoolId_t = std::pair; -// RAII guard for "aclmdlCaptureMode", a thread-local value +// RAII guard for "aclmdlRICaptureMode", a thread-local value // that controls the error-checking strictness of a capture. struct C10_NPU_API NPUStreamCaptureModeGuard{ - NPUStreamCaptureModeGuard(aclmdlCaptureMode desired) + NPUStreamCaptureModeGuard(aclmdlRICaptureMode desired) : strictness_(desired) {} ~NPUStreamCaptureModeGuard() {} private: - aclmdlCaptureMode strictness_; + aclmdlRICaptureMode strictness_; }; -// Protects against enum aclmdlCaptureStatus implementation changes. 
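With the move from the `aclmdlCapture*` API to `aclmdlRICapture*`, the graph handle changes from a `uint32_t` model id to an `aclmdlRI` object, but the capture/replay flow of `NPUGraph` is unchanged. A minimal usage sketch; the work recorded between begin and end is a placeholder:

```cpp
// Sketch only: assumes ops are issued on the capture stream between
// capture_begin() and capture_end(), as in the CUDA graph workflow.
#include "torch_npu/csrc/core/npu/NPUGraph.h"

void capture_and_replay()
{
    c10_npu::NPUGraph graph;

    graph.capture_begin();   // starts aclmdlRICaptureBegin on the capture stream
    // ... enqueue the NPU work to be recorded ...
    graph.capture_end();     // finalizes the aclmdlRI handle (model_ri_)

    for (int i = 0; i < 3; ++i) {
        graph.replay();      // AclmdlRIExecuteAsync on the current stream
    }

    graph.reset();           // releases the mempool and destroys the aclmdlRI
}
```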
+// Protects against enum aclmdlRICaptureStatus implementation changes. // Some compilers seem not to like static_assert without the messages. static_assert( - int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_NONE) == 0, - "unexpected int(ACL_MODEL_CAPTURE_STATUS_NONE) value"); + int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_NONE) == 0, + "unexpected int(ACL_MODEL_RI_CAPTURE_STATUS_NONE) value"); static_assert( - int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE) == 1, - "unexpected int(ACL_MODEL_CAPTURE_STATUS_ACTIVE) value"); + int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE) == 1, + "unexpected int(ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE) value"); static_assert( - int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_INVALIDATED) == 2, - "unexpected int(ACL_MODEL_CAPTURE_STATUS_INVALIDATED) value"); + int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_INVALIDATED) == 2, + "unexpected int(ACL_MODEL_RI_CAPTURE_STATUS_INVALIDATED) value"); enum class CaptureStatus : int { - None = int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_NONE), - Active = int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE), - Invalidated = int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_INVALIDATED) + None = int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_NONE), + Active = int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE), + Invalidated = int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_INVALIDATED) }; inline std::ostream &operator<<(std::ostream &os, CaptureStatus status) @@ -67,18 +65,7 @@ inline std::ostream &operator<<(std::ostream &os, CaptureStatus status) } // Use this version where you're sure a CUDA context exists already. -inline CaptureStatus currentStreamCaptureStatusMayInitCtx() -{ - if (!c10_npu::acl::IsCaptureSupported()) { - return CaptureStatus::None; - } - - aclmdlCaptureStatus is_capturing{ACL_MODEL_CAPTURE_STATUS_NONE}; - uint32_t modelId; - NPU_CHECK_ERROR( - c10_npu::acl::AclmdlCaptureGetInfo(c10_npu::getCurrentNPUStream(), &is_capturing, &modelId)); - return CaptureStatus(is_capturing); -} +C10_NPU_API CaptureStatus currentStreamCaptureStatusMayInitCtx(); // Use this version where you don't want to create a CUDA context if none exists. inline CaptureStatus currentStreamCaptureStatus() @@ -99,7 +86,22 @@ inline void assertNotCapturing(const std::string &attempt) " during NPU graph capture. If you need this call to be captured, " "please file an issue. " "Current npuStreamCaptureStatus: ", - status); + status, + PTA_ERROR(ErrCode::NOT_SUPPORT)); +} + +inline void assertNotCapturingAclop(const std::string &opName) +{ + auto status = currentStreamCaptureStatus(); + TORCH_CHECK(status == CaptureStatus::None, + "Cannot run aclop operators during NPU graph capture. Current working aclop is ", + opName, + ". If you need this call to be captured, " + "please try to set torch.npu.config.allow_internal_format = False. " + "If still fail, the operator needs aclnn implementation and please file an issue. " + "Current npuStreamCaptureStatus: ", + status, + PTA_ERROR(ErrCode::NOT_SUPPORT)); } } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUGuard.h b/torch_npu/csrc/core/npu/NPUGuard.h index 9486bf789ee0b921ef2dfd1bf9b1e05945ce307c..995beb9dfc3300734f384f98c5ed492604b540a1 100644 --- a/torch_npu/csrc/core/npu/NPUGuard.h +++ b/torch_npu/csrc/core/npu/NPUGuard.h @@ -9,128 +9,137 @@ #include namespace c10_npu { - // This code is kind of boilerplatey. 
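`assertNotCapturingAclop` gives aclop-backed operators a dedicated failure message during graph capture, pointing users to `torch.npu.config.allow_internal_format = False` before suggesting an aclnn implementation. A hedged example of how an operator might call it; `npu_legacy_op` is hypothetical:

```cpp
// Sketch only: `npu_legacy_op` is a made-up aclop-backed operator.
#include <ATen/ATen.h>
#include "torch_npu/csrc/core/npu/NPUGraphsUtils.h"

at::Tensor npu_legacy_op(const at::Tensor& self)
{
    // Refuse to run an aclop kernel while a graph capture is active;
    // only aclnn-based launches can be recorded into an NPU graph.
    c10_npu::assertNotCapturingAclop("npu_legacy_op");
    // ... dispatch to the aclop execution path ...
    return self;
}
```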
See Note [Whither the DeviceGuard // boilerplate] -/// A variant of DeviceGuard that is specialized for NPU. It accepts -/// integer indices (interpreting them as NPU devices) and is a little -/// more efficient than DeviceGuard (it compiles to straight line -/// NPUSetDevice/NPUGetDevice calls); however, it can only be used -/// from code that links against NPU directly. +// / A variant of DeviceGuard that is specialized for NPU. It accepts +// / integer indices (interpreting them as NPU devices) and is a little +// / more efficient than DeviceGuard (it compiles to straight line +// / NPUSetDevice/NPUGetDevice calls); however, it can only be used +// / from code that links against NPU directly. struct NPUGuard { - /// No default constructor; see Note [Omitted default constructor from RAII] + // / No default constructor; see Note [Omitted default constructor from RAII] explicit NPUGuard() = delete; - /// Set the current NPU device to the passed device index. + // / Set the current NPU device to the passed device index. explicit NPUGuard(c10::DeviceIndex device_index) : guard_(device_index) {} - /// Sets the current NPU device to the passed device. Errors if the passed - /// device is not a NPU device. + // / Sets the current NPU device to the passed device. Errors if the passed + // / device is not a NPU device. explicit NPUGuard(c10::Device device) : guard_(device) {} // Copy is not allowed - NPUGuard(const NPUGuard&) = delete; - NPUGuard& operator=(const NPUGuard&) = delete; + NPUGuard(const NPUGuard &) = delete; + NPUGuard &operator = (const NPUGuard &) = delete; // Move is not allowed (there is no uninitialized state) - NPUGuard(NPUGuard&& other) = delete; - NPUGuard& operator=(NPUGuard&& other) = delete; + NPUGuard(NPUGuard &&other) = delete; + NPUGuard &operator = (NPUGuard &&other) = delete; - /// Sets the NPU device to the given device. Errors if the given device - /// is not a NPU device. - void set_device(c10::Device device) { + // / Sets the NPU device to the given device. Errors if the given device + // / is not a NPU device. + void set_device(c10::Device device) + { guard_.set_device(device); } - /// Sets the NPU device to the given device. Errors if the given device - /// is not a NPU device. (This method is provided for uniformity with - /// DeviceGuard). - void reset_device(c10::Device device) { + // / Sets the NPU device to the given device. Errors if the given device + // / is not a NPU device. (This method is provided for uniformity with + // / DeviceGuard). + void reset_device(c10::Device device) + { guard_.reset_device(device); } - /// Sets the NPU device to the given device index. - void set_index(c10::DeviceIndex device_index) { + // / Sets the NPU device to the given device index. + void set_index(c10::DeviceIndex device_index) + { guard_.set_index(device_index); } - /// Returns the device that was set upon construction of the guard - c10::Device original_device() const { + // / Returns the device that was set upon construction of the guard + c10::Device original_device() const + { return guard_.original_device(); } - /// Returns the last device that was set via `set_device`, if any, otherwise - /// the device passed during construction. - c10::Device current_device() const { + // / Returns the last device that was set via `set_device`, if any, otherwise + // / the device passed during construction. + c10::Device current_device() const + { return guard_.current_device(); } private: - /// The guard for the current device. + // / The guard for the current device. 
c10::impl::InlineDeviceGuard guard_; }; -/// A variant of OptionalDeviceGuard that is specialized for NPU. See -/// NPUGuard for when you can use this. +// / A variant of OptionalDeviceGuard that is specialized for NPU. See +// / NPUGuard for when you can use this. struct OptionalNPUGuard { - /// Create an uninitialized OptionalNPUGuard. + // / Create an uninitialized OptionalNPUGuard. explicit OptionalNPUGuard() : guard_() {} - /// Set the current NPU device to the passed Device, if it is not nullopt. + // / Set the current NPU device to the passed Device, if it is not nullopt. explicit OptionalNPUGuard(c10::optional device_opt) : guard_(device_opt) {} - /// Set the current NPU device to the passed device index, if it is not - /// nullopt - explicit OptionalNPUGuard(c10::optional device_index_opt) - : guard_(device_index_opt) {} + // / Set the current NPU device to the passed device index, if it is not + // / nullopt + explicit OptionalNPUGuard(c10::optional device_index_opt) : guard_(device_index_opt) {} // Copy is not allowed - OptionalNPUGuard(const OptionalNPUGuard&) = delete; - OptionalNPUGuard& operator=(const OptionalNPUGuard&) = delete; + OptionalNPUGuard(const OptionalNPUGuard &) = delete; + OptionalNPUGuard &operator = (const OptionalNPUGuard &) = delete; // See Note [Move construction for RAII guards is tricky] - OptionalNPUGuard(OptionalNPUGuard&& other) = delete; + OptionalNPUGuard(OptionalNPUGuard &&other) = delete; // See Note [Move assignment for RAII guards is tricky] - OptionalNPUGuard& operator=(OptionalNPUGuard&& other) = delete; + OptionalNPUGuard &operator = (OptionalNPUGuard &&other) = delete; - /// Sets the NPU device to the given device, initializing the guard if it - /// is not already initialized. Errors if the given device is not a NPU - /// device. - void set_device(c10::Device device) { + // / Sets the NPU device to the given device, initializing the guard if it + // / is not already initialized. Errors if the given device is not a NPU + // / device. + void set_device(c10::Device device) + { guard_.set_device(device); } - /// Sets the NPU device to the given device, initializing the guard if it is - /// not already initialized. Errors if the given device is not a NPU device. - /// (This method is provided for uniformity with OptionalDeviceGuard). - void reset_device(c10::Device device) { + // / Sets the NPU device to the given device, initializing the guard if it is + // / not already initialized. Errors if the given device is not a NPU device. + // / (This method is provided for uniformity with OptionalDeviceGuard). + void reset_device(c10::Device device) + { guard_.reset_device(device); } - /// Sets the NPU device to the given device index, initializing the guard if - /// it is not already initialized. - void set_index(c10::DeviceIndex device_index) { + // / Sets the NPU device to the given device index, initializing the guard if + // / it is not already initialized. + void set_index(c10::DeviceIndex device_index) + { guard_.set_index(device_index); } - /// Returns the device that was set immediately prior to initialization of the - /// guard, or nullopt if the guard is uninitialized. - c10::optional original_device() const { + // / Returns the device that was set immediately prior to initialization of the + // / guard, or nullopt if the guard is uninitialized. 
+ c10::optional original_device() const + { return guard_.original_device(); } - /// Returns the most recent device that was set using this device guard, - /// either from construction, or via set_device, if the guard is initialized, - /// or nullopt if the guard is uninitialized. - c10::optional current_device() const { + // / Returns the most recent device that was set using this device guard, + // / either from construction, or via set_device, if the guard is initialized, + // / or nullopt if the guard is uninitialized. + c10::optional current_device() const + { return guard_.current_device(); } - /// Restore the original NPU device, resetting this guard to uninitialized - /// state. - void reset() { + // / Restore the original NPU device, resetting this guard to uninitialized + // / state. + void reset() + { guard_.reset(); } @@ -138,62 +147,67 @@ private: c10::impl::InlineOptionalDeviceGuard guard_; }; -/// A variant of StreamGuard that is specialized for NPU. See NPUGuard -/// for when you can use this. +// / A variant of StreamGuard that is specialized for NPU. See NPUGuard +// / for when you can use this. struct NPUStreamGuard { - /// No default constructor, see Note [Omitted default constructor from RAII] + // / No default constructor, see Note [Omitted default constructor from RAII] explicit NPUStreamGuard() = delete; - /// Set the current NPU device to the device associated with the passed - /// stream, and set the current NPU stream on that device to the passed - /// stream. Errors if the Stream is not a NPU stream. + // / Set the current NPU device to the device associated with the passed + // / stream, and set the current NPU stream on that device to the passed + // / stream. Errors if the Stream is not a NPU stream. explicit NPUStreamGuard(c10::Stream stream) : guard_(stream) {} - /// Copy is disallowed - NPUStreamGuard(const NPUStreamGuard&) = delete; - NPUStreamGuard& operator=(const NPUStreamGuard&) = delete; - - /// Move is disallowed, as NPUStreamGuard does not have an uninitialized - /// state, which is required for moves on types with nontrivial destructors. - NPUStreamGuard(NPUStreamGuard&& other) = delete; - NPUStreamGuard& operator=(NPUStreamGuard&& other) = delete; - - /// Resets the currently set stream to the original stream and - /// the currently set device to the original device. Then, - /// set the current device to the device associated with the passed stream, - /// and set the current stream on that device to the passed stream. - /// Errors if the stream passed is not a NPU stream. - /// - /// NOTE: this implementation may skip some stream/device setting if - /// it can prove that it is unnecessary. - /// - /// WARNING: reset_stream does NOT preserve previously set streams on - /// different devices. If you need to set streams on multiple devices - /// on NPU, use NPUMultiStreamGuard instead. - void reset_stream(c10::Stream stream) { + // / Copy is disallowed + NPUStreamGuard(const NPUStreamGuard &) = delete; + NPUStreamGuard &operator = (const NPUStreamGuard &) = delete; + + // / Move is disallowed, as NPUStreamGuard does not have an uninitialized + // / state, which is required for moves on types with nontrivial destructors. + NPUStreamGuard(NPUStreamGuard &&other) = delete; + NPUStreamGuard &operator = (NPUStreamGuard &&other) = delete; + + // / Resets the currently set stream to the original stream and + // / the currently set device to the original device. 
Then, + // / set the current device to the device associated with the passed stream, + // / and set the current stream on that device to the passed stream. + // / Errors if the stream passed is not a NPU stream. + // / + // / NOTE: this implementation may skip some stream/device setting if + // / it can prove that it is unnecessary. + // / + // / WARNING: reset_stream does NOT preserve previously set streams on + // / different devices. If you need to set streams on multiple devices + // / on NPU, use NPUMultiStreamGuard instead. + void reset_stream(c10::Stream stream) + { guard_.reset_stream(stream); } - /// Returns the NPU stream that was set at the time the guard was constructed. - NPUStream original_stream() const { + // / Returns the NPU stream that was set at the time the guard was constructed. + NPUStream original_stream() const + { return NPUStream(NPUStream::UNCHECKED, guard_.original_stream()); } - /// Returns the most recent NPU stream that was set using this device guard, - /// either from construction, or via set_stream. - NPUStream current_stream() const { + // / Returns the most recent NPU stream that was set using this device guard, + // / either from construction, or via set_stream. + NPUStream current_stream() const + { return NPUStream(NPUStream::UNCHECKED, guard_.current_stream()); } - /// Returns the most recent NPU device that was set using this device guard, - /// either from construction, or via set_device/reset_device/set_index. - c10::Device current_device() const { + // / Returns the most recent NPU device that was set using this device guard, + // / either from construction, or via set_device/reset_device/set_index. + c10::Device current_device() const + { return guard_.current_device(); } - /// Returns the NPU device that was set at the most recent reset_stream(), - /// or otherwise the device at construction time. - c10::Device original_device() const { + // / Returns the NPU device that was set at the most recent reset_stream(), + // / or otherwise the device at construction time. + c10::Device original_device() const + { return guard_.original_device(); } @@ -201,101 +215,103 @@ private: c10::impl::InlineStreamGuard guard_; }; -/// A variant of OptionalStreamGuard that is specialized for NPU. See NPUGuard -/// for when you can use this. +// / A variant of OptionalStreamGuard that is specialized for NPU. See NPUGuard +// / for when you can use this. struct OptionalNPUStreamGuard { - /// Create an uninitialized guard. - explicit OptionalNPUStreamGuard() : guard_() {} - - /// Set the current NPU device to the device associated with the passed - /// stream, and set the current NPU stream on that device to the passed - /// stream. Errors if the Stream is not a NPU stream. - explicit OptionalNPUStreamGuard(c10::Stream stream) : guard_(stream) {} - - /// Set the current device to the device associated with the passed stream, - /// and set the current stream on that device to the passed stream, - /// if the passed stream is not nullopt. 
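The `NPUGuard.h` hunk is largely a comment and brace reformat, so a short reminder of how these RAII guards are used may help: constructing an `NPUStreamGuard` switches both the current device and the current stream, and its destructor restores them. A sketch under that assumption:

```cpp
// Sketch only: assumes `stream` was obtained from the NPU stream pool
// and that NPUStream converts implicitly to c10::Stream, as in the CUDA analog.
#include "torch_npu/csrc/core/npu/NPUGuard.h"
#include "torch_npu/csrc/core/npu/NPUStream.h"

void run_on_side_stream(c10_npu::NPUStream stream)
{
    c10_npu::NPUStreamGuard guard(stream);  // sets device and current stream
    // ... enqueue work; getCurrentNPUStream() now returns `stream` ...
}   // destructor restores the original device and stream
```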
- explicit OptionalNPUStreamGuard(c10::optional stream_opt) - : guard_(stream_opt) {} - - /// Copy is disallowed - OptionalNPUStreamGuard(const OptionalNPUStreamGuard&) = delete; - OptionalNPUStreamGuard& operator=(const OptionalNPUStreamGuard&) = delete; - - // See Note [Move construction for RAII guards is tricky] - OptionalNPUStreamGuard(OptionalNPUStreamGuard&& other) = delete; - - // See Note [Move assignment for RAII guards is tricky] - OptionalNPUStreamGuard& operator=(OptionalNPUStreamGuard&& other) = delete; - - /// Resets the currently set NPU stream to the original stream and - /// the currently set device to the original device. Then, - /// set the current device to the device associated with the passed stream, - /// and set the current stream on that device to the passed stream. - /// Initializes the guard if it was not previously initialized. - void reset_stream(c10::Stream stream) { - guard_.reset_stream(stream); - } - - /// Returns the NPU stream that was set at the time the guard was most - /// recently initialized, or nullopt if the guard is uninitialized. - c10::optional original_stream() const { - auto r = guard_.original_stream(); - if (r.has_value()) { - return c10::make_optional(NPUStream(NPUStream::UNCHECKED, r.value())); - } else { - return c10::nullopt; + // / Create an uninitialized guard. + explicit OptionalNPUStreamGuard() : guard_() {} + + // / Set the current NPU device to the device associated with the passed + // / stream, and set the current NPU stream on that device to the passed + // / stream. Errors if the Stream is not a NPU stream. + explicit OptionalNPUStreamGuard(c10::Stream stream) : guard_(stream) {} + + // / Set the current device to the device associated with the passed stream, + // / and set the current stream on that device to the passed stream, + // / if the passed stream is not nullopt. + explicit OptionalNPUStreamGuard(c10::optional stream_opt) : guard_(stream_opt) {} + + // / Copy is disallowed + OptionalNPUStreamGuard(const OptionalNPUStreamGuard &) = delete; + OptionalNPUStreamGuard &operator = (const OptionalNPUStreamGuard &) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalNPUStreamGuard(OptionalNPUStreamGuard &&other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalNPUStreamGuard &operator = (OptionalNPUStreamGuard &&other) = delete; + + // / Resets the currently set NPU stream to the original stream and + // / the currently set device to the original device. Then, + // / set the current device to the device associated with the passed stream, + // / and set the current stream on that device to the passed stream. + // / Initializes the guard if it was not previously initialized. + void reset_stream(c10::Stream stream) + { + guard_.reset_stream(stream); } - } - - /// Returns the most recent NPU stream that was set using this stream guard, - /// either from construction, or via reset_stream, if the guard is - /// initialized, or nullopt if the guard is uninitialized. - c10::optional current_stream() const { - auto r = guard_.current_stream(); - if (r.has_value()) { - return c10::make_optional(NPUStream(NPUStream::UNCHECKED, r.value())); - } else { - return c10::nullopt; + + // / Returns the NPU stream that was set at the time the guard was most + // / recently initialized, or nullopt if the guard is uninitialized. 
+ c10::optional original_stream() const + { + auto r = guard_.original_stream(); + if (r.has_value()) { + return c10::make_optional(NPUStream(NPUStream::UNCHECKED, r.value())); + } else { + return c10::nullopt; + } + } + + // / Returns the most recent NPU stream that was set using this stream guard, + // / either from construction, or via reset_stream, if the guard is + // / initialized, or nullopt if the guard is uninitialized. + c10::optional current_stream() const + { + auto r = guard_.current_stream(); + if (r.has_value()) { + return c10::make_optional(NPUStream(NPUStream::UNCHECKED, r.value())); + } else { + return c10::nullopt; + } } - } - /// Restore the original NPU device and stream, resetting this guard to - /// uninitialized state. - void reset() { - guard_.reset(); - } + // / Restore the original NPU device and stream, resetting this guard to + // / uninitialized state. + void reset() + { + guard_.reset(); + } private: - c10::impl::InlineOptionalStreamGuard guard_; + c10::impl::InlineOptionalStreamGuard guard_; }; -/// A variant of MultiStreamGuard that is specialized for NPU. +// / A variant of MultiStreamGuard that is specialized for NPU. struct NPUMultiStreamGuard { - explicit NPUMultiStreamGuard(at::ArrayRef streams) - : guard_(unwrapStreams(streams)) {} + explicit NPUMultiStreamGuard(at::ArrayRef streams) : guard_(unwrapStreams(streams)) {} - /// Copy is disallowed - NPUMultiStreamGuard(const NPUMultiStreamGuard&) = delete; - NPUMultiStreamGuard& operator=(const NPUMultiStreamGuard&) = delete; + // / Copy is disallowed + NPUMultiStreamGuard(const NPUMultiStreamGuard &) = delete; + NPUMultiStreamGuard &operator = (const NPUMultiStreamGuard &) = delete; - // See Note [Move construction for RAII guards is tricky] - NPUMultiStreamGuard(NPUMultiStreamGuard&& other) = delete; + // See Note [Move construction for RAII guards is tricky] + NPUMultiStreamGuard(NPUMultiStreamGuard &&other) = delete; - // See Note [Move assignment for RAII guards is tricky] - NPUMultiStreamGuard& operator=(NPUMultiStreamGuard&& other) = delete; + // See Note [Move assignment for RAII guards is tricky] + NPUMultiStreamGuard &operator = (NPUMultiStreamGuard &&other) = delete; private: - c10::impl::InlineMultiStreamGuard guard_; - - static std::vector unwrapStreams(at::ArrayRef NPUStreams) { - std::vector streams; - streams.reserve(NPUStreams.size()); - for (const NPUStream& NPUStream : NPUStreams) { - streams.push_back(NPUStream); + c10::impl::InlineMultiStreamGuard guard_; + + static std::vector unwrapStreams(at::ArrayRef NPUStreams) + { + std::vector streams; + streams.reserve(NPUStreams.size()); + for (const NPUStream &NPUStream : NPUStreams) { + streams.push_back(NPUStream); + } + return streams; } - return streams; - } }; - } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 738780c80e0c937812f04ec4c522c464ff865bfd..8ff81a599d6a1e1c92b0899d8e1dcb81be6497f5 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -20,91 +20,105 @@ #include namespace c10_npu { - -struct timeval delay = {0, 1}; +struct timeval delay = { 0, 1 }; namespace { - class CallBackManager { public: CallBackManager() {} ~CallBackManager() {} - void SetExec(const ACL_EXEC_FUNC& func) { + void SetExec(const ACL_EXEC_FUNC &func) + { this->execFunc = func; } - void SetCopy(const ACL_COPY_FUNC& func) { + void SetCopy(const ACL_COPY_FUNC &func) + { this->copyFunc = func; } - void SetRelease(const 
ACL_RELEASE_FUNC& func) { + void SetRelease(const ACL_RELEASE_FUNC &func) + { this->releaseFunc = func; } - void SetCopyReleaseParam(const ACL_COPY_RELEASE_PARM_FUNC& func) { + void SetCopyReleaseParam(const ACL_COPY_RELEASE_PARM_FUNC &func) + { this->copyReleaseParamFunc = func; } - void SetReleaseParam(const ACL_RELEASE_PARAM_FUNC& func) { + void SetReleaseParam(const ACL_RELEASE_PARAM_FUNC &func) + { this->releaseParamFunc = func; } - void SetNew(const ACL_NEW_FUNC& func) { + void SetNew(const ACL_NEW_FUNC &func) + { this->newFunc = func; } - void SetDelete(const ACL_DELETE_FUNC& func) { + void SetDelete(const ACL_DELETE_FUNC &func) + { this->deleteFunc = func; } - void *getCurrentParams(void* head, int offset) + void *getCurrentParams(void *head, int offset) { - return (uint8_t*)head + sizePerParams * offset; + return (uint8_t *)head + sizePerParams * offset; } - int Call(void* head, int offset) { + int Call(void *head, int offset) + { TORCH_CHECK(this->execFunc, "Failed to find execution function.", PTA_ERROR(ErrCode::NOT_FOUND)); - auto dstPtr = (uint8_t*)head + sizePerParams * offset; + auto dstPtr = (uint8_t *)head + sizePerParams * offset; return this->execFunc(dstPtr); } - void Copy(void* dstHead, int offset, void* src) { + void Copy(void *dstHead, int offset, void *src) + { TORCH_CHECK(this->copyFunc, "Failed to find copy function.", PTA_ERROR(ErrCode::NOT_FOUND)); - auto dstPtr = (uint8_t*)dstHead + sizePerParams * offset; + auto dstPtr = (uint8_t *)dstHead + sizePerParams * offset; return this->copyFunc(dstPtr, src); } - void Release(void* head, int offset, ReleaseQueue& releaseQueue) { + void Release(void *head, int offset, ReleaseQueue &releaseQueue) + { TORCH_CHECK(this->releaseFunc, "Failed to find release function.", PTA_ERROR(ErrCode::NOT_FOUND)); - auto ptr = (uint8_t*)head + sizePerParams * offset; + auto ptr = (uint8_t *)head + sizePerParams * offset; return this->releaseFunc(ptr, releaseQueue); } - void CopyRealseParam(void* dstHead, int offset, void* src) { - TORCH_CHECK(this->copyReleaseParamFunc, "Failed to find copy release params function.", PTA_ERROR(ErrCode::NOT_FOUND)); - auto dstPtr = (uint8_t*)dstHead + sizePerParams * offset; + void CopyRealseParam(void *dstHead, int offset, void *src) + { + TORCH_CHECK(this->copyReleaseParamFunc, "Failed to find copy release params function.", + PTA_ERROR(ErrCode::NOT_FOUND)); + auto dstPtr = (uint8_t *)dstHead + sizePerParams * offset; return this->copyReleaseParamFunc(dstPtr, src); } - void ReleaseParam(void* head, int offset) { + void ReleaseParam(void *head, int offset) + { TORCH_CHECK(this->releaseParamFunc, "Failed to find release params function.", PTA_ERROR(ErrCode::NOT_FOUND)); - auto ptr = (uint8_t*)head + sizePerParams * offset; + auto ptr = (uint8_t *)head + sizePerParams * offset; return this->releaseParamFunc(ptr); } - void* Init(int capacity) { + void *Init(int capacity) + { TORCH_CHECK(this->newFunc, "Failed to find new function.", PTA_ERROR(ErrCode::NOT_FOUND)); - void* ptr = this->newFunc(capacity, sizePerParams); // not check as CUDA + void *ptr = this->newFunc(capacity, sizePerParams); // not check as CUDA return ptr; } - void DeInit(void* ptr) { + void DeInit(void *ptr) + { if (ptr != nullptr) { TORCH_CHECK(this->deleteFunc, "Failed to find delete function.", PTA_ERROR(ErrCode::NOT_FOUND)); this->deleteFunc(ptr); ptr = nullptr; } } + private: int sizePerParams = 0; ACL_EXEC_FUNC execFunc = nullptr; @@ -116,22 +130,24 @@ private: ACL_RELEASE_PARAM_FUNC releaseParamFunc = nullptr; }; // class 
CallBackManager -CallBackManager& manager() { +CallBackManager &manager() +{ static CallBackManager instance; return instance; } -CallBackManager& releaseManager() { +CallBackManager &releaseManager() +{ static CallBackManager releaseinstance; return releaseinstance; } } // namespace namespace register_queue_cb { -NPUCallBackRegisterBuilder::NPUCallBackRegisterBuilder(const ACL_EXEC_FUNC& execFunc, - const ACL_COPY_FUNC& copyFunc, const ACL_RELEASE_FUNC& releaseFunc, - const ACL_NEW_FUNC& newFunc, const ACL_DELETE_FUNC& deleteFunc, - const ACL_COPY_RELEASE_PARM_FUNC& copyReleaseParamF, const ACL_RELEASE_PARAM_FUNC& releaseParamF) { +NPUCallBackRegisterBuilder::NPUCallBackRegisterBuilder(const ACL_EXEC_FUNC &execFunc, const ACL_COPY_FUNC ©Func, + const ACL_RELEASE_FUNC &releaseFunc, const ACL_NEW_FUNC &newFunc, const ACL_DELETE_FUNC &deleteFunc, + const ACL_COPY_RELEASE_PARM_FUNC ©ReleaseParamF, const ACL_RELEASE_PARAM_FUNC &releaseParamF) +{ manager().SetExec(execFunc); manager().SetCopy(copyFunc); manager().SetRelease(releaseFunc); @@ -153,7 +169,16 @@ static constexpr size_t kQueueCapacity = 4096; static std::string repo_error; static std::string acl_error; -std::string get_func_error_msg(void* error_paras) +std::unordered_map deviceErrorMap = { + {RepoStatus::UCE_EXIT, "UCE ERROR"}, + {RepoStatus::HBM_ECC_EXIT, "HBM MULTI BIT ECC ERROR"}, + {RepoStatus::STOP_EXIT, "FORCE STOP"}, + {RepoStatus::SUSPECT_MEM_EXIT, "SUSPECT MEM ERROR"}, + {RepoStatus::HCCS_LINK_EXIT, "HCCS LINK ERROR"}, + {RepoStatus::HCCL_OP_RETRY_EXIT, "HCCL OP RETRY FAILED"} +}; + +std::string get_func_error_msg(void *error_paras) { auto queueParam = static_cast(error_paras); auto type = queueParam->paramType; @@ -169,9 +194,8 @@ std::string get_func_error_msg(void* error_paras) result << "the current working operator name is " << op_name; } else if (type == c10_npu::queue::ASYNC_MEMCPY) { auto cur_paras = static_cast(queueParam->paramVal); - result << "the current copy params are srclen=" << cur_paras->srcLen - << ", dstlen=" << cur_paras->dstLen - << ", kind=" << cur_paras->kind; + result << "the current copy params are srclen=" << cur_paras->srcLen << ", dstlen=" << cur_paras->dstLen << + ", kind=" << cur_paras->kind; } else { auto cur_paras = static_cast(queueParam->paramVal); result << "the current working event is " << cur_paras->event; @@ -179,7 +203,8 @@ std::string get_func_error_msg(void* error_paras) return result.str(); } -RepoStatus Repository::GetStatus() const { +RepoStatus Repository::GetStatus() const +{ if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call GetStatus(). !!"); } @@ -187,7 +212,8 @@ RepoStatus Repository::GetStatus() const { return repo_status.load(); } -void Repository::SetStatus(RepoStatus desired) { +void Repository::SetStatus(RepoStatus desired) +{ if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call SetStatus(). !!"); return; @@ -196,7 +222,8 @@ void Repository::SetStatus(RepoStatus desired) { repo_status = desired; } -void Repository::ChangeStatus(RepoStatus expected, RepoStatus desired) { +void Repository::ChangeStatus(RepoStatus expected, RepoStatus desired) +{ if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call ChangeStatus(). 
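The task-queue changes centralize the per-status error strings in `deviceErrorMap`, so `MakeSureQueueEmpty`, `WriteQueue`, and `Enqueue` can share one lookup instead of repeating a chain of `if (GetStatus() == ...)` blocks. A minimal standalone sketch of that lookup with a hypothetical status enum:

```cpp
#include <string>
#include <unordered_map>

// Hypothetical stand-in for RepoStatus; the real enum lives in NPUQueue.h.
enum class Status { RUN, UCE_EXIT, HBM_ECC_EXIT, HCCS_LINK_EXIT };

static const std::unordered_map<Status, std::string> kStatusErrors = {
    {Status::UCE_EXIT, "UCE ERROR"},
    {Status::HBM_ECC_EXIT, "HBM MULTI BIT ECC ERROR"},
    {Status::HCCS_LINK_EXIT, "HCCS LINK ERROR"},
};

// Returns the message to raise for a failed status, or "" when the queue is healthy.
std::string errorForStatus(Status status)
{
    auto it = kStatusErrors.find(status);
    return it == kStatusErrors.end() ? std::string() : it->second;
}
```

Keeping the mapping in one table means a new exit status only needs a new enum value plus one table entry, and every caller picks up the same wording.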
!!"); return; @@ -207,6 +234,8 @@ void Repository::ChangeStatus(RepoStatus expected, RepoStatus desired) { NPUStatus Repository::MakeSureQueueEmpty(bool check_error) { + std::string error_msg; + std::string runtime_error; if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call MakeSureQueueEmpty(). !!"); return FAILED; @@ -252,31 +281,23 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } } - if (GetStatus() == RepoStatus::UCE_EXIT) { - if (check_error) { - throw std::runtime_error("UCE ERROR." + PTA_ERROR(ErrCode::ACL)); - } else { - ASCEND_LOGE("UCE ERROR happend."); - } - } else if (GetStatus() == RepoStatus::HBM_ECC_EXIT) { - if (check_error) { - std::string error_msg = c10_npu::c10_npu_get_error_message(); - throw std::runtime_error("HBM MULTI BIT ECC ERROR." + error_msg + PTA_ERROR(ErrCode::ACL)); - } else { - ASCEND_LOGE("HBM MULTI BIT ECC ERROR happend."); + const RepoStatus current_status = GetStatus(); + auto iter = deviceErrorMap.find(current_status); + if (iter != deviceErrorMap.end()) { + std::string throwError = iter->second; + std::string error_msg; + if (current_status != RepoStatus::STOP_EXIT && current_status != RepoStatus::UCE_EXIT) { + error_msg = c10_npu::c10_npu_get_error_message(); } + runtime_error = throwError + ", " + error_msg + PTA_ERROR(ErrCode::ACL); + error_msg = throwError + " happend."; } - if (GetStatus() == RepoStatus::STOP_EXIT) { - if (check_error) { - ASCEND_LOGE("getRepoStopFlag in EmptyQueue, throw FORCE STOP."); - throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); - } else { - ASCEND_LOGE("FORCE STOP happend."); - } + if (current_status == RepoStatus::CAN_EXIT) { + error_msg = "Inner error happend with CAN_EXIT status, detail: " + repo_error; } - if (GetStatus() == RepoStatus::ERROR_EXIT) { + if (current_status == RepoStatus::ERROR_EXIT) { // Avoid repeatedly throwing exceptions SetStatus(CAN_EXIT); @@ -287,26 +308,18 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) c10_npu::option::oom_observer(); } } - -#ifndef BUILD_LIBTORCH - if (gilState) { - PyEval_RestoreThread(gilState); - } -#endif - if (check_error) { - throw std::runtime_error("The Inner error is reported as above. " - "The process exits for this inner error, and " + repo_error + ".\n" + - "Since the operator is called asynchronously, the stacktrace may be inaccurate. " - "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + - "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " - "resulting in performance degradation. " - "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + - PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error); - } else { - ASCEND_LOGE("Inner error happend, detail: %s", repo_error); - } + runtime_error = "The Inner error is reported as above. " + "The process exits for this inner error, and " + + repo_error + ".\n" + + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " + "If you want to get the accurate stacktrace, " + "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " + "resulting in performance degradation. " + "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." 
+ + PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error; + error_msg = "Inner error happend, detail: " + repo_error; } #ifndef BUILD_LIBTORCH @@ -316,23 +329,22 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } #endif + if (!error_msg.empty()) { + ASCEND_LOGE(error_msg); + } + if (check_error && !runtime_error.empty()) { + throw std::runtime_error(runtime_error); + } + return SUCCESS; } -bool Repository::WriteQueue(void* cur_paras) { +bool Repository::WriteQueue(void *cur_paras) +{ std::lock_guard lock(mu_enqueue); - if (GetStatus() == RepoStatus::STOP_EXIT) { - auto queueParam = static_cast(cur_paras); - auto type = queueParam->paramType; - // The RECORD_EVENT in the destructor process should not throw an exception. - if (type == c10_npu::queue::LAZY_DESTROY_EVENT || type == c10_npu::queue::RECORD_EVENT) { - return true; - } else { - ASCEND_LOGE("getRepoStopFlag in WriteQueue, throw FORCE STOP."); - throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); - } - } + const RepoStatus current_status = GetStatus(); + ThrowDeviceError(current_status, cur_paras); if (IsFullQueue()) { return false; @@ -346,6 +358,33 @@ bool Repository::WriteQueue(void* cur_paras) { return true; } +void Repository::CheckDeviceError(int ret, std::string& err_msg) +{ + if (ret != ACL_ERROR_RT_DEVICE_TASK_ABORT && ret != ACL_ERROR_RT_DEVICE_MEM_ERROR) { + acl_error = c10_npu::c10_npu_get_error_message(); + } + if (ret == ACL_ERROR_RT_DEVICE_MEM_ERROR || acl_error.find(DEVICE_HBM_ECC_ERROR) != std::string::npos) { + if (checkUceErrAndRepair(false, err_msg)) { + ASCEND_LOGE("UCE ERROR happened, set task queue status to UCE_EXIT"); + SetStatus(UCE_EXIT); + } + } else if (ret == ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR || acl_error.find(DEVICE_HBM_ECC_ERROR) != std::string::npos) { + record_mem_hbm_ecc_error(); + SetStatus(HBM_ECC_EXIT); + } else if (ret == ACL_ERROR_RT_SUSPECT_DEVICE_MEM_ERROR || acl_error.find(SUSPECT_DEVICE_MEM_ERROR) != std::string::npos) { + ASCEND_LOGE("SUSPECT MEM ERROR happened, set task queue status to SUSPECT_MEM_EXIT"); + SetStatus(SUSPECT_MEM_EXIT); + } else if (ret == ACL_ERROR_RT_LINK_ERROR || acl_error.find(HCCS_LINK_ERROR) != std::string::npos) { + ASCEND_LOGE("HCCS LINK ERROR happened, set task queue status to HCCS_LINK_EXIT"); + SetStatus(HCCS_LINK_EXIT); + } else if (ret == ACL_ERROR_RT_COMM_OP_RETRY_FAIL || acl_error.find(HCCL_OP_RETRY_FAILED) != std::string::npos) { + ASCEND_LOGE("HCCL OP RETRY FAILED happened, set task queue status to HCCL_OP_RETRY_EXIT"); + SetStatus(HCCL_OP_RETRY_EXIT); + } else if (GetStatus() != STOP_EXIT) { + SetStatus(ERROR_EXIT); + } +} + bool Repository::ReadQueue() { if (IsEmptyQueue()) { @@ -374,26 +413,15 @@ bool Repository::ReadQueue() auto ret = manager().Call(datas, read_idx.idx); #endif if (ret != 0) { - if (ret != ACL_ERROR_RT_DEVICE_TASK_ABORT && ret != ACL_ERROR_RT_DEVICE_MEM_ERROR) { - acl_error = c10_npu::c10_npu_get_error_message(); - } repo_error = get_func_error_msg(manager().getCurrentParams(datas, read_idx.idx)); ASCEND_LOGE("---Thread---%llu: device = %d, write_idx = %u, read_idx = %u, status = %d, ret = %d", - std::this_thread::get_id(), device_idx, write_idx.idx, read_idx.idx, GetStatus(), ret); + std::this_thread::get_id(), device_idx, write_idx.idx, read_idx.idx, GetStatus(), ret); while (!IsEmptyQueue()) { // ignore other tasks manager().Release(datas, read_idx.idx, releaseQueue); read_idx.idx = (read_idx.idx + 1) & (kQueueCapacity - 1); } std::string err_msg; - if (ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && 
checkUceErrAndRepair(false, err_msg)) { - SetStatus(UCE_EXIT); - } else if (ret == ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR || - acl_error.find(DEVICE_HBM_ECC_ERROR) != std::string::npos) { - record_mem_hbm_ecc_error(); - SetStatus(HBM_ECC_EXIT); - } else if (GetStatus() != STOP_EXIT) { - SetStatus(ERROR_EXIT); - } + CheckDeviceError(ret, err_msg); if (!err_msg.empty()) { repo_error = repo_error + ". Other error information exists:" + err_msg; } @@ -410,47 +438,42 @@ bool Repository::ReadQueue() return true; } -void Repository::Enqueue(void* cur_paras) { - if (initialized == false) { - ASCEND_LOGE("Task queue is not initialized, shouldn't call Enqueue(). !!"); +void Repository::ThrowDeviceError(RepoStatus current_status, void* cur_paras) +{ + auto iter = deviceErrorMap.find(current_status); + if (iter == deviceErrorMap.end()) { return; } - - if (GetStatus() == RepoStatus::UCE_EXIT) { - auto queueParam = static_cast(cur_paras); - auto type = queueParam->paramType; - // The RECORD_EVENT in the destructor process should not throw an exception. - if (type == c10_npu::queue::LAZY_DESTROY_EVENT || type == c10_npu::queue::RECORD_EVENT) { - return; - } - ASCEND_LOGE("getUceErrorFlag in Enqueue, throw UCE ERROR."); - throw std::runtime_error("UCE ERROR" + PTA_ERROR(ErrCode::ACL)); + std::string throwError = iter->second; + auto queueParam = static_cast(cur_paras); + auto type = queueParam->paramType; + // The RECORD_EVENT in the destructor process should not throw an exception. + if (type == c10_npu::queue::LAZY_DESTROY_EVENT || type == c10_npu::queue::RECORD_EVENT) { + return; } + ASCEND_LOGE("getUceErrorFlag in Enqueue, throw %s.", throwError.c_str()); + std::string error_msg; + if (current_status != RepoStatus::STOP_EXIT && current_status != RepoStatus::UCE_EXIT) { + error_msg = c10_npu::c10_npu_get_error_message(); + } + throw std::runtime_error(throwError + ", " + error_msg + PTA_ERROR(ErrCode::ACL)); +} - if (GetStatus() == RepoStatus::HBM_ECC_EXIT) { - auto queueParam = static_cast(cur_paras); - auto type = queueParam->paramType; - // The RECORD_EVENT in the destructor process should not throw an exception. - if (type == c10_npu::queue::LAZY_DESTROY_EVENT || type == c10_npu::queue::RECORD_EVENT) { - return; - } - ASCEND_LOGE("getHBMErrorFlag in Enqueue, throw HBM MULTI BIT ECC ERROR."); - std::string error_msg = c10_npu::c10_npu_get_error_message(); - throw std::runtime_error("HBM MULTI BIT ECC ERROR." + error_msg + PTA_ERROR(ErrCode::ACL)); +void Repository::Enqueue(void *cur_paras) +{ + if (initialized == false) { + ASCEND_LOGE("Task queue is not initialized, shouldn't call Enqueue(). !!"); + return; } - if (GetStatus() == RepoStatus::STOP_EXIT) { - auto queueParam = static_cast(cur_paras); - auto type = queueParam->paramType; - // The RECORD_EVENT in the destructor process should not throw an exception. - if (type == c10_npu::queue::LAZY_DESTROY_EVENT || type == c10_npu::queue::RECORD_EVENT) { - return; - } - ASCEND_LOGE("getRepoStopFlag in Enqueue, throw FORCE STOP."); - throw std::runtime_error("FORCE STOP." 
+ PTA_ERROR(ErrCode::ACL)); + const RepoStatus current_status = GetStatus(); + ThrowDeviceError(current_status, cur_paras); + + if (current_status == RepoStatus::CAN_EXIT) { + ASCEND_LOGE("Inner error happend with CAN_EXIT status, detail: %s", repo_error.c_str()); } - if (GetStatus() == RepoStatus::ERROR_EXIT) { + if (current_status == RepoStatus::ERROR_EXIT) { // Avoid repeatedly throwing exceptions SetStatus(CAN_EXIT); @@ -463,17 +486,18 @@ void Repository::Enqueue(void* cur_paras) { } throw std::runtime_error("The Inner error is reported as above. " - "The process exits for this inner error, and " + repo_error + ".\n" + - "Since the operator is called asynchronously, the stacktrace may be inaccurate. " - "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + - "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " - "resulting in performance degradation. " - "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + - PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error); - } - - if (GetStatus() != RUN && GetStatus() != INIT) { + "The process exits for this inner error, and " + + repo_error + ".\n" + + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " + "If you want to get the accurate stacktrace, " + "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " + "resulting in performance degradation. " + "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + + PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error); + } + + if (current_status != RUN && current_status != INIT) { auto queueParam = static_cast(cur_paras); auto type = queueParam->paramType; if (type == c10_npu::queue::EXECUTE_OPAPI) { @@ -487,7 +511,7 @@ void Repository::Enqueue(void* cur_paras) { } else if (type == c10_npu::queue::ASYNC_MEMCPY) { auto cur_paras = static_cast(queueParam->paramVal); ASCEND_LOGW("Task queue thread is exit, can't call Enqueue() for copy, srclen=%zu, dstlen is %zu, kind=%d", - cur_paras->srcLen, cur_paras->dstLen, cur_paras->kind); + cur_paras->srcLen, cur_paras->dstLen, cur_paras->kind); } else { auto cur_paras = static_cast(queueParam->paramVal); ASCEND_LOGW("Task queue thread is exit, can't call Enqueue() for event, event is=%p", cur_paras->event); @@ -499,7 +523,7 @@ void Repository::Enqueue(void* cur_paras) { uint64_t u = 1; SetWriteWorking(true); - while (ret == false) { + while (ret == false && (GetStatus() == RUN || GetStatus() == INIT)) { ret = WriteQueue(cur_paras); if (ret == false) { SetWriteWorking(false); @@ -543,7 +567,8 @@ void Repository::Enqueue(void* cur_paras) { SetWriteWorking(false); } -void Repository::Dequeue() { +void Repository::Dequeue() +{ if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call Dequeue(). 
!!"); return; @@ -556,8 +581,7 @@ void Repository::Dequeue() { SetReadWorking(true); while (ret == false && GetStatus() != RepoStatus::CAN_EXIT) { - if (GetStatus() == RepoStatus::STOP_EXIT || GetStatus() == RepoStatus::UCE_EXIT || - GetStatus() == RepoStatus::HBM_ECC_EXIT) { + if (deviceErrorMap.find(GetStatus()) != deviceErrorMap.end()) { ClearQueue(); c10_npu::NPUEventManager::GetInstance().ClearUnrecordedCount(); std::this_thread::sleep_for(std::chrono::microseconds(1000)); @@ -594,8 +618,7 @@ void Repository::Dequeue() { continue; } __sync_synchronize(); - notify_empty = need_empty && - IsEmptyQueue(); // need_empty && (ret == false || IsEmptyQueue()); + notify_empty = need_empty && IsEmptyQueue(); // need_empty && (ret == false || IsEmptyQueue()); while (notify_empty) { s = eventfd_write(efd_empty, u); if (s != 0) { @@ -623,7 +646,8 @@ void Repository::Dequeue() { SetReadWorking(false); } -void Repository::ReleaseResource() { +void Repository::ReleaseResource() +{ manager().DeInit(datas); if (efd_read > 0) { close(efd_read); @@ -652,12 +676,13 @@ void Repository::SetQueueErrMsg(const char *errmsg) error_msg = errmsg; } -const char* Repository::GetQueueErrMsg() +const char *Repository::GetQueueErrMsg() { return error_msg; } -Repository::~Repository() { +Repository::~Repository() +{ if (initialized) { if (consumer.joinable()) { SetStatus(NEED_EXIT); @@ -669,16 +694,19 @@ Repository::~Repository() { } } -bool Repository::IsFullQueue() const { +bool Repository::IsFullQueue() const +{ return ((write_idx.idx + 1) & (kQueueCapacity - 1)) == read_idx.idx; } -bool Repository::CheckInit() const { +bool Repository::CheckInit() const +{ return initialized; } -void StartConsume(Repository* repo, c10::DeviceIndex device_id) { - SetThreadName(ThreadType::aclThread); +void StartConsume(Repository *repo, c10::DeviceIndex device_id) +{ + SetThreadType(ThreadType::ACL_THREAD); SetThreadAffinity(device_id); aclError ret = c10_npu::SetDevice(device_id); @@ -693,7 +721,8 @@ void StartConsume(Repository* repo, c10::DeviceIndex device_id) { return; } -void Repository::InitRepo(c10::DeviceIndex device_id) { +void Repository::InitRepo(c10::DeviceIndex device_id) +{ if (datas == nullptr) { datas = manager().Init(kQueueCapacity); ASCEND_LOGI("TaskQueue is enable"); @@ -724,7 +753,7 @@ std::string Repository::GetPara() } static constexpr size_t kReleaseQueueCapacity = 8192; -bool ReleaseQueue::WriteToReleaseQueue(void* cur_paras) +bool ReleaseQueue::WriteToReleaseQueue(void *cur_paras) { if (IsFullQueue()) { return false; @@ -737,7 +766,8 @@ bool ReleaseQueue::WriteToReleaseQueue(void* cur_paras) return true; } -void ReleaseQueue::PushToReleaseQueue(void* cur_paras) { +void ReleaseQueue::PushToReleaseQueue(void *cur_paras) +{ if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call PushToReleaseQueue(). !!"); return; @@ -752,7 +782,8 @@ void ReleaseQueue::PushToReleaseQueue(void* cur_paras) { } } -bool ReleaseQueue::ReadFromReleaseQueue() { +bool ReleaseQueue::ReadFromReleaseQueue() +{ if (IsEmptyQueue()) { return false; } @@ -766,7 +797,8 @@ bool ReleaseQueue::ReadFromReleaseQueue() { return true; } -void ReleaseQueue::PopFromReleaseQueue() { +void ReleaseQueue::PopFromReleaseQueue() +{ if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call PopFromReleaseQueue(). 
!!"); return; @@ -786,8 +818,9 @@ void ReleaseQueue::PopFromReleaseQueue() { } } -void StartRelease(ReleaseQueue* releaseQue) { - SetThreadName(ThreadType::releaseThread); +void StartRelease(ReleaseQueue *releaseQue) +{ + SetThreadType(ThreadType::RELEASE_THREAD); SetThreadAffinity(releaseQue->GetDeviceID()); while (releaseQue->GetStatus() != RepoStatus::CAN_EXIT) { @@ -809,7 +842,8 @@ void ReleaseQueue::InitReleaseQueue(c10::DeviceIndex device_id) device_idx = device_id; } -ReleaseQueue::~ReleaseQueue() { +ReleaseQueue::~ReleaseQueue() +{ if (initialized) { if (releaser.joinable()) { SetStatus(NEED_EXIT); @@ -819,11 +853,13 @@ ReleaseQueue::~ReleaseQueue() { releaseManager().DeInit(datas); } -bool ReleaseQueue::IsFullQueue() const { +bool ReleaseQueue::IsFullQueue() const +{ return ((write_idx.idx + 1) % kReleaseQueueCapacity) == read_idx.idx; } -RepoStatus ReleaseQueue::GetStatus() const { +RepoStatus ReleaseQueue::GetStatus() const +{ if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call GetStatus(). !!"); } @@ -837,7 +873,8 @@ c10::DeviceIndex ReleaseQueue::GetDeviceID() const } -void ReleaseQueue::SetStatus(RepoStatus desired) { +void ReleaseQueue::SetStatus(RepoStatus desired) +{ if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call SetStatus(). !!"); return; @@ -846,7 +883,8 @@ void ReleaseQueue::SetStatus(RepoStatus desired) { repo_status = desired; } -void ReleaseQueue::ChangeStatus(RepoStatus expected, RepoStatus desired) { +void ReleaseQueue::ChangeStatus(RepoStatus expected, RepoStatus desired) +{ if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call ChangeStatus(). !!"); return; diff --git a/torch_npu/csrc/core/npu/NPUQueue.h b/torch_npu/csrc/core/npu/NPUQueue.h index f45f2811b611a3c4f6fb2e55d56e76142af4dafa..0ef560904032fbe0d3eb8f2108d26282a85fc591 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.h +++ b/torch_npu/csrc/core/npu/NPUQueue.h @@ -25,6 +25,9 @@ enum RepoStatus { UCE_EXIT = 5, STOP_EXIT = 6, HBM_ECC_EXIT = 7, + SUSPECT_MEM_EXIT = 8, + HCCS_LINK_EXIT = 9, + HCCL_OP_RETRY_EXIT = 10, }; // c10::SmallVector max size @@ -113,6 +116,8 @@ private: bool IsReadWorking() const {return read_idx.working;}; bool WriteQueue(void* cur_paras); bool ReadQueue(); + void CheckDeviceError(int ret, std::string& err_msg); + void ThrowDeviceError(RepoStatus current_status, void* cur_paras); private: void* datas = nullptr; diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index e51f89b7532ad0eeb0bc9b56ca4c00b56ac88cd4..6874830ad803882dd87ef31264d3d52b8ec0570e 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -103,19 +103,44 @@ std::ostream& operator<<(std::ostream& stream, StreamIdType s) return stream; } +int GetStreamsPerPoolBits() +{ + const static int StreamsPerPoolBits = []() -> int { + if (c10_npu::option::OptionsManager::GetStreamsPerDevice() == 8) { + return 3; + } + return kStreamsPerPoolBits; + }(); + return StreamsPerPoolBits; +} + +int GetStreamsPerPool() +{ + const static int StreamsPerPool = []() -> int { + if (c10_npu::option::OptionsManager::GetStreamsPerDevice() == 8) { + return 8; + } + return kStreamsPerPool; + }(); + return StreamsPerPool; +} + static inline StreamIdType streamIdType(c10::StreamId s) { - return static_cast((uint32_t)s >> kStreamsPerPoolBits); + static int StreamsPerPoolBits = GetStreamsPerPoolBits(); + return static_cast((uint32_t)s >> StreamsPerPoolBits); } 
static inline size_t streamIdIndex(c10::StreamId s) { - return static_cast((uint32_t)s & ((1 << kStreamsPerPoolBits) - 1)); + static int StreamsPerPoolBits = GetStreamsPerPoolBits(); + return static_cast((uint32_t)s & ((1 << StreamsPerPoolBits) - 1)); } c10::StreamId makeStreamId(StreamIdType st, size_t si) { - return static_cast((static_cast(st) << kStreamsPerPoolBits) | si); + static int StreamsPerPoolBits = GetStreamsPerPoolBits(); + return static_cast((static_cast(st) << StreamsPerPoolBits) | si); } template @@ -172,7 +197,7 @@ static void initGlobalStreamState() default_streams[device_id].device_index = device_id; npu_counters[device_id] = 0; auto& default_streamsi = default_streams[device_id]; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&default_streamsi.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); if (c10_npu::option::OptionsManager::GetTaskQueueEnable()) { default_streamsi.repo->InitRepo(device_id); @@ -180,7 +205,7 @@ static void initGlobalStreamState() // Initializes secondary streams secondary_streams[device_id].device_index = device_id; auto &secondary_streamsi = secondary_streams[device_id]; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&secondary_streamsi.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); } @@ -189,12 +214,13 @@ static void initDeviceStreamState(c10::DeviceIndex device_index) // Switches to the requested device so streams are properly associated // with it. NPUGuard device_guard{device_index}; - for (auto i = decltype(kStreamsPerPool){0}; i < kStreamsPerPool; ++i) { + static int StreamsPerPool = GetStreamsPerPool(); + for (auto i = decltype(StreamsPerPool){0}; i < StreamsPerPool; ++i) { auto& npu_streami = npu_streams[device_index][i]; npu_streami.device_index = device_index; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&npu_streami.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); } } @@ -232,7 +258,8 @@ static inline void check_npu(c10::DeviceIndex device_index) static uint32_t get_idx(std::atomic& counter) { auto raw_idx = counter++; - return raw_idx % kStreamsPerPool; + static int StreamsPerPool = GetStreamsPerPool(); + return raw_idx % StreamsPerPool; } static uint32_t get_sync_launch_stream_idx(std::atomic& counter) @@ -523,8 +550,11 @@ void setCurrentNPUStream(NPUStream stream) initNPUStreamsOnce(); auto ptr = NPUStream_internals(stream); AT_ASSERT(ptr, PTA_ERROR(ErrCode::PTR)); - ASCEND_LOGI("Exchange NPU current stream from stream = %p to stream = %p", - current_streams[ptr->device_index]->stream, ptr->stream); + if (current_streams[ptr->device_index]->stream != ptr->stream) { + ASCEND_LOGI("Exchange NPU current stream from stream = %p to stream = %p", + current_streams[ptr->device_index]->stream, ptr->stream); + } + current_streams[ptr->device_index] = ptr; } @@ -533,6 +563,11 @@ std::ostream& operator<<(std::ostream& stream, const NPUStream& s) return stream << s.unwrap(); } +NPUStream::NPUStream(c10::Stream stream) : stream_(stream) +{ + TORCH_CHECK(stream_.device_type() == c10::DeviceType::PrivateUse1, PTA_ERROR(ErrCode::TYPE)); +} + void NPUStream::setDataPreprocessStream(bool is_data_preprocess_stream) { auto ptr = NPUStream_internals(getCurrentNPUStream()); @@ -580,18 +615,19 @@ void recovery_all_npu_streams(c10::DeviceIndex device_index) NPUGuard device_guard{device_index}; auto& default_streamsi = default_streams[device_index]; default_streamsi.stream = nullptr; - 
NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&default_streamsi.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); auto& secondary_streamsi = secondary_streams[device_index]; secondary_streamsi.stream = nullptr; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&secondary_streamsi.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); - for (auto i = decltype(kStreamsPerPool){0}; i < kStreamsPerPool; ++i) { + static int StreamsPerPool = GetStreamsPerPool(); + for (auto i = decltype(StreamsPerPool){0}; i < StreamsPerPool; ++i) { auto& npu_streami = npu_streams[device_index][i]; if (npu_streami.stream == nullptr) { continue; } - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&npu_streami.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); } } @@ -605,7 +641,7 @@ static void initDeviceSyncLaunchStream(c10::DeviceIndex device_index) sync_streami.device_index = device_index; sync_streami.is_sync_launch = true; - NPU_CHECK_SUPPORTED_OR_ERROR( + NPU_CHECK_ERROR( acl::AclrtCreateStreamWithConfig(&sync_streami.stream, 0, ACL_STREAM_FAST_SYNC)); } } diff --git a/torch_npu/csrc/core/npu/NPUStream.h b/torch_npu/csrc/core/npu/NPUStream.h index 561943ea314438d9825b5b4bfe91c8c2981ae84f..10714fdc8d3e3e622bc0dc1bdcb8d5b7e890d893 100644 --- a/torch_npu/csrc/core/npu/NPUStream.h +++ b/torch_npu/csrc/core/npu/NPUStream.h @@ -20,13 +20,8 @@ class C10_NPU_API NPUStream { public: enum Unchecked { UNCHECKED }; - explicit NPUStream(c10::Stream stream) : stream_(stream) - { - TORCH_CHECK(stream_.device_type() == c10::DeviceType::PrivateUse1, PTA_ERROR(ErrCode::TYPE)); - } - + explicit NPUStream(c10::Stream stream); explicit NPUStream(Unchecked, c10::Stream stream) : stream_(stream) {} - ~NPUStream() {} bool operator==(const NPUStream& other) const noexcept @@ -122,7 +117,7 @@ C10_NPU_API NPUStream getNPUStreamFromPool(c10::DeviceIndex device = -1); C10_NPU_API NPUStream getDefaultNPUStream(c10::DeviceIndex device_index = -1); -NPUStream getStreamFromPool(const bool isHighPriority, c10::DeviceIndex device_index); +C10_NPU_API NPUStream getStreamFromPool(const bool isHighPriority, c10::DeviceIndex device_index); C10_NPU_API NPUStream getCurrentNPUStream(c10::DeviceIndex device_index = -1); diff --git a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cf6dc33dfe8925bcfd37fd797a6bb9914f12f8b6 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.cpp @@ -0,0 +1,101 @@ +#include +#include + +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" +#include "torch_npu/csrc/core/npu/NPUFunctions.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" +#include "torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.h" + +size_t kAlignSize = 4096; // The first address must be aligned to page_size + +struct HostPtr { + void* ptr; + void* alignedPtr; +}; + +ska::flat_hash_map memBlocks; +bool initialized = false; + +// malloc host memopy +void* mallocHostMemory(size_t size) +{ + void* ptr = nullptr; + NPU_CHECK_ERROR(aclrtMallocHost(static_cast(&ptr), size + kAlignSize)); + return ptr; +} + +// register host memopy to device +void* registerSvmMem(void* ptr, size_t size) +{ + void *svmPtr = nullptr; + aclrtHostRegisterType regType = ACL_HOST_REGISTER_MAPPED; + uintptr_t aligned_ptr = (reinterpret_cast(ptr) + kAlignSize - 1) / 
kAlignSize * kAlignSize; + void* alignedPtr = reinterpret_cast(aligned_ptr); + if (c10_npu::acl::AclrtHostRegister(alignedPtr, size, regType, &svmPtr) != ACL_ERROR_NONE) { + NPU_CHECK_ERROR(aclrtFreeHost(ptr)); + TORCH_CHECK(false, "AclrtHostRegister failed.", PTA_ERROR(ErrCode::ACL)); + } + HostPtr hostPtr; + hostPtr.ptr = ptr; + hostPtr.alignedPtr = alignedPtr; + memBlocks.emplace(svmPtr, hostPtr); + return svmPtr; +} + +// malloc swap memopy +void* mallocHostSwapMemory(size_t size) +{ + if (!initialized) { + kAlignSize = sysconf(_SC_PAGESIZE); + initialized = true; + } + size = (size + kAlignSize - 1) & ~(kAlignSize - 1); + void *ptr = mallocHostMemory(size); + void *svmPtr = registerSvmMem(ptr, size); + return svmPtr; +} + +static void svm_deleter(void* ptr) +{ +} + +namespace c10_npu { +namespace NPUSwappedMemoryAllocator { + +class NpuSwappedMemoryAllocator : public c10::Allocator { +public: + c10::DataPtr allocate(size_t size) const override + { + int device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + + void* dev_ptr = mallocHostSwapMemory(size); + void (*delete_func)(void*) = &svm_deleter; + return {dev_ptr, dev_ptr, delete_func, c10::Device(c10::DeviceType::PrivateUse1, device)}; + } + + c10::DeleterFnPtr raw_deleter() const override + { + return &svm_deleter; + } +}; // class NpuSwappedMemoryAllocator + +NpuSwappedMemoryAllocator swapmemory_allocator; + +c10::Allocator* get() +{ + return &swapmemory_allocator; +} + +void emptyCache() +{ + for (auto it = memBlocks.begin(); it != memBlocks.end(); it++) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtHostUnregister(it->second.alignedPtr)); + NPU_CHECK_ERROR(aclrtFreeHost(it->second.ptr)); + } + memBlocks.clear(); +} + +} // namespace NPUSwappedMemoryAllocator +} // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.h b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..664ef46b5f8c7be471cf1788a282e3ec9dd7387c --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUSwappedMemoryAllocator.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +namespace c10_npu { +namespace NPUSwappedMemoryAllocator { + +c10::Allocator* get(); + +TORCH_NPU_API void emptyCache(); + +} // namespace NPUSwappedMemoryAllocator +} // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 24f43c77dedbe6db88ec8ce93a84f76c6c6b889a..9c0a4491ad67728fe5bd4ce96f477d85a11aa79d 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -9,7 +9,6 @@ #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" -#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/framework/utils/OpPreparation.h" #include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" #include "torch_npu/csrc/core/npu/NPUStream.h" @@ -41,18 +40,44 @@ constexpr size_t kRoundLarge = 2097152; // Alloceted memory is aligned to 2 MiB. 
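
A minimal illustrative sketch (not part of the patch): the allocators above lean on two round-up idioms. The swapped-memory allocator rounds sizes and host pointers up to the page size from sysconf(_SC_PAGESIZE) before AclrtHostRegister, and the workspace allocator rounds requests up to 2 MiB blocks via kRoundLarge. The sample page size below is an assumption; the arithmetic matches the expressions in the patch.

#include <cassert>
#include <cstddef>

// Round up to a power-of-two boundary (page alignment): align - 1 is an
// all-ones mask for the low bits, so masking clears the remainder.
size_t round_up_pow2(size_t size, size_t align)
{
    return (size + align - 1) & ~(align - 1);
}

// Round up to an arbitrary multiple, as done with kRoundLarge blocks.
size_t round_up_multiple(size_t size, size_t unit)
{
    return unit * ((size + unit - 1) / unit);
}

int main()
{
    constexpr size_t kPage = 4096;          // assumed page size for the example
    constexpr size_t kRoundLarge = 2097152; // 2 MiB, as in the workspace allocator
    assert(round_up_pow2(1, kPage) == kPage);
    assert(round_up_pow2(kPage, kPage) == kPage);
    assert(round_up_multiple(kRoundLarge + 1, kRoundLarge) == 2 * kRoundLarge);
    return 0;
}
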
struct WorkspaceBlock { void* data_ptr; size_t size; + bool allocated = 0; + int64_t requested_size = 0; + std::shared_ptr context_when_allocated = nullptr; WorkspaceBlock() : data_ptr(nullptr), size(0) {} }; +void update_stat(Stat &stat, int64_t amount) +{ + stat.current += amount; + stat.peak = std::max(stat.current, stat.peak); + if (amount > 0) { + stat.allocated += amount; + } + if (amount < 0) { + stat.freed += -amount; + } +} + class DeviceWorkspaceAllocator { public: DeviceWorkspaceAllocator() { blocks.clear(); + context_recorder_.store(nullptr); } + std::shared_ptr maybeGatherContext(RecordContext level) + { + if (record_context_ < level) { + return nullptr; + } + return context_recorder_.load()(); + } + void* malloc(size_t size, aclrtStream stream) { + auto context = maybeGatherContext(RecordContext::STATE); + size_t alloc_size = size + 32; auto it = blocks.find(stream); @@ -66,9 +91,12 @@ public: ASCEND_LOGI("NPUWorkspaceAllocator free by aclrtFree: size=%zu", block->size); NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeDeviceWithTimeout()); NPU_CHECK_ERROR(aclrtFree(block->data_ptr)); + update_stat(stats.reserved_bytes, -block->size); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block->data_ptr); + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block->data_ptr); + } record_mem_size_decrement(block->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { @@ -78,16 +106,18 @@ public: torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), reinterpret_cast(block->data_ptr), -block->size, - get_mem_size(), - 0, // reserved_bytes not used - 0, // active_bytes not used - reinterpret_cast(stream)} + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, + reinterpret_cast(stream)} ); #endif + block->data_ptr = nullptr; } block->size = kRoundLarge * ((alloc_size + kRoundLarge - 1) / kRoundLarge); @@ -102,25 +132,33 @@ public: if (err != ACL_ERROR_NONE) { return nullptr; } + block->context_when_allocated = std::move(context); + block->requested_size = size; ASCEND_LOGD("NPUWorkspaceAllocator malloc by AclrtMallocAlign32: size=%zu", block->size); + update_stat(stats.reserved_bytes, block->size); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{device, block->data_ptr, block->size}; - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{device, 
block->data_ptr, block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); + } record_mem_size_increment(block->size); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), reinterpret_cast(block->data_ptr), block->size, - get_mem_size(), - 0, // reserved_bytes not used - 0, // active_bytes not used + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, reinterpret_cast(stream)} ); + this->last_block = block; + this->last_stream = stream; const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { trigger->traceNpuMemoryAllocation( @@ -128,9 +166,51 @@ public: } #endif } + + allocated_size = block->size; + update_stat(stats.allocated_bytes, block->size); +#ifndef BUILD_LIBTORCH + torch_npu::profiler::reportMemoryDataToNpuProfiler({ + static_cast(c10::DeviceType::PrivateUse1), + device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), + reinterpret_cast(block->data_ptr), + block->size, + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, + reinterpret_cast(stream)} + ); + this->last_block = block; + this->last_stream = stream; +#endif return block->data_ptr; } + void free() + { + update_stat(stats.allocated_bytes, -allocated_size); +#ifndef BUILD_LIBTORCH + if (this->last_block && this->last_block->data_ptr && this->last_stream) { + torch_npu::profiler::reportMemoryDataToNpuProfiler({ + static_cast(c10::DeviceType::PrivateUse1), + device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), + reinterpret_cast(this->last_block->data_ptr), + -allocated_size, + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, + reinterpret_cast(this->last_stream)} + ); + } +#endif + } + // return to the system allocator void empty_cache(bool need_empty_queue, bool check_error) { @@ -152,9 +232,12 @@ public: if (block_pair.second->data_ptr != nullptr) { ASCEND_LOGI("NPUWorkspaceAllocator free by aclrtFree: size=%zu", block_pair.second->size); NPU_CHECK_ERROR(aclrtFree(block_pair.second->data_ptr)); + update_stat(stats.reserved_bytes, -block_pair.second->size); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block_pair.second->data_ptr); + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block_pair.second->data_ptr); + } record_mem_size_decrement(block_pair.second->size); const c10_npu::impl::PyCallbackTrigger* trigger = 
c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { @@ -164,13 +247,14 @@ public: torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), device, + static_cast(torch_npu::profiler::MemoryComponentType::WORKSPACE_ALLOCATOR), static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), reinterpret_cast(block_pair.second->data_ptr), -block_pair.second->size, - get_mem_size(), - 0, // reserved_bytes not used - 0, // active_bytes not used + stats.allocated_bytes.current, + stats.reserved_bytes.current, + stats.allocated_bytes.current, reinterpret_cast(block_pair.first)} ); #endif @@ -180,6 +264,78 @@ public: blocks.clear(); } + + void record_history(bool enabled, CreateContextFn context_recorder, RecordContext when) + { + TORCH_CHECK(when == RecordContext::NEVER || context_recorder, PTA_ERROR(ErrCode::INTERNAL)); + record_flag = enabled; + context_recorder_.store(record_flag ? context_recorder : nullptr); + record_context_ = enabled ? when : RecordContext::NEVER; + } + + std::vector get_trace() + { + std::vector alloc_trace; +#ifndef BUILD_LIBTORCH + if (!record_flag) { + return alloc_trace; + } + for (const auto& block_pair : blocks) { + auto te = TraceEntry(TraceEntry::WORKSPACE_SNAPSHOT, device, int64_t(block_pair.second->data_ptr), + block_pair.second->size, block_pair.first, + record_context_ >= RecordContext::ALLOC ? block_pair.second->context_when_allocated + : nullptr); + alloc_trace.emplace_back(te); + + te = TraceEntry(TraceEntry::SEGMENT_ALLOC, device, int64_t(block_pair.second->data_ptr), + block_pair.second->size, block_pair.first, + record_context_ >= RecordContext::ALLOC ? block_pair.second->context_when_allocated + : nullptr); + alloc_trace.emplace_back(te); + + te = TraceEntry(TraceEntry::ALLOC, device, int64_t(block_pair.second->data_ptr), block_pair.second->size, + block_pair.first, + record_context_ >= RecordContext::ALLOC ? 
block_pair.second->context_when_allocated + : nullptr); + alloc_trace.emplace_back(te); + } +#endif + return alloc_trace; + } + + std::vector get_segm() + { + std::vector result; +#ifndef BUILD_LIBTORCH + for (const auto& block_pair : blocks) { + result.emplace_back(); + SegmentInfo& segment_info = result.back(); + segment_info.device = device; + segment_info.address = reinterpret_cast(block_pair.second->data_ptr); + segment_info.stream = block_pair.first; + segment_info.is_large = true; + segment_info.is_expandable = false; + segment_info.context_when_allocated = block_pair.second->context_when_allocated; + + const WorkspaceBlock* block = block_pair.second; + segment_info.blocks.emplace_back(); + BlockInfo& block_info = segment_info.blocks.back(); + block_info.size = block->size; + block_info.requested_size = block->requested_size; + block_info.allocated = block->allocated; + block_info.active = block->allocated; + + segment_info.total_size += block_info.size; + if (block_info.allocated) { + segment_info.allocated_size += block_info.size; + segment_info.active_size += block_info.size; + segment_info.requested_size += block_info.requested_size; + } + block_info.context_when_allocated = block->context_when_allocated; + } +#endif + return result; + } #ifndef BUILD_LIBTORCH void set_device(int device_id) { @@ -201,12 +357,35 @@ public: return this->sum_mem; } #endif + + DeviceStats getStats() + { + return stats; + } + + void *getStreamPtr(aclrtStream stream) + { + auto it = blocks.find(stream); + if (it == blocks.end()) { + return nullptr; + } + WorkspaceBlock *block = it->second; + return block->data_ptr; + } + private: ska::flat_hash_map blocks; + bool record_flag = false; + std::atomic context_recorder_; + RecordContext record_context_ = RecordContext::NEVER; #ifndef BUILD_LIBTORCH uint64_t sum_mem = 0; int device = 0; + aclrtStream last_stream = nullptr; + WorkspaceBlock* last_block = nullptr; #endif + DeviceStats stats; + size_t allocated_size = 0; }; // class DeviceworkspaceAllocator static void uncached_delete(void* ptr) @@ -215,14 +394,30 @@ static void uncached_delete(void* ptr) NPU_CHECK_ERROR(aclrtFree(ptr)); } -// Now we will reuse the allocated memory and not release immediately until -// memory is insufficient for NpuCachingAllocator or NpuWorkspaceAllocator. -// Then both will empty cache and the large memory will be released. -static void local_raw_delete(void* ptr) -{ -} +static void local_raw_delete(void* ptr); class NpuWorkspaceAllocator : public c10::Allocator { +private: + // allocated blocks by device pointer + ska::flat_hash_map allocated_ptrs; + + void replace_allocated_ptr(void *new_ptr, void *src_ptr, int device) + { + auto it = allocated_ptrs.find(src_ptr); + if (it != allocated_ptrs.end()) { + allocated_ptrs.erase(it); + } + allocated_ptrs[new_ptr] = device; + } + + int get_allocated_device(void *ptr) + { + auto it = allocated_ptrs.find(ptr); + if (it == allocated_ptrs.end()) { + return -1; + } + return it->second; + } public: std::vector> device_allocator; @@ -242,6 +437,7 @@ public: void malloc(void** new_ptr, int device, size_t size, aclrtStream stream) { + auto src_ptr = static_cast(device_allocator[device]->getStreamPtr(stream)); *new_ptr = static_cast(device_allocator[device]->malloc(size, stream)); // Free all cached blocks and try again. 
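
A minimal illustrative sketch (not part of the patch): the allocated_bytes and reserved_bytes counters the workspace allocator now reports are maintained by update_stat(), which keeps a running current value, a high-water peak, and cumulative allocated/freed totals. The Stat layout below mirrors the fields used above; only the sample amounts are made up.

#include <algorithm>
#include <cassert>
#include <cstdint>

struct Stat {
    int64_t current = 0;
    int64_t peak = 0;
    int64_t allocated = 0;
    int64_t freed = 0;
};

void update_stat(Stat &stat, int64_t amount)
{
    stat.current += amount;
    stat.peak = std::max(stat.current, stat.peak);
    if (amount > 0) {
        stat.allocated += amount;
    }
    if (amount < 0) {
        stat.freed += -amount;
    }
}

int main()
{
    Stat allocated_bytes;
    update_stat(allocated_bytes, 2 << 20);    // malloc: current and peak rise together
    update_stat(allocated_bytes, -(2 << 20)); // free: current falls, peak is kept
    assert(allocated_bytes.current == 0 && allocated_bytes.peak == (2 << 20));
    assert(allocated_bytes.allocated == (2 << 20) && allocated_bytes.freed == (2 << 20));
    return 0;
}
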
@@ -266,11 +462,35 @@ public: " free)", PTA_ERROR(ErrCode::MEMORY)); } + + if ((*new_ptr) != src_ptr) { + replace_allocated_ptr(*new_ptr, src_ptr, device); + } } void empty_cache(int device, bool need_empty_queue, bool check_error) { device_allocator[device]->empty_cache(need_empty_queue, check_error); + allocated_ptrs.clear(); + } + + void record_history(bool enabled, CreateContextFn context_recorder, RecordContext when) + { + for (auto& allocator : device_allocator) { + allocator->record_history(enabled, context_recorder, when); + } + } + + SnapshotInfo snapshot() + { + SnapshotInfo result; + int count = static_cast(device_allocator.size()); + for (int i = 0; i < count; i++) { + result.device_traces.emplace_back(device_allocator[i]->get_trace()); + auto snap = device_allocator[i]->get_segm(); + result.segments.insert(result.segments.end(), snap.begin(), snap.end()); + } + return result; } c10::DataPtr allocate(size_t size) const override @@ -312,10 +532,42 @@ public: return &local_raw_delete; } } + + void assertValidDevice(int device) + { + const auto device_num = device_allocator.size(); + TORCH_CHECK(0 <= device && device < static_cast(device_num), "Invalid device argument ", device, + ": did you call init?", PTA_ERROR(ErrCode::PARAM)); + } + + void free(void* ptr) + { + if (!ptr) { + return; + } + int device = get_allocated_device(ptr); + if (device != -1) { + device_allocator[device]->free(); + } + } + + DeviceStats getDeviceStats(int device) + { + assertValidDevice(device); + return device_allocator[device]->getStats(); + } }; // class NpuWorkspaceAllocator NpuWorkspaceAllocator workspace_allocator; +// Now we will reuse the allocated memory and not release immediately until +// memory is insufficient for NpuCachingAllocator or NpuWorkspaceAllocator. +// Then both will empty cache and the large memory will be released. 
+static void local_raw_delete(void* ptr) +{ + workspace_allocator.free(ptr); +} + c10::Allocator* get() { return &workspace_allocator; @@ -338,5 +590,20 @@ void emptyCache(int device, bool need_empty_queue, bool check_error) workspace_allocator.empty_cache(device, need_empty_queue, check_error); } +void recordHistory(bool enabled, CreateContextFn context_recorder, RecordContext when) +{ + workspace_allocator.record_history(enabled, context_recorder, when); +} +SnapshotInfo snapshot() +{ + return workspace_allocator.snapshot(); +} + +DeviceStats getDeviceStats(int device) +{ + return workspace_allocator.getDeviceStats(device); +} + + } // namespace NPUWorkspaceAllocator } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h index 867a25c70324440d618c5610be3014017f366818..75c30236c3c7ea9ef21464345685d0524a424f83 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h @@ -2,14 +2,33 @@ #include #include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" namespace c10_npu { namespace NPUWorkspaceAllocator { +using c10_npu::NPUCachingAllocator::CreateContextFn; +using c10_npu::NPUCachingAllocator::BlockInfo; +using c10_npu::NPUCachingAllocator::SegmentInfo; +using c10_npu::NPUCachingAllocator::TraceEntry; +using c10_npu::NPUCachingAllocator::SnapshotInfo; +using c10_npu::NPUCachingAllocator::RecordContext; +using c10_npu::NPUCachingAllocator::Stat; + +struct DeviceStats { + // SUM: bytes requested by client code + Stat allocated_bytes; + // SUM: bytes reserved by this memory allocator (both free and used) + Stat reserved_bytes; +}; + c10::Allocator* get(); void init(); c10::DataPtr malloc_with_stream(size_t size, aclrtStream stream); void emptyCache(int device, bool need_empty_queue, bool check_error = true); +void recordHistory(bool enabled, CreateContextFn context_recorder, RecordContext when); +SnapshotInfo snapshot(); +DeviceStats getDeviceStats(int device); } // namespace NPUWorkspaceAllocator } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp index efda390216ee345edcd1c73c6647f84064761f5c..3fedb9d387ef61702a7414912b5572a8e187e7cd 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.cpp +++ b/torch_npu/csrc/core/npu/NpuVariables.cpp @@ -20,6 +20,8 @@ static std::map socVersionMap = { {"Ascend310P2", SocVersion::Ascend310P2}, {"Ascend310P3", SocVersion::Ascend310P3}, {"Ascend310P4", SocVersion::Ascend310P4}, + {"Ascend310P5", SocVersion::Ascend310P5}, + {"Ascend310P7", SocVersion::Ascend310P7}, {"Ascend910B1", SocVersion::Ascend910B1}, {"Ascend910B2", SocVersion::Ascend910B2}, {"Ascend910B2C", SocVersion::Ascend910B2C}, diff --git a/torch_npu/csrc/core/npu/NpuVariables.h b/torch_npu/csrc/core/npu/NpuVariables.h index f2575ee8cfba62a1ecc066650acec2f97574e8b2..3119a645153322225f9d0d9ea19dfa3b1ef9ab9f 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.h +++ b/torch_npu/csrc/core/npu/NpuVariables.h @@ -13,6 +13,8 @@ enum class SocVersion { Ascend310P2, Ascend310P3, Ascend310P4, + Ascend310P5, + Ascend310P7, Ascend910B1 = 220, Ascend910B2, Ascend910B2C, diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp index 910d22fe969ec8360a9afcff6bf82363d99e9225..961022b302139530aa0e436e715b03fb0ad35931 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp +++ 
b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.cpp @@ -1,15 +1,186 @@ #pragma GCC visibility push(default) #include #include "torch_npu/csrc/core/npu/impl/NPUGuardImpl.h" +#include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" +#include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" #include "torch_npu/csrc/core/NPUSerialization.h" +#ifndef BUILD_LIBTORCH +#include "torch_npu/csrc/sanitizer/NPUTrace.h" +#endif + namespace c10_npu { namespace impl { constexpr c10::DeviceType NPUGuardImpl::static_type; +NPUGuardImpl::NPUGuardImpl(c10::DeviceType t) +{ + TORCH_INTERNAL_ASSERT(t == c10::DeviceType::PrivateUse1, "DeviceType must be NPU. Actual DeviceType is: ", t, + PTA_ERROR(ErrCode::PARAM)); +} + +c10::Device NPUGuardImpl::exchangeDevice(c10::Device d) const +{ + TORCH_INTERNAL_ASSERT(d.type() == c10::DeviceType::PrivateUse1, + "DeviceType must be NPU. Actual DeviceType is: ", d.type(), PTA_ERROR(ErrCode::PARAM)); + c10::Device old_device = getDevice(); + if (old_device.index() != d.index()) { + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::SetDevice(d.index())); + } + return old_device; +} + +c10::Device NPUGuardImpl::getDevice() const +{ + int device = 0; + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device)); + return c10::Device(c10::DeviceType::PrivateUse1, device); +} + +void NPUGuardImpl::setDevice(c10::Device d) const +{ + TORCH_INTERNAL_ASSERT(d.type() == c10::DeviceType::PrivateUse1, + "DeviceType must be NPU. Actual DeviceType is: ", d.type(), PTA_ERROR(ErrCode::PARAM)); + NPU_CHECK_ERROR(c10_npu::SetDevice(d.index())); +} + +void NPUGuardImpl::uncheckedSetDevice(c10::Device d) const noexcept +{ + NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); +} + +c10::Stream NPUGuardImpl::getStream(c10::Device d) const noexcept +{ + return c10_npu::getCurrentNPUStream(d.index()).unwrap(); +} + +c10::Stream NPUGuardImpl::getDefaultStream(c10::Device d) const +{ + return c10_npu::getDefaultNPUStream(d.index()); +} + +c10::Stream NPUGuardImpl::getStreamFromGlobalPool(c10::Device d, bool isHighPriority) const +{ + return c10_npu::getStreamFromPool(isHighPriority, d.index()); +} + +c10::Stream NPUGuardImpl::exchangeStream(c10::Stream s) const noexcept +{ + NPUStream cs(s); + auto old_stream = c10_npu::getCurrentNPUStream(s.device().index()); + c10_npu::setCurrentNPUStream(cs); + return old_stream.unwrap(); +} + +c10::DeviceIndex NPUGuardImpl::deviceCount() const noexcept +{ + static c10::DeviceIndex count = c10_npu::device_count(); + return count; +} + +// Event-related functions +void NPUGuardImpl::createEvent(aclrtEvent *acl_event, const c10::EventFlag flag) const +{ + auto flag_ = c10_npu::acl::IsExistCreateEventExWithFlag() ? 
ACL_EVENT_SYNC : ACL_EVENT_DEFAULT; + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtCreateEventWithFlag(acl_event, flag_)); + ASCEND_LOGI("Event: aclrtCreateEventWithFlag is successfully executed, event=%p", *acl_event); +#ifndef BUILD_LIBTORCH + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuEventCreation(reinterpret_cast(*acl_event)); + } +#endif +} + +void NPUGuardImpl::destroyEvent(void *event, const c10::DeviceIndex device_index) const noexcept +{ + if (!event) { + return; + } + auto acl_event = static_cast(event); + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::queue::LaunchLazyDestroyEventTask(acl_event, device_index)); + ASCEND_LOGI("Event: aclrtDestroyEvent is successfully executed, event=%p", acl_event); +} + +void NPUGuardImpl::record(void **event, const c10::Stream &stream, const c10::DeviceIndex device_index, + const c10::EventFlag flag) const +{ + TORCH_CHECK(device_index == -1 || device_index == stream.device_index(), "Event device index ", device_index, + " does not match recording stream's device index ", stream.device_index(), ".", + PTA_ERROR(ErrCode::PARAM)); + + aclrtEvent npu_event = static_cast(*event); + NPUStream npu_stream{stream}; + + // Moves to stream's device to record + const auto orig_device = getDevice(); + setDevice(stream.device()); + + // Creates the event (lazily) + if (!npu_event) { + auto flag_ = c10_npu::acl::IsExistCreateEventExWithFlag() ? ACL_EVENT_SYNC : ACL_EVENT_DEFAULT; + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtCreateEventWithFlag(&npu_event, flag_)); + ASCEND_LOGI("Event: aclrtCreateEventWithFlag is successfully executed, event=%p", npu_event); +#ifndef BUILD_LIBTORCH + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuEventCreation(reinterpret_cast(npu_event)); + } +#endif + } + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::queue::LaunchRecordEventTask(npu_event, npu_stream)); + ASCEND_LOGI("Event: aclrtRecordEvent is successfully executed, stream=%p, event=%p", npu_stream.stream(false), + npu_event); + // Makes the void* point to the (possibly just allocated) NPU event + *event = npu_event; + + // Resets device + setDevice(orig_device); +} + +void NPUGuardImpl::block(void *event, const c10::Stream &stream) const +{ + if (!event) { + return; + } + aclrtEvent npu_event = static_cast(event); + NPUStream npu_stream{stream}; + const auto orig_device = getDevice(); + setDevice(stream.device()); + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::queue::LaunchWaitEventTask(npu_event, npu_stream)); + ASCEND_LOGI("Event: aclrtStreamWaitEvent is successfully executed, stream=%p, event=%p", + npu_stream.stream(false), npu_event); + setDevice(orig_device); +} + +// May be called from any device +bool NPUGuardImpl::queryEvent(void *event) const +{ + if (!event) { + return true; + } + aclrtEvent npu_event = static_cast(event); + if (c10_npu::option::OptionsManager::GetTaskQueueEnable() != 0 && + !c10_npu::NPUEventManager::GetInstance().IsEventRecorded(npu_event)) { + return false; + } + acl::aclrtEventRecordedStatus status = acl::ACL_EVENT_RECORDED_STATUS_NOT_READY; + NPU_CHECK_ERROR_WITHOUT_UCE(acl::AclQueryEventRecordedStatus(npu_event, &status)); + return (status == acl::ACL_EVENT_RECORDED_STATUS_COMPLETE); +} + +void NPUGuardImpl::recordDataPtrOnStream(const c10::DataPtr &data_ptr, const c10::Stream &stream) const +{ + NPUStream npu_stream{stream}; + 
c10_npu::NPUCachingAllocator::recordStream(data_ptr, npu_stream); +} + C10_REGISTER_GUARD_IMPL(PrivateUse1, NPUGuardImpl); #define REGISTER_PRIVATEUSE1_BACKEND(name) \ diff --git a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h index e7d26e272a29a81090d10108566807b8d3200ff1..c511f370ce3af1b484c0f101e3715c13c85e01ad 100644 --- a/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h +++ b/torch_npu/csrc/core/npu/impl/NPUGuardImpl.h @@ -4,187 +4,47 @@ #include #include -#include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" -#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" -#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUStream.h" -#include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" -#include -#include -#include -#ifndef BUILD_LIBTORCH -#include "torch_npu/csrc/sanitizer/NPUTrace.h" -#endif - +#include "third_party/acl/inc/acl/acl.h" +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" namespace c10_npu { namespace impl { -struct NPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { +struct C10_NPU_API NPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { static constexpr c10::DeviceType static_type = c10::DeviceType::PrivateUse1; NPUGuardImpl() {} - explicit NPUGuardImpl(c10::DeviceType t) - { - TORCH_INTERNAL_ASSERT(t == c10::DeviceType::PrivateUse1, "DeviceType must be NPU. Actual DeviceType is: ", t, - PTA_ERROR(ErrCode::PARAM)); - } + explicit NPUGuardImpl(c10::DeviceType t); c10::DeviceType type() const override { return c10::DeviceType::PrivateUse1; } - c10::Device exchangeDevice(c10::Device d) const override - { - TORCH_INTERNAL_ASSERT(d.type() == c10::DeviceType::PrivateUse1, - "DeviceType must be NPU. Actual DeviceType is: ", d.type(), PTA_ERROR(ErrCode::PARAM)); - c10::Device old_device = getDevice(); - if (old_device.index() != d.index()) { - NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::SetDevice(d.index())); - } - return old_device; - } - c10::Device getDevice() const override - { - int device = 0; - NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device)); - return c10::Device(c10::DeviceType::PrivateUse1, device); - } - void setDevice(c10::Device d) const override - { - TORCH_INTERNAL_ASSERT(d.type() == c10::DeviceType::PrivateUse1, - "DeviceType must be NPU. 
Actual DeviceType is: ", d.type(), PTA_ERROR(ErrCode::PARAM)); - uncheckedSetDevice(d); - } - void uncheckedSetDevice(c10::Device d) const noexcept override - { - SetBackwardThreadName(d.index()); - NPU_CHECK_WARN(c10_npu::SetDevice(d.index())); - } - c10::Stream getStream(c10::Device d) const noexcept override - { - return c10_npu::getCurrentNPUStream(d.index()).unwrap(); - } - c10::Stream getDefaultStream(c10::Device d) const override - { - return c10_npu::getDefaultNPUStream(d.index()); - } - c10::Stream getStreamFromGlobalPool(c10::Device d, bool isHighPriority = false) const override - { - return c10_npu::getStreamFromPool(isHighPriority, d.index()); - } - // NB: These do NOT set the current device - c10::Stream exchangeStream(c10::Stream s) const noexcept override - { - NPUStream cs(s); - auto old_stream = c10_npu::getCurrentNPUStream(s.device().index()); - c10_npu::setCurrentNPUStream(cs); - return old_stream.unwrap(); - } - c10::DeviceIndex deviceCount() const noexcept override - { - static c10::DeviceIndex count = c10_npu::device_count(); - return count; - } - // Event-related functions - void createEvent(aclrtEvent *acl_event, [[maybe_unused]] const c10::EventFlag flag) const - { - auto flag_ = c10_npu::acl::IsExistCreateEventExWithFlag() ? ACL_EVENT_SYNC : ACL_EVENT_DEFAULT; - NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtCreateEventWithFlag(acl_event, flag_)); - ASCEND_LOGI("Event: aclrtCreateEventWithFlag is successfully executed, event=%p", *acl_event); -#ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventCreation(reinterpret_cast(*acl_event)); - } -#endif - } + c10::Device exchangeDevice(c10::Device d) const override; + c10::Device getDevice() const override; + void setDevice(c10::Device d) const override; + void uncheckedSetDevice(c10::Device d) const noexcept override; - void destroyEvent(void *event, const c10::DeviceIndex device_index) const noexcept override - { - if (!event) { - return; - } - auto acl_event = static_cast(event); - NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::queue::LaunchLazyDestroyEventTask(acl_event, device_index)); - ASCEND_LOGI("Event: aclrtDestroyEvent is successfully executed, event=%p", acl_event); - } + c10::Stream getStream(c10::Device d) const noexcept override; + c10::Stream getDefaultStream(c10::Device d) const override; + c10::Stream getStreamFromGlobalPool(c10::Device d, bool isHighPriority = false) const override; + // NB: These do NOT set the current device + c10::Stream exchangeStream(c10::Stream s) const noexcept override; + c10::DeviceIndex deviceCount() const noexcept override; + // Event-related functions + void createEvent(aclrtEvent *acl_event, [[maybe_unused]] const c10::EventFlag flag) const; + void destroyEvent(void *event, const c10::DeviceIndex device_index) const noexcept override; void record(void **event, const c10::Stream &stream, const c10::DeviceIndex device_index, - const c10::EventFlag flag) const override - { - TORCH_CHECK(device_index == -1 || device_index == stream.device_index(), "Event device index ", device_index, - " does not match recording stream's device index ", stream.device_index(), ".", - PTA_ERROR(ErrCode::PARAM)); - - aclrtEvent npu_event = static_cast(*event); - NPUStream npu_stream{stream}; - - // Moves to stream's device to record - const auto orig_device = getDevice(); - setDevice(stream.device()); - - // Creates the event (lazily) - if (!npu_event) { - auto flag_ = 
c10_npu::acl::IsExistCreateEventExWithFlag() ? ACL_EVENT_SYNC : ACL_EVENT_DEFAULT; - NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtCreateEventWithFlag(&npu_event, flag_)); - ASCEND_LOGI("Event: aclrtCreateEventWithFlag is successfully executed, event=%p", npu_event); -#ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventCreation(reinterpret_cast(npu_event)); - } -#endif - } - NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::queue::LaunchRecordEventTask(npu_event, npu_stream)); - ASCEND_LOGI("Event: aclrtRecordEvent is successfully executed, stream=%p, event=%p", npu_stream.stream(false), - npu_event); - // Makes the void* point to the (possibly just allocated) NPU event - *event = npu_event; - - // Resets device - setDevice(orig_device); - } - - void block(void *event, const c10::Stream &stream) const override - { - if (!event) { - return; - } - aclrtEvent npu_event = static_cast(event); - NPUStream npu_stream{stream}; - const auto orig_device = getDevice(); - setDevice(stream.device()); - NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::queue::LaunchWaitEventTask(npu_event, npu_stream)); - ASCEND_LOGI("Event: aclrtStreamWaitEvent is successfully executed, stream=%p, event=%p", - npu_stream.stream(false), npu_event); - setDevice(orig_device); - } - + const c10::EventFlag flag) const; + void block(void *event, const c10::Stream &stream) const override; // May be called from any device - bool queryEvent(void *event) const override - { - if (!event) { - return true; - } - aclrtEvent npu_event = static_cast(event); - if (c10_npu::option::OptionsManager::GetTaskQueueEnable() != 0 && - !c10_npu::NPUEventManager::GetInstance().IsEventRecorded(npu_event)) { - return false; - } - acl::aclrtEventRecordedStatus status = acl::ACL_EVENT_RECORDED_STATUS_NOT_READY; - NPU_CHECK_ERROR_WITHOUT_UCE(acl::AclQueryEventRecordedStatus(npu_event, &status)); - return (status == acl::ACL_EVENT_RECORDED_STATUS_COMPLETE); - } - - void recordDataPtrOnStream(const c10::DataPtr &data_ptr, const c10::Stream &stream) const override - { - NPUStream npu_stream{stream}; - c10_npu::NPUCachingAllocator::recordStream(data_ptr, npu_stream); - } + bool queryEvent(void *event) const override; + void recordDataPtrOnStream(const c10::DataPtr &data_ptr, const c10::Stream &stream) const override; }; } // namespace impl diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 0d40ba05cf757a89c4598d66735edc73c9fafb32..b59e9c85c96e2998273953d3d068a3465bd0efde 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -69,13 +69,19 @@ LOAD_FUNCTION(aclrtPeekAtLastError) LOAD_FUNCTION(aclrtSynchronizeDevice) LOAD_FUNCTION(aclrtSynchronizeDeviceWithTimeout) LOAD_FUNCTION(aclrtEventGetTimestamp) -LOAD_FUNCTION(aclmdlCaptureBegin) -LOAD_FUNCTION(aclmdlCaptureGetInfo) -LOAD_FUNCTION(aclmdlCaptureEnd) -LOAD_FUNCTION(aclmdlDebugPrint) -LOAD_FUNCTION(aclmdlExecuteAsync) -LOAD_FUNCTION(aclmdlUnload) +LOAD_FUNCTION(aclmdlRICaptureBegin) +LOAD_FUNCTION(aclmdlRICaptureGetInfo) +LOAD_FUNCTION(aclmdlRICaptureEnd) +LOAD_FUNCTION(aclmdlRIDebugPrint) +LOAD_FUNCTION(aclmdlRIExecuteAsync) +LOAD_FUNCTION(aclmdlRIDestroy) LOAD_FUNCTION(aclsysGetCANNVersion) +LOAD_FUNCTION(aclrtHostRegister) +LOAD_FUNCTION(aclrtHostUnregister) +LOAD_FUNCTION(aclmdlRICaptureTaskGrpBegin) +LOAD_FUNCTION(aclmdlRICaptureTaskGrpEnd) 
+LOAD_FUNCTION(aclmdlRICaptureTaskUpdateBegin) +LOAD_FUNCTION(aclmdlRICaptureTaskUpdateEnd) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -202,13 +208,16 @@ aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag) // 2. There is no limit on the number of events. // 3. Only support query event record status, aclrtQueryEvent and aclrtQueryEventWaitStatus are not supported. // 4. aclrtDestroyEvent change to asynchronous destroy event. - static AclrtCreateEventWithFlagFunc func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventExWithFlag); - if (func == nullptr) { - TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtCreateEventExWithFlag"); - func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventWithFlag); + static AclrtCreateEventWithFlagFunc func_ex = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventExWithFlag); + if (func_ex == nullptr) { + TORCH_NPU_WARN_ONCE(func_ex, "Failed to find function ", "aclrtCreateEventExWithFlag"); } + static AclrtCreateEventWithFlagFunc func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventWithFlag); TORCH_CHECK(func, "Failed to find function ", "aclrtCreateEventWithFlag", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(event, flag); + if ((flag == ACL_EVENT_EXTERNAL) || (func_ex == nullptr)) { + return func(event, flag); + } + return func_ex(event, flag); } aclError AclQueryEventWaitStatus(aclrtEvent event, aclrtEventWaitStatus *waitStatus) @@ -742,67 +751,65 @@ aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t *timestamp) return func(event, timestamp); } -aclError AclmdlCaptureBegin(aclrtStream stream, aclmdlCaptureMode mode) +aclError AclmdlRICaptureBegin(aclrtStream stream, aclmdlRICaptureMode mode) { - typedef aclError (*AclmdlCaptureBegin)(aclrtStream, aclmdlCaptureMode); - static AclmdlCaptureBegin func = nullptr; + typedef aclError (*AclmdlRICaptureBegin)(aclrtStream, aclmdlRICaptureMode); + static AclmdlRICaptureBegin func = nullptr; if (func == nullptr) { - func = (AclmdlCaptureBegin) GET_FUNC(aclmdlCaptureBegin); + func = (AclmdlRICaptureBegin) GET_FUNC(aclmdlRICaptureBegin); } - TORCH_CHECK(func, "Failed to find function aclmdlCaptureBegin", PTA_ERROR(ErrCode::NOT_FOUND)); + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureBegin", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream, mode); } -aclError AclmdlCaptureGetInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId) +aclError AclmdlRICaptureGetInfo(aclrtStream stream, aclmdlRICaptureStatus *status, aclmdlRI *modelRI) { - typedef aclError (*AclmdlCaptureGetInfo)(aclrtStream, aclmdlCaptureStatus *, uint32_t *); - static AclmdlCaptureGetInfo func = nullptr; + typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); + static AclmdlRICaptureGetInfo func = nullptr; if (func == nullptr) { - func = (AclmdlCaptureGetInfo) GET_FUNC(aclmdlCaptureGetInfo); + func = (AclmdlRICaptureGetInfo) GET_FUNC(aclmdlRICaptureGetInfo); } - TORCH_CHECK(func, "Failed to find function aclmdlCaptureGetInfo", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(stream, status, modelId); + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureGetInfo", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, status, modelRI); } -aclError AclmdlCaptureEnd(aclrtStream stream, uint32_t *modelId) +aclError AclmdlRICaptureEnd(aclrtStream stream, aclmdlRI *modelRI) { - typedef aclError (*AclmdlCaptureEnd)(aclrtStream, uint32_t *); - static AclmdlCaptureEnd func = nullptr; + typedef 
aclError (*AclmdlRICaptureEnd)(aclrtStream, aclmdlRI *); + static AclmdlRICaptureEnd func = nullptr; if (func == nullptr) { - func = (AclmdlCaptureEnd) GET_FUNC(aclmdlCaptureEnd); + func = (AclmdlRICaptureEnd) GET_FUNC(aclmdlRICaptureEnd); } - TORCH_CHECK(func, "Failed to find function aclmdlCaptureEnd", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(stream, modelId); + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureEnd", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, modelRI); } -aclError AclmdlDebugPrint(uint32_t modelId) +aclError AclmdlRIDebugPrint(aclmdlRI modelRI) { - typedef aclError (*AclmdlDebugPrint)(uint32_t); - static AclmdlDebugPrint func = nullptr; + typedef aclError (*AclmdlRIDebugPrint)(aclmdlRI); + static AclmdlRIDebugPrint func = nullptr; if (func == nullptr) { - func = (AclmdlDebugPrint) GET_FUNC(aclmdlDebugPrint); + func = (AclmdlRIDebugPrint) GET_FUNC(aclmdlRIDebugPrint); } - TORCH_CHECK(func, "Failed to find function aclmdlDebugPrint", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(modelId); + TORCH_CHECK(func, "Failed to find function aclmdlRIDebugPrint", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(modelRI); } -aclError AclmdlExecuteAsync(uint32_t modelId, aclrtStream stream) +aclError AclmdlRIExecuteAsync(aclmdlRI modelRI, aclrtStream stream) { - typedef aclError (*AclmdlExecuteAsync)(uint32_t, const aclmdlDataset *, aclmdlDataset *, aclrtStream); - static AclmdlExecuteAsync func = nullptr; + typedef aclError (*AclmdlRIExecuteAsync)(aclmdlRI, aclrtStream); + static AclmdlRIExecuteAsync func = nullptr; if (func == nullptr) { - func = (AclmdlExecuteAsync) GET_FUNC(aclmdlExecuteAsync); + func = (AclmdlRIExecuteAsync) GET_FUNC(aclmdlRIExecuteAsync); } - TORCH_CHECK(func, "Failed to find function aclmdlExecuteAsync", PTA_ERROR(ErrCode::NOT_FOUND)); + TORCH_CHECK(func, "Failed to find function aclmdlRIExecuteAsync", PTA_ERROR(ErrCode::NOT_FOUND)); - static aclmdlDataset *inputs = aclmdlCreateDataset(); - static aclmdlDataset *outputs = aclmdlCreateDataset(); - return func(modelId, inputs, outputs, stream); + return func(modelRI, stream); } aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *version) @@ -819,16 +826,16 @@ aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *ve return func(name, version); } -aclError AclmdlUnload(uint32_t modelId) +aclError AclmdlRIDestroy(aclmdlRI modelRI) { - typedef aclError (*AclmdlUnload)(uint32_t); - static AclmdlUnload func = nullptr; + typedef aclError (*AclmdlRIDestroy)(aclmdlRI); + static AclmdlRIDestroy func = nullptr; if (func == nullptr) { - func = (AclmdlUnload) GET_FUNC(aclmdlUnload); + func = (AclmdlRIDestroy) GET_FUNC(aclmdlRIDestroy); } - TORCH_CHECK(func, "Failed to find function aclmdlUnload", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(modelId); + TORCH_CHECK(func, "Failed to find function aclmdlRIDestroy", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(modelRI); } bool IsCaptureSupported() @@ -840,13 +847,85 @@ bool IsCaptureSupported() (GetSocVersion() >= SocVersion::Ascend910_9391); if (default_support_capture && !have_load_func) { have_load_func = true; - typedef aclError (*AclmdlCaptureGetInfo)(aclrtStream, aclmdlCaptureStatus *, uint32_t *); - static AclmdlCaptureGetInfo func = (AclmdlCaptureGetInfo) GET_FUNC(aclmdlCaptureGetInfo); + typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); + static AclmdlRICaptureGetInfo func = (AclmdlRICaptureGetInfo) GET_FUNC(aclmdlRICaptureGetInfo); is_support = (func != 
nullptr); } return is_support; } +aclError AclrtHostRegister(void *ptr, uint64_t size, aclrtHostRegisterType type, void **devPtr) +{ + typedef aclError (*AclrtHostRegister)(void *, uint64_t, aclrtHostRegisterType, void **); + static AclrtHostRegister func = nullptr; + if (func == nullptr) { + func = (AclrtHostRegister) GET_FUNC(aclrtHostRegister); + } + + TORCH_CHECK(func, "Failed to find function aclrtHostRegister", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(ptr, size, type, devPtr); +} + +aclError AclrtHostUnregister(void *ptr) +{ + typedef aclError (*AclrtHostUnregister)(void *); + static AclrtHostUnregister func = nullptr; + if (func == nullptr) { + func = (AclrtHostUnregister) GET_FUNC(aclrtHostUnregister); + } + + TORCH_CHECK(func, "Failed to find function aclrtHostUnregister", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(ptr); +} + +aclError AclmdlRICaptureTaskGrpBegin(aclrtStream stream) +{ + typedef aclError (*AclmdlRICaptureTaskGrpBegin)(aclrtStream); + static AclmdlRICaptureTaskGrpBegin func = nullptr; + if (func == nullptr) { + func = (AclmdlRICaptureTaskGrpBegin) GET_FUNC(aclmdlRICaptureTaskGrpBegin); + } + + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskGrpBegin", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream); +} + +aclError AclmdlRICaptureTaskGrpEnd(aclrtStream stream, aclrtTaskGrp *handle) +{ + typedef aclError (*AclmdlRICaptureTaskGrpEnd)(aclrtStream, aclrtTaskGrp*); + static AclmdlRICaptureTaskGrpEnd func = nullptr; + if (func == nullptr) { + func = (AclmdlRICaptureTaskGrpEnd) GET_FUNC(aclmdlRICaptureTaskGrpEnd); + } + + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskGrpEnd", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, handle); +} + +aclError AclmdlRICaptureTaskUpdateBegin(aclrtStream stream, aclrtTaskGrp handle) +{ + typedef aclError (*AclmdlRICaptureTaskUpdateBegin)(aclrtStream, aclrtTaskGrp); + static AclmdlRICaptureTaskUpdateBegin func = nullptr; + if (func == nullptr) { + func = (AclmdlRICaptureTaskUpdateBegin) GET_FUNC(aclmdlRICaptureTaskUpdateBegin); + } + + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskUpdateBegin", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, handle); +} + +aclError AclmdlRICaptureTaskUpdateEnd(aclrtStream stream) +{ + typedef aclError (*AclmdlRICaptureTaskUpdateEnd)(aclrtStream); + static AclmdlRICaptureTaskUpdateEnd func = nullptr; + if (func == nullptr) { + func = (AclmdlRICaptureTaskUpdateEnd) GET_FUNC(aclmdlRICaptureTaskUpdateEnd); + } + + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskUpdateEnd", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index fcefb7de71e3fbff981820c7a12e3e7c513697a1..9cdad2663bd438107c409c0d0afe542193db6a75 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -183,21 +183,50 @@ aclError AclrtSynchronizeDeviceWithTimeout(void); aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t *timestamp); -aclError AclmdlCaptureBegin(aclrtStream stream, aclmdlCaptureMode mode); +aclError AclmdlRICaptureBegin(aclrtStream stream, aclmdlRICaptureMode mode); -aclError AclmdlCaptureGetInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId); +aclError AclmdlRICaptureGetInfo(aclrtStream stream, aclmdlRICaptureStatus *status, aclmdlRI *modelRI); -aclError AclmdlCaptureEnd(aclrtStream stream, uint32_t 
*modelId); +aclError AclmdlRICaptureEnd(aclrtStream stream, aclmdlRI *modelRI); -aclError AclmdlDebugPrint(uint32_t modelId); +aclError AclmdlRIDebugPrint(aclmdlRI modelRI); -aclError AclmdlExecuteAsync(uint32_t modelId, aclrtStream stream); +aclError AclmdlRIExecuteAsync(aclmdlRI modelRI, aclrtStream stream); aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *version); -aclError AclmdlUnload(uint32_t modelId); +aclError AclmdlRIDestroy(aclmdlRI modelRI); bool IsCaptureSupported(); +/** + * @ingroup AscendCL + * @brief register host memory + * @param ptr [IN] memory pointer + * @param size [IN] memory size + * @param type [IN] memory register size + * @param devPtr [OUT] pointer to allocated memory on device + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +aclError AclrtHostRegister(void *ptr, uint64_t size, aclrtHostRegisterType type, void **devPtr); + +/** + * @ingroup AscendCL + * @brief unregister host memory + * @param ptr [IN] memory pointer + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +aclError AclrtHostUnregister(void *ptr); + +aclError AclmdlRICaptureTaskGrpBegin(aclrtStream stream); + +aclError AclmdlRICaptureTaskGrpEnd(aclrtStream stream, aclrtTaskGrp *handle); + +aclError AclmdlRICaptureTaskUpdateBegin(aclrtStream stream, aclrtTaskGrp handle); + +aclError AclmdlRICaptureTaskUpdateEnd(aclrtStream stream); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp index e4f3051141e0908b6c39baf9d2922f01a5208521..3b4827ed3b5eff1add3a5f24dd08a0626a96a066 100644 --- a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp @@ -11,19 +11,20 @@ #endif namespace c10_npu { namespace queue { -std::atomic QueueParas::g_correlation_id{0}; +std::atomic QueueParas::g_correlation_id{ 0 }; std::map CopyParas::COPY_PARAS_MAP{ - {ACL_MEMCPY_HOST_TO_HOST, "acl_memcpy_host_to_host"}, - {ACL_MEMCPY_HOST_TO_DEVICE, "acl_memcpy_host_to_device"}, - {ACL_MEMCPY_DEVICE_TO_HOST, "acl_memcpy_device_to_host"}, - {ACL_MEMCPY_DEVICE_TO_DEVICE, "acl_memcpy_device_to_device"}, + { ACL_MEMCPY_HOST_TO_HOST, "acl_memcpy_host_to_host" }, + { ACL_MEMCPY_HOST_TO_DEVICE, "acl_memcpy_host_to_device" }, + { ACL_MEMCPY_DEVICE_TO_HOST, "acl_memcpy_device_to_host" }, + { ACL_MEMCPY_DEVICE_TO_DEVICE, "acl_memcpy_device_to_device" }, }; std::map EventParas::EVENT_PARAS_MAP{ - {RECORD_EVENT, "record_event"}, - {WAIT_EVENT, "wait_event"}, - {LAZY_DESTROY_EVENT, "destroy_event"}, + { RECORD_EVENT, "record_event" }, + { WAIT_EVENT, "wait_event" }, + { LAZY_DESTROY_EVENT, "destroy_event" }, }; -void CopyParas::Copy(CopyParas& other) { +void CopyParas::Copy(CopyParas &other) +{ this->dst = other.dst; this->dstLen = other.dstLen; this->src = other.src; @@ -31,19 +32,15 @@ void CopyParas::Copy(CopyParas& other) { this->kind = other.kind; } -void EventParas::Copy(EventParas& other) { +void EventParas::Copy(EventParas &other) +{ this->event = other.event; this->eventAllocatorType = other.eventAllocatorType; } class AsyncCopyTask { public: - AsyncCopyTask( - void* dst, - size_t dstLen, - void* src, - size_t srcLen, - aclrtMemcpyKind kind); + AsyncCopyTask(void *dst, size_t dstLen, void *src, size_t srcLen, aclrtMemcpyKind kind); ~AsyncCopyTask() = default; void LaunchCopyTask(); @@ -53,13 +50,10 @@ private: 
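
The header changes above replace the old `uint32_t` model-id based capture API with opaque run-time-instance (`aclmdlRI`) handles, and add host-memory registration plus task-group capture entry points. A minimal capture-and-replay sketch against these wrappers might look like the following; the capture mode value, the enqueued work, and the error handling are illustrative assumptions, not part of this patch:

```cpp
#include "torch_npu/csrc/core/npu/interface/AclInterface.h"

// Illustrative only: capture the work submitted to `stream` into an aclmdlRI
// handle, replay it asynchronously, then destroy the handle. `mode` is any
// valid aclmdlRICaptureMode supplied by the caller.
aclError CaptureAndReplay(aclrtStream stream, aclmdlRICaptureMode mode)
{
    aclmdlRI model_ri{};

    aclError ret = c10_npu::acl::AclmdlRICaptureBegin(stream, mode);
    if (ret != ACL_ERROR_NONE) {
        return ret;
    }

    // ... launch the kernels / async copies that should be captured ...

    ret = c10_npu::acl::AclmdlRICaptureEnd(stream, &model_ri);
    if (ret != ACL_ERROR_NONE) {
        return ret;
    }

    // Replay the captured sequence; unlike the removed aclmdlExecuteAsync
    // path, no placeholder aclmdlDataset inputs/outputs are needed.
    ret = c10_npu::acl::AclmdlRIExecuteAsync(model_ri, stream);
    if (ret != ACL_ERROR_NONE) {
        return ret;
    }

    // Release the run-time instance once it is no longer needed.
    return c10_npu::acl::AclmdlRIDestroy(model_ri);
}
```
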
class EventTask { public: - explicit EventTask( - aclrtEvent event, - EventAllocatorType allocatorType = RESERVED) + explicit EventTask(aclrtEvent event, EventAllocatorType allocatorType = RESERVED) : eventParam_(event, allocatorType){}; ~EventTask() = default; - void LaunchRecordTask( - c10_npu::NPUStream npuStream); + void LaunchRecordTask(c10_npu::NPUStream npuStream); void LaunchWaitTask(c10_npu::NPUStream npuStream); void LaunchLazyDestroyTask(c10::DeviceIndex device_index); @@ -67,12 +61,8 @@ private: EventParas eventParam_; }; -AsyncCopyTask::AsyncCopyTask( - void* dst, - size_t dstLen, - void* src, - size_t srcLen, - aclrtMemcpyKind kind) { +AsyncCopyTask::AsyncCopyTask(void *dst, size_t dstLen, void *src, size_t srcLen, aclrtMemcpyKind kind) +{ copyParam_.dst = dst; copyParam_.dstLen = dstLen; copyParam_.src = src; @@ -80,7 +70,8 @@ AsyncCopyTask::AsyncCopyTask( copyParam_.kind = kind; } -void AsyncCopyTask::LaunchCopyTask() { +void AsyncCopyTask::LaunchCopyTask() +{ RECORD_FUNCTION(CopyParas::COPY_PARAS_MAP[copyParam_.kind], std::vector({})); auto cur_stream = c10_npu::getCurrentNPUStream(); if (!cur_stream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { @@ -90,108 +81,106 @@ void AsyncCopyTask::LaunchCopyTask() { QueueParas params(ASYNC_MEMCPY, sizeof(CopyParas), ©Param_); c10_npu::enCurrentNPUStream(¶ms); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, CopyParas::COPY_PARAS_MAP[copyParam_.kind], params.correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, CopyParas::COPY_PARAS_MAP[copyParam_.kind], + params.correlation_id); #endif } else { c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream(); - NPU_CHECK_ERROR(aclrtMemcpyAsync( - copyParam_.dst, - copyParam_.dstLen, - copyParam_.src, - copyParam_.srcLen, - copyParam_.kind, - stream)); + NPU_CHECK_ERROR(aclrtMemcpyAsync(copyParam_.dst, copyParam_.dstLen, copyParam_.src, copyParam_.srcLen, + copyParam_.kind, stream)); } } -aclError LaunchAsyncCopyTask( - void* dst, - size_t dstLen, - void* src, - size_t srcLen, - aclrtMemcpyKind kind) { +aclError LaunchAsyncCopyTask(void *dst, size_t dstLen, void *src, size_t srcLen, aclrtMemcpyKind kind) +{ AsyncCopyTask copyTask(dst, dstLen, src, srcLen, kind); copyTask.LaunchCopyTask(); return ACL_ERROR_NONE; } -void EventTask::LaunchRecordTask(c10_npu::NPUStream npuStream) { +void EventTask::LaunchRecordTask(c10_npu::NPUStream npuStream) +{ RECORD_FUNCTION(EventParas::EVENT_PARAS_MAP[RECORD_EVENT], std::vector({})); if (!npuStream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { #ifndef BUILD_LIBTORCH at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[RECORD_EVENT]); #endif - uint64_t prof_correlation_id = 0; - { - c10_npu::NPUStreamGuard guard(npuStream); - QueueParas params(RECORD_EVENT, sizeof(EventParas), &eventParam_); - c10_npu::NPUEventManager::GetInstance().IncreaseUnrecordedCount(eventParam_.event); - c10_npu::enCurrentNPUStream(¶ms); - prof_correlation_id = params.correlation_id; - } - ASCEND_LOGI("Event: LaunchRecordTask is successfully executed, event=%p", eventParam_.event); + uint64_t prof_correlation_id = 0; + { + c10_npu::NPUStreamGuard guard(npuStream); + QueueParas params(RECORD_EVENT, sizeof(EventParas), &eventParam_); + c10_npu::NPUEventManager::GetInstance().IncreaseUnrecordedCount(eventParam_.event); + c10_npu::enCurrentNPUStream(¶ms); + prof_correlation_id = params.correlation_id; + } + 
ASCEND_LOGD("Event: LaunchRecordTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[RECORD_EVENT], prof_correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[RECORD_EVENT], + prof_correlation_id); #endif } else { NPU_CHECK_ERROR(aclrtRecordEvent(eventParam_.event, npuStream)); - ASCEND_LOGI("Event: aclrtRecordEvent is successfully executed, stream=%p, event=%p", npuStream.stream(false), eventParam_.event); + ASCEND_LOGI("Event: aclrtRecordEvent is successfully executed, stream=%p, event=%p", npuStream.stream(false), + eventParam_.event); } } -aclError LaunchRecordEventTask(aclrtEvent event, c10_npu::NPUStream npuStream) { +aclError LaunchRecordEventTask(aclrtEvent event, c10_npu::NPUStream npuStream) +{ EventTask recordTask(event); recordTask.LaunchRecordTask(npuStream); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventRecord( - reinterpret_cast(event), - reinterpret_cast(npuStream.stream(false)) - ); + trigger->traceNpuEventRecord(reinterpret_cast(event), + reinterpret_cast(npuStream.stream(false))); } #endif return ACL_ERROR_NONE; } -void EventTask::LaunchWaitTask(c10_npu::NPUStream npuStream) { +void EventTask::LaunchWaitTask(c10_npu::NPUStream npuStream) +{ RECORD_FUNCTION(EventParas::EVENT_PARAS_MAP[WAIT_EVENT], std::vector({})); if (!npuStream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { #ifndef BUILD_LIBTORCH at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[WAIT_EVENT]); #endif - uint64_t prof_correlation_id = 0; - { - c10_npu::NPUStreamGuard guard(npuStream); - QueueParas params(WAIT_EVENT, sizeof(EventParas), &eventParam_); - c10_npu::enCurrentNPUStream(¶ms); - prof_correlation_id = params.correlation_id; - } - ASCEND_LOGI("Event: LaunchWaitTask is successfully executed, event=%p", eventParam_.event); + uint64_t prof_correlation_id = 0; + { + c10_npu::NPUStreamGuard guard(npuStream); + QueueParas params(WAIT_EVENT, sizeof(EventParas), &eventParam_); + c10_npu::enCurrentNPUStream(¶ms); + prof_correlation_id = params.correlation_id; + } + ASCEND_LOGI("Event: LaunchWaitTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[WAIT_EVENT], prof_correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[WAIT_EVENT], + prof_correlation_id); #endif } else { NPU_CHECK_ERROR(aclrtStreamWaitEvent(npuStream, eventParam_.event)); - ASCEND_LOGI("Event: aclrtStreamWaitEvent is successfully executed, stream=%p, event=%p", npuStream.stream(false), eventParam_.event); + ASCEND_LOGI("Event: aclrtStreamWaitEvent is successfully executed, stream=%p, event=%p", + npuStream.stream(false), eventParam_.event); } } -aclError LaunchWaitEventTask(aclrtEvent event, c10_npu::NPUStream npuStream) { +aclError LaunchWaitEventTask(aclrtEvent event, c10_npu::NPUStream npuStream) +{ EventTask waitTask(event); waitTask.LaunchWaitTask(npuStream); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); + const c10_npu::impl::PyCallbackTrigger 
*trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventWait( - reinterpret_cast(event), + trigger->traceNpuEventWait(reinterpret_cast(event), reinterpret_cast(npuStream.stream(false))); } #endif return ACL_ERROR_NONE; } -void EventTask::LaunchLazyDestroyTask(c10::DeviceIndex device_index) { +void EventTask::LaunchLazyDestroyTask(c10::DeviceIndex device_index) +{ RECORD_FUNCTION(EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT], std::vector({})); auto cur_stream = c10_npu::getCurrentNPUStream(); if (!cur_stream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { @@ -200,19 +189,21 @@ void EventTask::LaunchLazyDestroyTask(c10::DeviceIndex device_index) { #endif QueueParas params(LAZY_DESTROY_EVENT, sizeof(EventParas), &eventParam_); c10_npu::enCurrentNPUStream(¶ms, device_index); - ASCEND_LOGI("Event: LaunchLazyDestroyTask is successfully executed, event=%p", eventParam_.event); + ASCEND_LOGD("Event: LaunchLazyDestroyTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT], params.correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT], + params.correlation_id); #endif } else { NPU_CHECK_ERROR(c10_npu::NPUEventManager::GetInstance().LazyDestroy(eventParam_.event), "aclrtDestroyEvent"); } } -aclError LaunchLazyDestroyEventTask(aclrtEvent event, c10::DeviceIndex device_index) { +aclError LaunchLazyDestroyEventTask(aclrtEvent event, c10::DeviceIndex device_index) +{ EventTask lazyDestroyTask(event); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { trigger->traceNpuEventDeletion(reinterpret_cast(event)); } diff --git a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h index 860c5e63f5505b6f60e33fee83a3b749196f9420..174a629fad01e7230b1f522971e56547c4925b30 100644 --- a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h +++ b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h @@ -40,6 +40,7 @@ enum QueueParamType { LAZY_DESTROY_EVENT = 5, RESET_EVENT = 6, EXECUTE_OPAPI = 7, + EXECUTE_OPAPI_V2 = 8, }; struct QueueParas { diff --git a/torch_npu/csrc/core/npu/interface/DcmiInterface.cpp b/torch_npu/csrc/core/npu/interface/DcmiInterface.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ab66dae274034ede2a8e325a5953fff9d4fd8193 --- /dev/null +++ b/torch_npu/csrc/core/npu/interface/DcmiInterface.cpp @@ -0,0 +1,72 @@ +#include +#include "torch_npu/csrc/core/npu/register/FunctionLoader.h" +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/interface/DcmiInterface.h" + +namespace c10_npu { +namespace dcmi { + +#undef LOAD_FUNCTION +#define LOAD_FUNCTION(funcName) \ + REGISTER_FUNCTION(libdcmi, funcName) +#undef GET_FUNC +#define GET_FUNC(funcName) \ + GET_FUNCTION(libdcmi, funcName) + +REGISTER_LIBRARY(libdcmi) +LOAD_FUNCTION(dcmi_get_affinity_cpu_info_by_device_id) +LOAD_FUNCTION(dcmi_init) +LOAD_FUNCTION(dcmi_get_device_id_in_card) +LOAD_FUNCTION(dcmi_get_card_num_list) + +int DcmiInit(void) +{ + using dcmiInitFunc = int(*)(void); + static dcmiInitFunc func = nullptr; + func = 
(dcmiInitFunc)GET_FUNC(dcmi_init); + if (func == nullptr) { + TORCH_CHECK(false, "Failed to find function dcmi_init, " + " maybe your hdk version is too low, please upgrade it.", PTA_ERROR(ErrCode::NOT_FOUND)) + } + return func(); +} + +int DcmiGetCardNumList(int *card_num, int *card_list, int list_len) +{ + using dcmiGetCardNumListFunc = int(*)(int *, int *, int); + static dcmiGetCardNumListFunc func = nullptr; + func = (dcmiGetCardNumListFunc)GET_FUNC(dcmi_get_card_num_list); + if (func == nullptr) { + TORCH_CHECK(false, "Failed to find function dcmi_get_card_num_list, " + " maybe your hdk version is too low, please upgrade it.", PTA_ERROR(ErrCode::NOT_FOUND)) + } + return func(card_num, card_list, list_len); +} + +int DcmiGetAffinityCpuInfoByDeviceId(int card_id, int device_id, char *affinity_cpu, int *length) +{ + using dcmiGetAffinityCpuInfoByDeviceIdFunc = int(*)(int, int, char *, int *); + static dcmiGetAffinityCpuInfoByDeviceIdFunc func = nullptr; + func = (dcmiGetAffinityCpuInfoByDeviceIdFunc)GET_FUNC(dcmi_get_affinity_cpu_info_by_device_id); + if (func == nullptr) { + TORCH_CHECK(false, "Failed to find function dcmi_get_affinity_cpu_info_by_device_id, " + " maybe your hdk version is too low, please upgrade it", PTA_ERROR(ErrCode::NOT_FOUND)); + } + return func(card_id, device_id, affinity_cpu, length); +} + +int DcmiGetDeviceIdInCard(int card_id, int *device_id_max, int *mcu_id, int *cpu_id) +{ + using dcmiGetDeviceIdInCardFunc = int(*)(int, int *, int *, int *); + static dcmiGetDeviceIdInCardFunc func = nullptr; + func = (dcmiGetDeviceIdInCardFunc)GET_FUNC(dcmi_get_device_id_in_card); + if (func == nullptr) { + TORCH_CHECK(false, "Failed to find function dcmi_get_device_id_in_card, " + " maybe your hdk version is too low, please upgrade it", PTA_ERROR(ErrCode::NOT_FOUND)) + } + return func(card_id, device_id_max, mcu_id, cpu_id); +} + +} + +} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/interface/DcmiInterface.h b/torch_npu/csrc/core/npu/interface/DcmiInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..0388e32d3220decc1173bcc6d0e230cafbd97a73 --- /dev/null +++ b/torch_npu/csrc/core/npu/interface/DcmiInterface.h @@ -0,0 +1,15 @@ +#pragma once + +#include "third_party/dcmi/inc/dcmi_interface_api.h" + +namespace c10_npu { +namespace dcmi { + +int DcmiInit(void); +int DcmiGetCardNumList(int *card_num, int *card_list, int list_len); +int DcmiGetAffinityCpuInfoByDeviceId(int card_id, int device_id, char *affinity_cpu, int *length); +int DcmiGetDeviceIdInCard(int card_id, int *device_id_max, int *mcu_id, int *cpu_id); + +} + +} \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/interface/LcclInterface.cpp b/torch_npu/csrc/core/npu/interface/LcclInterface.cpp index dccfe342cea86675effdcc9090fefe2c9296e666..326bd5b01df5f172fc19501b7cd52a8973d61f21 100644 --- a/torch_npu/csrc/core/npu/interface/LcclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/LcclInterface.cpp @@ -30,7 +30,7 @@ int LcclCommInitRankLocal(int rankSize, int rank, LcclComm *comms) typedef int(*lcalCommInitRankLocal)(int, int, LcclComm *); static lcalCommInitRankLocal func = nullptr; if (func == nullptr) { - func = (lcalCommInitRankLocal)GET_FUNC(LcclCommInitRankLocal); + func = (lcalCommInitRankLocal)GET_FUNC(LcalCommInitRankLocal); if (func == nullptr) { TORCH_CHECK(func, "Failed to find function ", "lcalCommInitRankLocal", PTA_ERROR(ErrCode::NOT_FOUND)); return -1; @@ -44,7 +44,7 @@ int LcclCommInit(int rank, int rankSize, LcclComm *comms) typedef 
int(*lcalCommInit)(int, int, LcclComm *); static lcalCommInit func = nullptr; if (func == nullptr) { - func = (lcalCommInit)GET_FUNC(LcclCommInit); + func = (lcalCommInit)GET_FUNC(LcalCommInit); if (func == nullptr) { TORCH_CHECK(func, "Failed to find function ", "lcalCommInit", PTA_ERROR(ErrCode::NOT_FOUND)); return -1; diff --git a/torch_npu/csrc/core/npu/interface/OpInterface.cpp b/torch_npu/csrc/core/npu/interface/OpInterface.cpp index 4026cd99787c5fc0e8519726ab5e0ef79692b9f7..e950ee9f931b3a3fe24c5cd9b0ea4abdaa30be63 100644 --- a/torch_npu/csrc/core/npu/interface/OpInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/OpInterface.cpp @@ -24,14 +24,5 @@ bool IsExistAclnnSilentCheck() return isExist; } -bool IsExistAclnnSilentCheckV2() -{ - const static bool isExistV2 = []() -> bool { - static auto func = GET_FUNC(aclnnSilentCheckV2); - return func != nullptr; - }(); - return isExistV2; -} - } // namespace opapi } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/interface/OpInterface.h b/torch_npu/csrc/core/npu/interface/OpInterface.h index 1a5f205e8407dd630a0a12dc188e868165fa6109..663f9a6144ed52569d2c92780c42e70c9ddff38d 100644 --- a/torch_npu/csrc/core/npu/interface/OpInterface.h +++ b/torch_npu/csrc/core/npu/interface/OpInterface.h @@ -7,10 +7,5 @@ namespace opapi { */ bool IsExistAclnnSilentCheck(); -/** - * This API is used to check whether aclnnSilentCheckV2 exist. -*/ -bool IsExistAclnnSilentCheckV2(); - } // namespace opapi } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/FunctionLoader.cpp b/torch_npu/csrc/core/npu/register/FunctionLoader.cpp index 17930667c8cb00a405fd053e21da6160856e3d74..6ef031ce3ed686f58e00e7c9cbfe0d5302d953ed 100644 --- a/torch_npu/csrc/core/npu/register/FunctionLoader.cpp +++ b/torch_npu/csrc/core/npu/register/FunctionLoader.cpp @@ -5,22 +5,25 @@ namespace c10_npu { namespace option { - -FunctionLoader::FunctionLoader(const std::string& name) { +FunctionLoader::FunctionLoader(const std::string &name) +{ this->fileName = name + ".so"; } -FunctionLoader::~FunctionLoader() { +FunctionLoader::~FunctionLoader() +{ if (this->handle != nullptr) { dlclose(this->handle); } } -void FunctionLoader::Set(const std::string& name) { +void FunctionLoader::Set(const std::string &name) +{ this->registry[name] = nullptr; } -void* FunctionLoader::Get(const std::string& name) { +void *FunctionLoader::Get(const std::string &name) +{ if (this->handle == nullptr) { auto handle = dlopen(this->fileName.c_str(), RTLD_LAZY | RTLD_GLOBAL); if (handle == nullptr) { @@ -49,40 +52,44 @@ void* FunctionLoader::Get(const std::string& name) { } namespace register_function { - FunctionRegister* FunctionRegister::GetInstance() { - static FunctionRegister instance; - return &instance; - } - void FunctionRegister::Register(const std::string& name, ::std::unique_ptr& ptr) { - std::lock_guard lock(mu_); - registry.emplace(name, std::move(ptr)); - } +FunctionRegister *FunctionRegister::GetInstance() +{ + static FunctionRegister instance; + return &instance; +} +void FunctionRegister::Register(const std::string &name, ::std::unique_ptr &ptr) +{ + std::lock_guard lock(mu_); + registry.emplace(name, std::move(ptr)); +} - void FunctionRegister::Register(const std::string& name, const std::string& funcName) { - auto itr = registry.find(name); - if (itr == registry.end()) { - AT_ERROR(name, " library should register first."); - return; - } - itr->second->Set(funcName); +void FunctionRegister::Register(const std::string &name, const std::string &funcName) +{ + auto itr = 
registry.find(name); + if (itr == registry.end()) { + AT_ERROR(name, " library should register first."); + return; } + itr->second->Set(funcName); +} - void* FunctionRegister::Get(const std::string& soName, const std::string& funcName) { - auto itr = registry.find(soName); - if (itr != registry.end()) { - return itr->second->Get(funcName); - } - return nullptr; +void *FunctionRegister::Get(const std::string &soName, const std::string &funcName) +{ + auto itr = registry.find(soName); + if (itr != registry.end()) { + return itr->second->Get(funcName); } + return nullptr; +} - FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string& name, ::std::unique_ptr& ptr) { - FunctionRegister::GetInstance()->Register(name, ptr); - } - FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string& soName, const std::string& funcName) { - FunctionRegister::GetInstance()->Register(soName, funcName); - } +FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string &name, ::std::unique_ptr &ptr) noexcept +{ + FunctionRegister::GetInstance()->Register(name, ptr); +} +FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string &soName, const std::string &funcName) noexcept +{ + FunctionRegister::GetInstance()->Register(soName, funcName); +} } // namespace register_function - - } // namespace option } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/FunctionLoader.h b/torch_npu/csrc/core/npu/register/FunctionLoader.h index 489243b1b1a830b5b9c6cecae5313dc0df33bab0..722a78a48a15e014309b7d9f666ff1df87b4cd64 100644 --- a/torch_npu/csrc/core/npu/register/FunctionLoader.h +++ b/torch_npu/csrc/core/npu/register/FunctionLoader.h @@ -72,11 +72,11 @@ public: /** ctr */ - FunctionRegisterBuilder(const std::string& name, ::std::unique_ptr& ptr); + FunctionRegisterBuilder(const std::string& name, ::std::unique_ptr& ptr) noexcept; /** ctr */ - FunctionRegisterBuilder(const std::string& soName, const std::string& funcName); + FunctionRegisterBuilder(const std::string& soName, const std::string& funcName) noexcept; }; // class FunctionRegisterBuilder } // namespace register_function diff --git a/torch_npu/csrc/core/npu/register/OptionRegister.cpp b/torch_npu/csrc/core/npu/register/OptionRegister.cpp index e37543bf83d93d0ea8a089d09370df9027ac5f21..8f7f17a0114a517ef7f5ef4b201b1bf749274210 100644 --- a/torch_npu/csrc/core/npu/register/OptionRegister.cpp +++ b/torch_npu/csrc/core/npu/register/OptionRegister.cpp @@ -7,12 +7,13 @@ namespace c10_npu { namespace option { - -OptionInterface::OptionInterface(OptionCallBack callback) { +OptionInterface::OptionInterface(OptionCallBack callback) +{ this->callback = callback; } -void OptionInterface::Set(const std::string& in) { +void OptionInterface::Set(const std::string &in) +{ this->val = in; if (this->callback != nullptr) { if (c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { @@ -25,24 +26,27 @@ void OptionInterface::Set(const std::string& in) { } } -std::string OptionInterface::Get() { +std::string OptionInterface::Get() +{ return val; } namespace register_options { -OptionRegister* OptionRegister::GetInstance() { +OptionRegister *OptionRegister::GetInstance() +{ static OptionRegister instance; return &instance; } -void OptionRegister::Register(const std::string& name, - ::std::unique_ptr& ptr) { +void OptionRegister::Register(const std::string &name, ::std::unique_ptr &ptr) +{ std::lock_guard lock(mu_); registry.emplace(name, std::move(ptr)); } -void OptionRegister::Set(const std::string& name, const std::string& val) { +void 
OptionRegister::Set(const std::string &name, const std::string &val) +{ auto itr = registry.find(name); if (itr != registry.end()) { itr->second->Set(val); @@ -51,7 +55,8 @@ void OptionRegister::Set(const std::string& name, const std::string& val) { } } -c10::optional OptionRegister::Get(const std::string& name) { +c10::optional OptionRegister::Get(const std::string &name) +{ auto itr = registry.find(name); if (itr != registry.end()) { return itr->second->Get(); @@ -59,17 +64,16 @@ c10::optional OptionRegister::Get(const std::string& name) { return c10::nullopt; // default value } -OptionInterfaceBuilder::OptionInterfaceBuilder( - const std::string& name, - ::std::unique_ptr& ptr, - const std::string& type) { +OptionInterfaceBuilder::OptionInterfaceBuilder(const std::string &name, ::std::unique_ptr &ptr, + const std::string &type) +{ OptionRegister::GetInstance()->Register(name, ptr); // init the value if env variable. if (type == "env") { std::string env_name = name; std::transform(env_name.begin(), env_name.end(), env_name.begin(), ::toupper); - char* env_val = std::getenv(env_name.c_str()); + char *env_val = std::getenv(env_name.c_str()); if (env_val != nullptr) { std::string val(env_val); OptionRegister::GetInstance()->Set(name, val); @@ -78,19 +82,21 @@ OptionInterfaceBuilder::OptionInterfaceBuilder( } } // namespace register_options -void SetOption(const std::string& key, const std::string& val) { +void SetOption(const std::string &key, const std::string &val) +{ register_options::OptionRegister::GetInstance()->Set(key, val); } -void SetOption(const std::map& options) { +void SetOption(const std::map &options) +{ for (auto item : options) { SetOption(item.first, item.second); } } -c10::optional GetOption(const std::string& key) { +c10::optional GetOption(const std::string &key) +{ return register_options::OptionRegister::GetInstance()->Get(key); } - } // namespace option } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 2ef8685b65b01bcdc1241262b734b6a3fa8302d2..0fd986e7960c9ad36a49832d08dce0012cb96af3 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -256,7 +256,7 @@ bool OptionsManager::CheckStatusSaveEnable() return CheckStatusSaveEnable; } -std::string OptionsManager::GetStatusSavePath() +std::string OptionsManager::GetStatusSavePath() noexcept { char* status_save_val = std::getenv("TORCH_HCCL_STATUS_SAVE_PATH"); std::string status_save_path = (status_save_val != nullptr) ? std::string(status_save_val) : "/tmp"; @@ -267,11 +267,11 @@ uint32_t OptionsManager::GetStatusSaveInterval() { const static uint32_t status_save_interval = []() -> uint32_t { char* env_val = std::getenv("TORCH_HCCL_STATUS_SAVE_INTERVAL"); - int64_t envFlag = 30; + int64_t envFlag = 2; if (env_val != nullptr) { envFlag = strtol(env_val, nullptr, 10); if (envFlag <= 0) { - envFlag = 30; + envFlag = 2; TORCH_NPU_WARN_ONCE("Get env TORCH_HCCL_STATUS_SAVE_INTERVAL less than or equal to 0, so reset it to the default value."); } } @@ -483,11 +483,30 @@ uint32_t OptionsManager::GetAclOpInitMode() char* buf_val = std::getenv("ACL_OP_INIT_MODE"); // Default 0 int64_t acl_op_init_mode = (buf_val != nullptr) ? 
strtol(buf_val, nullptr, 10) : 0; + std::unordered_map aclOpInitMode = getAclOpInitMode(); + if (aclOpInitMode.find(acl_op_init_mode) == aclOpInitMode.end()) { + TORCH_NPU_WARN_ONCE("Get env ACL_OP_INIT_MODE not in [0, 1, 2], so reset it to the default value 0."); + } return static_cast(acl_op_init_mode); }(); return acl_op_init_mode; } +uint32_t OptionsManager::GetStreamsPerDevice() +{ + const static uint32_t streams_per_device = []() -> uint32_t { + char* buf_val = std::getenv("STREAMS_PER_DEVICE"); + // Default 8 + int64_t streams_per_device = (buf_val != nullptr) ? strtol(buf_val, nullptr, 10) : 8; + if (streams_per_device != 8 && streams_per_device != 32) { + streams_per_device = 8; + TORCH_NPU_WARN_ONCE("STREAMS_PER_DEVICE only support 8 or 32, but get other value, so reset it to the default value 8"); + } + return static_cast(streams_per_device); + }(); + return streams_per_device; +} + char* OptionsManager::GetCpuAffinityConf() { return std::getenv("CPU_AFFINITY_CONF"); @@ -602,5 +621,14 @@ bool OptionsManager::IsOomSnapshotEnable() return (envFlag != 0); } +bool OptionsManager::ShouldPrintLessError() +{ + static bool should_print = []() -> bool { + int32_t disabled_error = OptionsManager::GetBoolTypeOption("TORCH_NPU_COMPACT_ERROR_OUTPUT"); + return disabled_error != 0; + }(); + return should_print; +} + } // namespace option } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 9854b48eb24242211e4963dcd787833ae18e8a60..5be33e06daae47716164f4ad7299afabd8c3426c 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -113,7 +113,7 @@ public: static int64_t GetRankId(); static char *GetNslbPath(); static bool CheckStatusSaveEnable(); - static std::string GetStatusSavePath(); + static std::string GetStatusSavePath() noexcept; static uint32_t GetStatusSaveInterval(); static uint32_t GetNslbCntVal(); static bool CheckGeInitDisable(); @@ -127,11 +127,13 @@ public: static uint32_t GetP2PBufferSize(); static uint32_t GetTaskQueueEnable(); static uint32_t GetAclOpInitMode(); + static uint32_t GetStreamsPerDevice(); static char* GetCpuAffinityConf(); static bool CheckForceUncached(); static std::string GetOomSnapshotDumpPath(); static bool IsOomSnapshotEnable(); static bool ShouldPrintWarning(); + static bool ShouldPrintLessError(); private: static int GetBoolTypeOption(const char* env_str, int defaultVal = 0); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 24ff7bd8fa24404cda6df7ce78e2b379b13f4d8b..b3bf8185d4f0c18df99c8b179262fe1fcb79200c 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -178,7 +178,7 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) } NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); - NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); // lazy call for the setoption for (const auto &iter: lazy_fn_) { @@ -190,8 +190,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); - GetAffinityInfo(); - init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); diff --git a/torch_npu/csrc/distributed/HCCLUtils.cpp b/torch_npu/csrc/distributed/HCCLUtils.cpp index 
ad0d86d3204bbd0f7bf5d5d937bab4af82791708..74c2334adeb31dde0881f175b5992fd0933da2e7 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.cpp +++ b/torch_npu/csrc/distributed/HCCLUtils.cpp @@ -1,7 +1,8 @@ #include +#include -#include "torch_npu/csrc/distributed/HCCLUtils.hpp" #include "torch_npu/csrc/core/npu/interface/HcclInterface.h" +#include "torch_npu/csrc/distributed/HCCLUtils.hpp" namespace c10d_npu { @@ -103,7 +104,7 @@ HcclDataType getHcclDataType(at::ScalarType type) std::string getHcclDataTypeSerialString(HcclDataType type) { const auto& iter = kHcclDataTypeToStringMap.find(type); - if (iter != kHcclDataTypeToStringMap.end()) { + if (iter != kHcclDataTypeToStringMap.cend()) { return iter->second; } else { TORCH_NPU_WARN_ONCE("Can not serialize undefined hccl data type."); @@ -116,4 +117,152 @@ bool isSupportHcclCommName() return at_npu::hccl::isHcclFeatureSupported(HcclCommConfigCapability::HCCL_COMM_CONFIG_COMM_NAME); } +HCCLComm::HCCLComm(HcclComm hcclComm) : hcclComm_(hcclComm), hcclAsyncErr_(HCCL_SUCCESS) {} + +HCCLComm::~HCCLComm() +{ + destroyHcclComm(); +} + +std::shared_ptr HCCLComm::create( + int numRanks, + int rank, + HcclRootInfo& rootInfo) +{ + auto comm = std::make_shared(); + HCCL_CHECK_ERROR(HcclCommInitRootInfo(numRanks, &rootInfo, rank, &(comm->hcclComm_))); + c10_npu::NpuSysCtrl::GetInstance().RegisterReleaseFn([=]() ->void {comm->destroyHcclComm();}, + c10_npu::ReleasePriority::PriorityMiddle); + return comm; +} + +std::shared_ptr HCCLComm::create_config( + int numRanks, + int rank, + HcclRootInfo& rootInfo, + HcclCommConfig* config) +{ + auto comm = std::make_shared(); + HCCL_CHECK_ERROR(hcclCommInitRootInfoConfig(numRanks, &rootInfo, rank, config, &(comm->hcclComm_))); + c10_npu::NpuSysCtrl::GetInstance().RegisterReleaseFn([=]() ->void {comm->destroyHcclComm();}, + c10_npu::ReleasePriority::PriorityMiddle); + return comm; +} + +std::shared_ptr HCCLComm::createGlobalHcclComm( + const char *clusterInfo, + uint32_t rank, + HcclCommConfig* config) +{ + auto comm = std::make_shared(); + if (hcclCommInitClusterInfoConfig(clusterInfo, rank, config, &(comm->hcclComm_)) != HCCL_SUCCESS) { + return nullptr; + } + c10_npu::NpuSysCtrl::GetInstance().RegisterReleaseFn([=]() ->void {comm->destroyHcclComm();}, + c10_npu::ReleasePriority::PriorityMiddle); + return comm; +} + +std::shared_ptr HCCLComm::createSubHcclComm( + std::shared_ptr comm, + uint32_t rankNum, + uint32_t *rankIds, + uint64_t subCommId, + uint32_t subCommRankId, + HcclCommConfig* config) +{ + auto subComm = std::make_shared(); + if (hcclCreateSubCommConfig(&(comm->hcclComm_), rankNum, rankIds, subCommId, subCommRankId, + config, &(subComm->hcclComm_)) != HCCL_SUCCESS) { + return nullptr; + } + c10_npu::NpuSysCtrl::GetInstance().RegisterReleaseFn([=]() ->void {subComm->destroyHcclComm();}, + c10_npu::ReleasePriority::PriorityMiddle); + return subComm; +} + +// Move constructable +HCCLComm::HCCLComm(HCCLComm&& other) +{ + std::swap(hcclComm_, other.hcclComm_); + std::swap(hcclAsyncErr_, other.hcclAsyncErr_); +} + +// Move assignable +HCCLComm& HCCLComm::operator=(HCCLComm&& other) +{ + std::swap(hcclComm_, other.hcclComm_); + std::swap(hcclAsyncErr_, other.hcclAsyncErr_); + return *this; +} + +void HCCLComm::destroyHcclComm() +{ + std::unique_lock lock(mutex_); + if (hcclComm_) { + HcclCommDestroy(hcclComm_); + hcclComm_ = nullptr; + } +} + +HcclResult HCCLComm::checkForHcclError() +{ + std::unique_lock lock(mutex_); +#ifdef ENABLE_HCCL_ERROR_CHECKING + if (hcclAsyncErr_ != HCCL_SUCCESS) { + return 
hcclAsyncErr_; + } + if (hcclComm_ != nullptr) { + C10D_HCCL_CHECK(hcclGetCommAsyncError(hcclComm_, &hcclAsyncErr_)); + } + return hcclAsyncErr_; +#else + // Always return success, if error checks are disabled. + return HCCL_SUCCESS; +#endif } + +void DebugInfoWriter::write(const std::string &hcclTrace) +{ + // Open a file for writing. The ios::binary flag is used to write data as + // binary. + std::ofstream file(filename_, std::ios::binary); + + // Check if the file was opened successfully. + if (!file.is_open()) { + LOG(ERROR) << "Error opening file for writing HCCLPG debug info: " + << filename_; + return; + } + + file.write(hcclTrace.data(), hcclTrace.size()); + LOG(INFO) << "Finished writing HCCLPG debug info to " << filename_; +} + +DebugInfoWriter &DebugInfoWriter::getWriter(int rank) +{ + if (writer_ == nullptr) { + std::string fileNamePrefix = getCvarString( + {"TORCH_HCCL_DEBUG_INFO_TEMP_FILE"}, "/tmp/hccl_trace_rank_"); + // Using std::unique_ptr here to auto-delete the writer object + // when the pointer itself is destroyed. + std::unique_ptr writerPtr( + new DebugInfoWriter(fileNamePrefix, rank)); + DebugInfoWriter::registerWriter(std::move(writerPtr)); + } + return *writer_; +} + +void DebugInfoWriter::registerWriter(std::unique_ptr writer) +{ + TORCH_CHECK_WITH( + DistBackendError, + hasWriterRegistered_.load() == false, + "debugInfoWriter already registered"); + hasWriterRegistered_.store(true); + writer_ = std::move(writer); +} + +std::unique_ptr DebugInfoWriter::writer_ = nullptr; +std::atomic DebugInfoWriter::hasWriterRegistered_(false); +} // namespace c10d_npu diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index 93e6c24bfdf9a88f224510aba7eb7aae445f3199..ffd645a7c50575b8dfe9f3d2652e316cf7723a39 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -1,11 +1,11 @@ #pragma once +#include +#include +#include #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/NPUException.h" -#include -#include -#include #include #include @@ -17,6 +17,17 @@ auto Error = err_code; \ if ((Error) != HCCL_SUCCESS) { \ CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(Error); \ + if (c10_npu::option::OptionsManager::ShouldPrintLessError()) { \ + std::ostringstream oss; \ + oss << " HCCL function error: " << getErrorFunction(#err_code, ##__VA_ARGS__) \ + << ", error code is " << Error << " " \ + << DIST_ERROR(ErrCode::HCCL) + ".\n"; \ + std::string err_msg = oss.str(); \ + ASCEND_LOGE("%s", err_msg.c_str()); \ + TORCH_CHECK( \ + false, \ + c10_npu::c10_npu_get_error_message()); \ + } else { \ TORCH_CHECK( \ false, \ __func__, \ @@ -27,8 +38,9 @@ " HCCL function error: ", getErrorFunction(#err_code, ##__VA_ARGS__), \ ", error code is ", Error, \ DIST_ERROR(ErrCode::HCCL) + ".\n" + \ - c10_npu::acl::AclGetErrMsg()); \ + c10_npu::c10_npu_get_error_message()); \ } \ + } \ } while (0) #define ENABLE_HCCL_ERROR_CHECKING @@ -51,6 +63,7 @@ extern HcclResult hcclCommInitRootInfoConfig(uint32_t nRanks, const HcclRootInfo extern HcclResult hcclCommInitClusterInfoConfig(const char *clusterInfo, uint32_t rank, HcclCommConfig *config, HcclComm *comm); extern HcclResult hcclCreateSubCommConfig(HcclComm *comm, uint32_t rankNum, uint32_t *rankIds, uint64_t subCommId, uint32_t subCommRankId, HcclCommConfig* config, HcclComm *subComm); +extern HcclResult hcclCommWorkingDevNicSet(HcclComm comm, uint32_t *ranks, bool *useBackup, uint32_t 
nRanks); // Provides additional detail into HCCL error codes based on when these are // thrown in the HCCL codebase. @@ -91,7 +104,7 @@ inline std::string getCvarString( const char *val = std::getenv(env[i].c_str()); if (val == nullptr) { continue; - } else if (i) { + } else if (i != 0) { WARN_ENV_VAR_ONCE(env[i], env[0]); } ret = val; @@ -112,10 +125,10 @@ inline int getCvarInt(const std::vector &env, int def) * versions of a variable get higher priority than the latter * versions of the same variable */ for (ssize_t i = static_cast(env.size()) - 1; i >= 0; i--) { - char *val = std::getenv(env[i].c_str()); + const char *val = std::getenv(env[i].c_str()); if (val == nullptr) { continue; - } else if (i) { + } else if (i != 0) { WARN_ENV_VAR_ONCE(env[i], env[0]); } try { @@ -139,10 +152,10 @@ inline bool getCvarBool(const std::vector &env, bool def) * versions of a variable get higher priority than the latter * versions of the same variable */ for (ssize_t i = static_cast(env.size()) - 1; i >= 0; i--) { - char *val_ = std::getenv(env[i].c_str()); + const char *val_ = std::getenv(env[i].c_str()); if (val_ == nullptr) { continue; - } else if (i) { + } else if (i != 0) { WARN_ENV_VAR_ONCE(env[i], env[0]); } @@ -168,55 +181,27 @@ inline bool getCvarBool(const std::vector &env, bool def) } // RAII wrapper for HCCL communicator -class HCCLComm { +class C10_NPU_API HCCLComm { public: - explicit HCCLComm(HcclComm hcclComm) : hcclComm_(hcclComm), hcclAsyncErr_(HCCL_SUCCESS) {} - + explicit HCCLComm(HcclComm hcclComm); HCCLComm() : HCCLComm(nullptr) {} - - ~HCCLComm() - { - destroyHcclComm(); - } + ~HCCLComm(); static std::shared_ptr create( int numRanks, int rank, - HcclRootInfo& rootInfo) - { - auto comm = std::make_shared(); - HCCL_CHECK_ERROR(HcclCommInitRootInfo(numRanks, &rootInfo, rank, &(comm->hcclComm_))); - c10_npu::NpuSysCtrl::GetInstance().RegisterReleaseFn([=]() ->void {comm->destroyHcclComm();}, - c10_npu::ReleasePriority::PriorityMiddle); - return comm; - } + HcclRootInfo& rootInfo); static std::shared_ptr create_config( int numRanks, int rank, HcclRootInfo& rootInfo, - HcclCommConfig* config) - { - auto comm = std::make_shared(); - HCCL_CHECK_ERROR(hcclCommInitRootInfoConfig(numRanks, &rootInfo, rank, config, &(comm->hcclComm_))); - c10_npu::NpuSysCtrl::GetInstance().RegisterReleaseFn([=]() ->void {comm->destroyHcclComm();}, - c10_npu::ReleasePriority::PriorityMiddle); - return comm; - } + HcclCommConfig* config); static std::shared_ptr createGlobalHcclComm( const char *clusterInfo, uint32_t rank, - HcclCommConfig* config) - { - auto comm = std::make_shared(); - if (hcclCommInitClusterInfoConfig(clusterInfo, rank, config, &(comm->hcclComm_)) != HCCL_SUCCESS) { - return nullptr; - } - c10_npu::NpuSysCtrl::GetInstance().RegisterReleaseFn([=]() ->void {comm->destroyHcclComm();}, - c10_npu::ReleasePriority::PriorityMiddle); - return comm; - } + HcclCommConfig* config); static std::shared_ptr createSubHcclComm( std::shared_ptr comm, @@ -224,67 +209,29 @@ public: uint32_t *rankIds, uint64_t subCommId, uint32_t subCommRankId, - HcclCommConfig* config) - { - auto subComm = std::make_shared(); - if (hcclCreateSubCommConfig(&(comm->hcclComm_), rankNum, rankIds, subCommId, subCommRankId, - config, &(subComm->hcclComm_)) != HCCL_SUCCESS) { - return nullptr; - } - c10_npu::NpuSysCtrl::GetInstance().RegisterReleaseFn([=]() ->void {subComm->destroyHcclComm();}, - c10_npu::ReleasePriority::PriorityMiddle); - return subComm; - } + HcclCommConfig* config); + + int hcclCommType; + int p2pPeer; // Must 
not be copyable HCCLComm(const HCCLComm&) = delete; HCCLComm& operator=(const HCCLComm&) = delete; // Move constructable - HCCLComm(HCCLComm&& other) - { - std::swap(hcclComm_, other.hcclComm_); - std::swap(hcclAsyncErr_, other.hcclAsyncErr_); - } + HCCLComm(HCCLComm&& other); // Move assignable - HCCLComm& operator=(HCCLComm&& other) - { - std::swap(hcclComm_, other.hcclComm_); - std::swap(hcclAsyncErr_, other.hcclAsyncErr_); - return *this; - } + HCCLComm& operator=(HCCLComm&& other); HcclComm getHcclComm() const { return hcclComm_; } - void destroyHcclComm() - { - std::unique_lock lock(mutex_); - if (hcclComm_) { - HcclCommDestroy(hcclComm_); - hcclComm_ = nullptr; - } - } + void destroyHcclComm(); - HcclResult checkForHcclError() - { - std::unique_lock lock(mutex_); -#ifdef ENABLE_HCCL_ERROR_CHECKING - if (hcclAsyncErr_ != HCCL_SUCCESS) { - return hcclAsyncErr_; - } - if (hcclComm_ != nullptr) { - C10D_HCCL_CHECK(hcclGetCommAsyncError(hcclComm_, &hcclAsyncErr_)); - } - return hcclAsyncErr_; -#else - // Always return success, if error checks are disabled. - return HCCL_SUCCESS; -#endif - } + HcclResult checkForHcclError(); protected: HcclComm hcclComm_; @@ -294,7 +241,7 @@ protected: class TORCH_API DebugInfoWriter { public: - virtual ~DebugInfoWriter(); + virtual ~DebugInfoWriter() = default; virtual void write(const std::string &hcclTrace); static DebugInfoWriter &getWriter(int rank); static void registerWriter(std::unique_ptr writer); diff --git a/torch_npu/csrc/distributed/HcclCompile.h b/torch_npu/csrc/distributed/HcclCompile.h index d0a8696aa25a352f360b020739e9b335115ca003..a63ad73696038810cb394b9aff3aacde938b2c54 100644 --- a/torch_npu/csrc/distributed/HcclCompile.h +++ b/torch_npu/csrc/distributed/HcclCompile.h @@ -26,12 +26,13 @@ LOAD_FUNCTION(HcclCommInitRootInfoConfig) LOAD_FUNCTION(HcclGetCommConfigCapability) LOAD_FUNCTION(HcclCommInitClusterInfoConfig) LOAD_FUNCTION(HcclCreateSubCommConfig) +LOAD_FUNCTION(HcclCommWorkingDevNicSet) extern HcclResult hcclAlltoAllV(const void *sendBuf, const void *sendCounts, const void *sdispls, HcclDataType sendType, const void *recvBuf, const void *recvCounts, const void *rdispls, HcclDataType recvType, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclAlltoAllVFunc)( + using HcclAlltoAllVFunc = HcclResult(*)( const void *, const void *, const void *, HcclDataType, const void *, const void *, const void *, HcclDataType, HcclComm, aclrtStream); @@ -49,7 +50,7 @@ extern HcclResult hcclAllGatherV(const void *sendBuf, uint64_t sendCount, const void *recvBuf, const void *recvCounts, const void *rdispls, HcclDataType dataType, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclAllGatherVFunc)( + using HcclAllGatherVFunc = HcclResult(*)( const void *, uint64_t, const void *, const void *, const void *, HcclDataType, HcclComm, aclrtStream); @@ -66,7 +67,7 @@ extern HcclResult hcclReduceScatterV(const void *sendBuf, const void *sendCounts const void *recvBuf, uint64_t recvCount, HcclDataType dataType, HcclReduceOp op, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclReduceScatterVFunc)( + using HcclReduceScatterVFunc = HcclResult(*)( const void *, const void *, const void *, const void *, uint64_t, HcclDataType, HcclReduceOp, HcclComm, aclrtStream); @@ -82,7 +83,7 @@ extern HcclResult hcclReduceScatterV(const void *sendBuf, const void *sendCounts extern HcclResult hcclReduce(void *sendBuf, void *recvBuf, uint64_t count, HcclDataType sendType, HcclReduceOp op, uint32_t root, HcclComm comm, aclrtStream stream) { 
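
The `typedef`-to-`using` conversions in HcclCompile.h do not change behaviour; every wrapper keeps the same lazy-binding shape: resolve the symbol once through `GET_FUNC`, cache it in a function-local `static`, and fail with `TORCH_CHECK` only if the installed HCCL does not export the entry point. A generic sketch of that shape, using a hypothetical symbol name `HcclSomeFeature` purely for illustration, is shown below; it assumes the file's `LOAD_FUNCTION`/`GET_FUNC` machinery:

```cpp
// LOAD_FUNCTION(HcclSomeFeature)   // the symbol would also be registered at the top of the file

// Generic shape of the HcclCompile.h wrappers; HcclSomeFeature is hypothetical.
HcclResult hcclSomeFeature(HcclComm comm, uint32_t value)
{
    using HcclSomeFeatureFunc = HcclResult (*)(HcclComm, uint32_t);
    static HcclSomeFeatureFunc func = nullptr;                 // resolved at most once per process
    if (func == nullptr) {
        func = (HcclSomeFeatureFunc)GET_FUNC(HcclSomeFeature); // dlsym-style lookup via the FunctionLoader registry
    }
    TORCH_CHECK(func, "Failed to find function ", "HcclSomeFeature", DIST_ERROR(ErrCode::NOT_FOUND));
    return func(comm, value);
}
```

Deferring the lookup this way lets the extension load against older HCCL runtimes and only report a missing feature when the corresponding wrapper is actually invoked.
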
- typedef HcclResult(*HcclReduceVFunc)( + using HcclReduceVFunc = HcclResult(*)( void *, void *, uint64_t, HcclDataType, HcclReduceOp, uint32_t, HcclComm, aclrtStream); static HcclReduceVFunc func = nullptr; if (func == nullptr) { @@ -95,7 +96,7 @@ extern HcclResult hcclReduce(void *sendBuf, void *recvBuf, uint64_t count, HcclD HcclResult hcclGetCommAsyncError(HcclComm comm, HcclResult* asyncError) { - typedef HcclResult(*HcclGetCommAsyncErrorVFunc)(HcclComm, HcclResult*); + using HcclGetCommAsyncErrorVFunc = HcclResult(*)(HcclComm, HcclResult*); static HcclGetCommAsyncErrorVFunc func = nullptr; if (func == nullptr) { func = (HcclGetCommAsyncErrorVFunc)GET_FUNC(HcclGetCommAsyncError); @@ -108,7 +109,7 @@ HcclResult hcclGetCommAsyncError(HcclComm comm, HcclResult* asyncError) HcclResult hcclScatter(void *sendBuf, void *recvBuf, uint64_t count, HcclDataType dataType, uint32_t root, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclScatterVFunc)(void *, void *, uint64_t, HcclDataType, uint32_t, HcclComm, aclrtStream); + using HcclScatterVFunc = HcclResult(*)(void *, void *, uint64_t, HcclDataType, uint32_t, HcclComm, aclrtStream); static HcclScatterVFunc func = nullptr; if (func == nullptr) { func = (HcclScatterVFunc)GET_FUNC(HcclScatter); @@ -120,7 +121,7 @@ HcclResult hcclScatter(void *sendBuf, void *recvBuf, uint64_t count, HcclDataTyp HcclResult hcclBatchIsendIrecv(void* sendRecvInfo, uint32_t itemNum, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclBatchIsendIrecvVFunc)( + using HcclBatchIsendIrecvVFunc = HcclResult(*)( void *, uint32_t, HcclComm, aclrtStream); static HcclBatchIsendIrecvVFunc func = nullptr; if (func == nullptr) { @@ -135,7 +136,7 @@ HcclResult hcclAlltoAll(const void *sendBuf, uint64_t sendCount, HcclDataType se const void *recvBuf, uint64_t recvCount, HcclDataType recvType, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclAlltoAllFunc)( + using HcclAlltoAllFunc = HcclResult(*)( const void *, uint64_t, HcclDataType, const void *, uint64_t, HcclDataType, HcclComm, aclrtStream); @@ -194,7 +195,7 @@ bool hcclReduceScatterVExist() HcclResult hcclCommInitRootInfoConfig(uint32_t nRanks, const HcclRootInfo *rootInfo, uint32_t rank, HcclCommConfig* config, HcclComm *comm) { - typedef HcclResult(*HcclCommInitRootInfoConfigFunc)( + using HcclCommInitRootInfoConfigFunc = HcclResult(*)( uint32_t, const HcclRootInfo *, uint32_t, HcclCommConfig*, HcclComm *); static HcclCommInitRootInfoConfigFunc func = nullptr; if (func == nullptr) { @@ -207,7 +208,7 @@ HcclResult hcclCommInitRootInfoConfig(uint32_t nRanks, const HcclRootInfo *rootI bool isHcclFeatureSupported(HcclCommConfigCapability configParameter) { - typedef uint32_t(*HcclGetCommConfigCapabilityFunc)(); + using HcclGetCommConfigCapabilityFunc = uint32_t(*)(); static HcclGetCommConfigCapabilityFunc func = (HcclGetCommConfigCapabilityFunc) GET_FUNC( HcclGetCommConfigCapability); if (func == nullptr) { @@ -227,7 +228,7 @@ bool hcclCommInitClusterInfoConfigExist() HcclResult hcclCommInitClusterInfoConfig(const char *clusterInfo, uint32_t rank, HcclCommConfig *config, HcclComm *comm) { - typedef HcclResult(*HcclCommInitClusterInfoConfigFunc)(const char *, uint32_t, HcclCommConfig *, HcclComm *); + using HcclCommInitClusterInfoConfigFunc = HcclResult(*)(const char *, uint32_t, HcclCommConfig *, HcclComm *); static HcclCommInitClusterInfoConfigFunc func = nullptr; if (func == nullptr) { func = (HcclCommInitClusterInfoConfigFunc)GET_FUNC(HcclCommInitClusterInfoConfig) @@ -249,7 +250,7 @@ bool 
hcclCreateSubCommConfigExist() HcclResult hcclCreateSubCommConfig(HcclComm *comm, uint32_t rankNum, uint32_t *rankIds, uint64_t subCommId, uint32_t subCommRankId, HcclCommConfig* config, HcclComm *subComm) { - typedef HcclResult(*HcclCreateSubCommConfigFunc)(HcclComm *, uint32_t, uint32_t *, uint64_t, uint32_t, HcclCommConfig *, HcclComm *); + using HcclCreateSubCommConfigFunc = HcclResult(*)(HcclComm *, uint32_t, uint32_t *, uint64_t, uint32_t, HcclCommConfig *, HcclComm *); static HcclCreateSubCommConfigFunc func = nullptr; if (func == nullptr) { func = (HcclCreateSubCommConfigFunc)GET_FUNC(HcclCreateSubCommConfig) @@ -258,4 +259,25 @@ HcclResult hcclCreateSubCommConfig(HcclComm *comm, uint32_t rankNum, uint32_t *r auto ret = func(comm, rankNum, rankIds, subCommId, subCommRankId, config, subComm); return ret; } + +bool hcclCommWorkingDevNicSetExist() +{ + const static bool isHcclCommWorkingDevNicSetExist = []() -> bool { + auto func = GET_FUNC(HcclCommWorkingDevNicSet) + return func != nullptr; + }(); + return isHcclCommWorkingDevNicSetExist; +} + +HcclResult hcclCommWorkingDevNicSet(HcclComm comm, uint32_t *ranks, bool *useBackup, uint32_t nRanks) +{ + using HcclCommWorkingDevNicSetFunc = HcclResult(*)(HcclComm, uint32_t *, bool *, uint32_t); + static HcclCommWorkingDevNicSetFunc func = nullptr; + if (func == nullptr) { + func = (HcclCommWorkingDevNicSetFunc)GET_FUNC(HcclCommWorkingDevNicSet) + } + TORCH_CHECK(func, "Failed to find function ", "HcclCommWorkingDevNicSet", DIST_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(comm, ranks, useBackup, nRanks); + return ret; +} } // namespace c10d_npu diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 3e975538ba2850cd8b26c5c3011d2702783ec4ce..30e0dce7cdb44a9c63aefec6efecb4265d9385b7 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -21,11 +21,11 @@ #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" #include "torch_npu/csrc/distributed/ProcessGroupLCCL.hpp" #include "torch_npu/csrc/distributed/reducer.hpp" -#include "torch_npu/csrc/distributed/Init.h" #include "torch_npu/csrc/distributed/ParallelTcpStore.hpp" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" +#include "torch_npu/csrc/distributed/Init.h" namespace { @@ -49,7 +49,7 @@ public: : impl_(c10::intrusive_ptr::unsafe_steal_from_new(impl)) {} ~IntrusivePtrNoGilDestructor() { if (impl_) { - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { pybind11::gil_scoped_release release; impl_.reset(); } else { @@ -95,7 +95,7 @@ using intrusive_ptr_no_gil_destructor_class_ = class BroadcastWork { public: - inline std::vector cast_tensors(at::TensorList tensors) + inline std::vector cast_tensors(at::TensorList tensors) const { static auto cast_back_to_ori_format = [](const at::Tensor &t) { return at_npu::native::custom_ops::npu_format_cast(t, torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); @@ -396,6 +396,12 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) .def("get_hccl_comm", &::c10d_npu::ProcessGroupHCCL::getHcclComm) .def("_set_hccl_comm_name", &::c10d_npu::ProcessGroupHCCL::setHcclCommName) .def("resume_hccl_comm", &::c10d_npu::ProcessGroupHCCL::resumeHcclComm) + .def("_set_switch_nic_comm", + &::c10d_npu::ProcessGroupHCCL::setSwitchNicComm, + py::arg("rankid"), + py::arg("nRanks"), + py::arg("ranks") = std::vector{}, + py::arg("useBackup") = std::vector{}) 
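Note: the `hcclCommWorkingDevNicSetExist()` / `hcclCommWorkingDevNicSet()` pair added above follows the pattern used throughout HcclCompile.h: resolve the optional HCCL symbol once, cache the function pointer, and only fail when the API is actually invoked. Below is a minimal, generic sketch of that pattern using plain `dlsym` and a placeholder symbol name; the repo's `GET_FUNC`/`LOAD_FUNCTION` macros are assumed, not reproduced.

```cpp
// Sketch only: probe-once, cache, and call-or-fail pattern for an optional runtime API.
// Symbol name "FakeApi" is a placeholder. Link with -ldl on older glibc.
#include <dlfcn.h>
#include <cstdio>

using FakeApiFunc = int (*)(int);

static void* LoadSymbol(const char* name)
{
    // RTLD_DEFAULT searches symbols already loaded into the process,
    // which mirrors probing an optionally present runtime API.
    return dlsym(RTLD_DEFAULT, name);
}

bool fakeApiExists()
{
    // Evaluated exactly once per process, on first use.
    static const bool exists = []() {
        return LoadSymbol("FakeApi") != nullptr;
    }();
    return exists;
}

int callFakeApi(int arg)
{
    static FakeApiFunc func = nullptr;
    if (func == nullptr) {
        func = reinterpret_cast<FakeApiFunc>(LoadSymbol("FakeApi"));
    }
    if (func == nullptr) {
        std::fprintf(stderr, "FakeApi is not available in this runtime\n");
        return -1;
    }
    return func(arg);
}
```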
.def("abort_hccl_comm", &::c10d_npu::ProcessGroupHCCL::abortAndClearHcclComm) .def("_delete_tcpstore_key", &::c10d_npu::ProcessGroupHCCL::deleteTCPStoreKey) .def("set_watchdog_status", &::c10d_npu::ProcessGroupHCCL::setWatchdogStatus) @@ -532,6 +538,22 @@ Example:: py::arg("wait_workers") = true, py::arg("multi_tenant") = false); + module.def("_dump_hccl_trace_json", + [](std::optional includeCollectives, + std::optional onlyActive) { + return py::bytes(::c10d_npu::dump_hccl_trace_json( + includeCollectives.value_or(true), onlyActive.value_or(false))); + }, + py::arg("includeCollectives") = std::optional(), + py::arg("onlyActive") = std::optional(), + R"( + Arguments: + includeCollectives(bool, optional): Whether to include collective work traces. Default is True. + onlyActive (bool, optional): Whether to only include active collective work traces. Default is False. + Returns: + Stringified json work traces. + Default settings return everything - i.e. contains HCCL comm dumps and collective traces. + )"); module.def("_dump_hccl_trace", [](std::optional includeCollectives, std::optional includeStackTraces, diff --git a/torch_npu/csrc/distributed/LCCLUtils.cpp b/torch_npu/csrc/distributed/LCCLUtils.cpp index b4e62d9f2991dc0c15ec7cb9067fa87be80ef4fb..913bb52d04abfd11abaf5ff2656f5aa359af3b8c 100644 --- a/torch_npu/csrc/distributed/LCCLUtils.cpp +++ b/torch_npu/csrc/distributed/LCCLUtils.cpp @@ -63,7 +63,7 @@ at_npu::lccl::LcclDataType getLcclDataType(at::ScalarType type) std::string getLcclDataTypeSerialString(at_npu::lccl::LcclDataType type) { const auto& iter = kLcclDataTypeToStringMap.find(type); - if (iter != kLcclDataTypeToStringMap.end()) { + if (iter != kLcclDataTypeToStringMap.cend()) { return iter->second; } else { TORCH_NPU_WARN_ONCE("Cannot serialize undefined LCCL data type."); diff --git a/torch_npu/csrc/distributed/ParallelStoreProxy.cpp b/torch_npu/csrc/distributed/ParallelStoreProxy.cpp index f61b82cbe190d1a11636b65e97a5d29049ff6753..1da5fa19cb5e3f85a64260e170c58f8a47a17784 100644 --- a/torch_npu/csrc/distributed/ParallelStoreProxy.cpp +++ b/torch_npu/csrc/distributed/ParallelStoreProxy.cpp @@ -1,11 +1,11 @@ #include -#include "ParallelStoreProxy.hpp" #include "ParallelTcpStore.hpp" #include "StoreClient.hpp" #include "c10/util/Exception.h" #include #include "StoreMessagePacker.hpp" #include "ParallelTcpServer.hpp" +#include "ParallelStoreProxy.hpp" namespace c10d { namespace torch_npu { @@ -129,5 +129,10 @@ int Proxy::LoopProcessData() noexcept return result; } + +int Proxy::SetReceiveTimeout(const std::chrono::milliseconds &value) const noexcept +{ + return tcpClient_->SetReceiveTimeout(value); +} } // torch_npu } // c10d \ No newline at end of file diff --git a/torch_npu/csrc/distributed/ParallelStoreProxy.hpp b/torch_npu/csrc/distributed/ParallelStoreProxy.hpp index 0646f6f5d7ee68aaa75c750ba6fbc90ce40eb721..be222710fd14bcffa45a18d571267c79dde65bf4 100644 --- a/torch_npu/csrc/distributed/ParallelStoreProxy.hpp +++ b/torch_npu/csrc/distributed/ParallelStoreProxy.hpp @@ -20,6 +20,7 @@ public: StoreMessage HandleLocalServerMessage(const int &fd, const torch_npu::StoreMessage &message) noexcept; void WriteData(const int &fd, std::vector &buf, int64_t &unpackSize) noexcept; int LoopProcessData() noexcept; + int SetReceiveTimeout(const std::chrono::milliseconds &value) const noexcept; private: const std::string host_{}; diff --git a/torch_npu/csrc/distributed/ParallelTcpServer.cpp b/torch_npu/csrc/distributed/ParallelTcpServer.cpp index 
e7047c0ce023c1439ed41e99b447cc37bd37214f..72e7ebf9a096c630e2029ac6e1abffe20cf5a465 100644 --- a/torch_npu/csrc/distributed/ParallelTcpServer.cpp +++ b/torch_npu/csrc/distributed/ParallelTcpServer.cpp @@ -16,11 +16,13 @@ #include #include #include +#include #include #include #include #include #include "c10/util/Logging.h" +#include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "ParallelTcpServer.hpp" namespace c10d { @@ -206,7 +208,7 @@ void ParallelTcpServer::WakeupWaitingClients(const std::string &key) noexcept return; } - for (auto it : pos->second) { + for (auto it : std::as_const(pos->second)) { if (--socketWaitKeyNum_[it] <= 0) { stopWaitingSockets.emplace_back(it); socketWaitKeyNum_.erase(it); @@ -226,18 +228,49 @@ void ParallelTcpServer::WakeupWaitingClients(const std::string &key) noexcept int ParallelTcpServer::CreateSocket(const std::string host, uint16_t port) noexcept { - struct sockaddr_in servAddr {}; - servAddr.sin_family = AF_INET; - servAddr.sin_addr.s_addr = inet_addr(host.c_str()); - servAddr.sin_port = htons(port); + auto sockFd = CreateSocketWithFamily(host, port, AF_INET); + if (sockFd >= 0) { + return sockFd; + } + + sockFd = CreateSocketWithFamily(host, port, AF_INET6); + if (sockFd >= 0) { + return sockFd; + } + return -1; +} + +int ParallelTcpServer::CreateSocketWithFamily(const std::string host, uint16_t port, int family) noexcept +{ + struct addrinfo hints = {0}; + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + + ::addrinfo* result = nullptr; + int r = ::getaddrinfo(host.c_str(), std::to_string(port).c_str(), &hints, &result); + if (r != 0) { + LOG(ERROR) << "getaddrinfo failed " << errno << " : " << strerror(errno); + return -1; + } + + for (::addrinfo* addr = result; addr != nullptr; addr = addr->ai_next) { + int sockFd = CreateSocketAndListen(*addr); + if (sockFd >= 0) { + return sockFd; + } + } + return -1; +} - auto sockFd = ::socket(AF_INET, SOCK_STREAM, 0); +int ParallelTcpServer::CreateSocketAndListen(const ::addrinfo &addr) noexcept +{ + auto sockFd = ::socket(addr.ai_family, addr.ai_socktype, addr.ai_protocol); if (sockFd < 0) { LOG(ERROR) << "create server socket fd failed " << errno << " : " << strerror(errno); return -1; } - auto ret = ::bind(sockFd, reinterpret_cast(&servAddr), sizeof(servAddr)); + auto ret = ::bind(sockFd, addr.ai_addr, addr.ai_addrlen); if (ret != 0) { LOG(ERROR) << "bind server socket fd failed " << errno << " : " << strerror(errno); close(sockFd); @@ -284,6 +317,11 @@ int ParallelTcpServer::CreateLocalSocket(const std::string &localSocketPath) noe return -1; } + if (!at_npu::native::NpuUtils::setFilePermissions(sockFd, S_IRUSR | S_IWUSR | S_IRGRP)) { + close(sockFd); + return -1; + } + ret = listen(sockFd, MAX_EVENT_COUNT); if (ret != 0) { LOG(ERROR) << "listen local socket fd failed " << errno << " : " << strerror(errno); @@ -332,7 +370,7 @@ int ParallelTcpServer::SetNonBlocking(int fd) noexcept return -1; } - auto ret = fcntl(fd, F_SETFL, old | O_NONBLOCK); + auto ret = fcntl(fd, F_SETFL, static_cast(old) | O_NONBLOCK); if (ret != 0) { LOG(ERROR) << "set fd flags failed " << errno << " : " << strerror(errno); return -1; @@ -413,7 +451,7 @@ void ParallelTcpServer::ProcessListenEvent() noexcept void ParallelTcpServer::ProcessClientEvent(int epFd, int fd, uint32_t event, std::unordered_map &ctx) noexcept { - if (event & (EPOLLRDHUP | EPOLLHUP)) { + if ((event & (EPOLLRDHUP | EPOLLHUP)) != 0) { epoll_ctl(epFd, EPOLL_CTL_DEL, fd, nullptr); close(fd); fd = -1; @@ -427,7 +465,7 @@ void 
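Note: the `CreateSocketWithFamily` / `CreateSocketAndListen` split above replaces the hard-coded `sockaddr_in` path with `getaddrinfo`, trying IPv4 first and falling back to IPv6. A condensed, free-standing sketch of the same flow (error handling trimmed; this version also frees the `addrinfo` list, which the class version may handle elsewhere):

```cpp
// Sketch only: resolve host:port for one address family, bind and listen on the
// first usable address, return the listening fd or -1.
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#include <cstdint>
#include <string>

int CreateListenSocket(const std::string& host, uint16_t port, int family)
{
    addrinfo hints{};
    hints.ai_family = family;        // caller tries AF_INET first, then AF_INET6
    hints.ai_socktype = SOCK_STREAM;

    addrinfo* result = nullptr;
    if (getaddrinfo(host.c_str(), std::to_string(port).c_str(), &hints, &result) != 0) {
        return -1;
    }

    int fd = -1;
    for (addrinfo* addr = result; addr != nullptr; addr = addr->ai_next) {
        fd = socket(addr->ai_family, addr->ai_socktype, addr->ai_protocol);
        if (fd < 0) {
            continue;
        }
        if (bind(fd, addr->ai_addr, addr->ai_addrlen) == 0 && listen(fd, 128) == 0) {
            break;                   // first address that binds and listens wins
        }
        close(fd);
        fd = -1;
    }
    freeaddrinfo(result);
    return fd;
}
```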
ParallelTcpServer::ProcessClientEvent(int epFd, int fd, uint32_t event, } auto setEvents = pos->second.currentEvents_; - if (event & EPOLLIN) { + if ((event & EPOLLIN) != 0) { pos->second.ReceiveData(); while (pos->second.HasNextReq()) { auto response = process_(fd, pos->second.NextRequest()); @@ -443,7 +481,7 @@ void ParallelTcpServer::ProcessClientEvent(int epFd, int fd, uint32_t event, } } - if (event & EPOLLOUT) { + if ((event & EPOLLOUT) != 0) { pos->second.FlushSendBuf(); setEvents = EPOLLIN | EPOLLRDHUP | EPOLLHUP; } diff --git a/torch_npu/csrc/distributed/ParallelTcpServer.hpp b/torch_npu/csrc/distributed/ParallelTcpServer.hpp index aea1daa682cfdf0dfaa9873e5226c2a99f693b62..d2abf9a2cc7d723d3eac2409c2c877a713500d35 100644 --- a/torch_npu/csrc/distributed/ParallelTcpServer.hpp +++ b/torch_npu/csrc/distributed/ParallelTcpServer.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "StoreMessagePacker.hpp" @@ -120,6 +121,8 @@ public: void WakeupWaitingClients(const std::string &key) noexcept; private: + static int CreateSocketWithFamily(const std::string host, uint16_t port, int family) noexcept; + static int CreateSocketAndListen(const ::addrinfo &addr) noexcept; static int CreateSocket(const std::string host, uint16_t port) noexcept; static int CreateLocalSocket(const std::string &localSocketPath) noexcept; diff --git a/torch_npu/csrc/distributed/ParallelTcpStore.cpp b/torch_npu/csrc/distributed/ParallelTcpStore.cpp index 59d07278e6eee4f9a875f17247340f8bea9fd2ed..3f47930fbaece27e0237b8e0f21d2b982db608c0 100644 --- a/torch_npu/csrc/distributed/ParallelTcpStore.cpp +++ b/torch_npu/csrc/distributed/ParallelTcpStore.cpp @@ -15,10 +15,10 @@ */ #include #include "ParallelTcpServer.hpp" -#include "ParallelTcpStore.hpp" #include "ParallelStoreProxy.hpp" #include "StoreClient.hpp" #include "torch_npu/csrc/core/npu/npu_log.h" +#include "ParallelTcpStore.hpp" namespace c10d { namespace torch_npu { @@ -474,7 +474,11 @@ void ParallelTcpStore::wait(const std::vector &keys, const std::chr { torch_npu::StoreMessage request{ torch_npu::MessageType::WAIT, 0, keys }; torch_npu::StoreMessage response; - client_->SetReceiveTimeout(timeout); + if (proxy_) { + proxy_->SetReceiveTimeout(timeout); + } else { + client_->SetReceiveTimeout(timeout); + } std::lock_guard lockGuard{ clientMutex_ }; DoWait(request, response); } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index cc076314f2d237173ce1c9f56bad4619979362f2..73ee79512160d1602bbb185bd88673894c4dcd97 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -4,11 +4,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include @@ -25,6 +25,7 @@ #include "third_party/acl/inc/acl/acl_base.h" #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" @@ -41,13 +42,13 @@ #include "torch_npu/csrc/distributed/HcclCompile.h" #include "torch_npu/csrc/distributed/TraceUtils.h" #include "torch_npu/csrc/distributed/PrefixStore.hpp" -#include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" #include "torch_npu/csrc/toolkit/profiler/common/utils.h" #include "torch_npu/csrc/framework/OpHook.h" #include "torch_npu/csrc/framework/FormatHelper.h" #include 
"torch_npu/csrc/framework/utils/OpPreparation.h" #include "torch_npu/csrc/profiler/npu_profiler.h" #include "torch_npu/csrc/logging/LogContext.h" +#include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" namespace py = pybind11; using namespace py::literals; @@ -56,12 +57,10 @@ namespace c10d_npu { namespace { static constexpr uint32_t kOpWaitTimeoutOffset = 30U; // second static uint32_t kOpWaitTimeout = 1868U; // second -static int32_t defaultExecTimeout = 1800; +static int32_t defaultExecTimeout = 1836; constexpr const char* P2P_DEVICE_KEY = "_p2p"; using hcclUs = std::chrono::steady_clock::time_point; -#define DURATION_US(x) (std::chrono::duration_cast(x)) -#define TIME_NOW() ({ std::chrono::steady_clock::now(); }) constexpr int32_t MAX_GROUP_NAME_LEN = 128; @@ -82,7 +81,7 @@ std::map unsupportedOp = { bool nslb_is_end = false; bool uce_error_flag = false; bool force_stop_error_flag = false; -char* nslb_path = c10_npu::option::OptionsManager::GetNslbPath(); +const char* nslb_path = c10_npu::option::OptionsManager::GetNslbPath(); bool status_save_enable = c10_npu::option::OptionsManager::CheckStatusSaveEnable(); std::string status_save_path = c10_npu::option::OptionsManager::GetStatusSavePath(); @@ -241,9 +240,8 @@ void syncStreams( c10_npu::NPUStream& hcclStream = hcclStreams[i]; c10_npu::NPUEvent& hcclEvent = hcclEvents[i]; hcclEvent.record(c10_npu::getCurrentNPUStream(devices[i].index())); - ASCEND_LOGI("Event: record hccl group is successfully executed, event=%p", hcclEvent.event()); hcclEvent.block(hcclStream); - ASCEND_LOGI("Event: block hccl group is successfully executed, event=%p", hcclEvent.event()); + ASCEND_LOGI("Event: record and block hccl group is successfully executed, event=%p", hcclEvent.event()); } } @@ -289,7 +287,16 @@ void getHcclCommConfig(HcclCommConfig* config, bool isP2P = false) } // Temporarily adding this logic to set deterministic states to avoid a known issues within HCCL. - config->hcclDeterministic = getDeterministicState() ? 1 : 0; + static const bool isCannVersionGteBase = []() { + const std::string baseCannversion = "8.2.RC1"; + const std::string baseCannModule = "CANN"; + return IsGteCANNVersion(baseCannversion, baseCannModule); + }(); + if (isCannVersionGteBase) { + config->hcclDeterministic = 0xffffffff; + } else { + config->hcclDeterministic = getDeterministicState() ? 
1 : 0; + } // Compatible with the size check of the old version of HCCL, forcibly convert // the config object to a size_t=32 object, and retain the N ± 2 version @@ -414,6 +421,7 @@ int ProcessGroupHCCL::deviceId_ = -1; int ProcessGroupHCCL::numRanks_ = -1; std::string ProcessGroupHCCL::exceptionMessage_ = ""; std::atomic ProcessGroupHCCL::shouldDump_(false); +std::atomic ProcessGroupHCCL::monitorThreadEnabled_(false); std::shared_ptr logger = npu_logging::logging().getLogger("torch.distributed"); std::string dump_hccl_trace( @@ -425,6 +433,12 @@ std::string dump_hccl_trace( c10::nullopt, includeCollectives, includeStackTraces, onlyActive); } +std::string dump_hccl_trace_json(bool includeCollectives, bool onlyActive) +{ + return HCCLTraceBuffer::get()->dump_json( + c10::nullopt, includeCollectives, onlyActive); +} + c10::optional)>> &get_cpp_trace_dumper() { static c10::optional< @@ -467,6 +481,10 @@ std::ostream& operator<<(std::ostream& output, const ProcessGroupHCCL::WorkHCCL& workHCCL.seq_, ", OpType=", opTypeToString(workHCCL.opType_), + ", NumelIn=", + workHCCL.numelIn_, + ", NumelOut=", + workHCCL.numelOut_, ", Timeout(ms)=", workHCCL.opTimeout_.count(), ")"); @@ -487,10 +505,10 @@ ProcessGroupHCCL::WorkHCCL::WorkHCCL( // Creates the npu event wrappers // Note: The actual events are lazily created when first recorded to with // DEFAULT_FLAGS = npuEventDisableTiming. - if (desyncDebug || (status_save_enable)) { + if (desyncDebug || (status_save_enable) || ProcessGroupHCCL::monitorThreadEnabled_.load()) { hcclStartEvents_ = std::make_shared>(); hcclStartEvents_->reserve(devices.size()); - for (int i = 0; i < devices.size(); i++) { + for (size_t i = 0; i < devices.size(); i++) { hcclStartEvents_->emplace_back(ACL_EVENT_CAPTURE_STREAM_PROGRESS); } } @@ -511,6 +529,8 @@ ProcessGroupHCCL::WorkHCCL::WorkHCCL(const WorkHCCL& w) workStartTime_(w.workStartTime_), seq_(w.seq_), startTraceUpdated_(w.startTraceUpdated_), + numelIn_(w.numelIn_), + numelOut_(w.numelOut_), store_(w.store_), is_dispatched(w.is_dispatched), is_reported(w.is_reported), @@ -659,7 +679,7 @@ bool ProcessGroupHCCL::WorkHCCL::checkTimeout(c10::optional 0) { - if (hccl_exec_timeout < dispatchTimeout_ + dispatchoffset && hccl_exec_timeout > mindispatchTimeout_ + dispatchoffset) { - dispatchTimeout_ = hccl_exec_timeout - dispatchoffset; + if (static_cast(hccl_exec_timeout) < dispatchTimeout_ + dispatchoffset && static_cast(hccl_exec_timeout) > mindispatchTimeout_ + dispatchoffset) { + dispatchTimeout_ = static_cast(hccl_exec_timeout) - dispatchoffset; }; }; ASCEND_LOGI("set dispatchTimeout_ %u s.", dispatchTimeout_); @@ -701,7 +721,7 @@ bool ProcessGroupHCCL::WorkHCCL::checkExec() static int32_t hccl_exec_timeout = c10_npu::option::OptionsManager::GetHCCLExecTimeout(); if (hccl_exec_timeout <= 0) { - hccl_exec_timeout = 1800; + hccl_exec_timeout = defaultExecTimeout; } int32_t timeout = std::max(60, hccl_exec_timeout - 60); auto currentTimepoint = std::chrono::steady_clock::now(); @@ -884,38 +904,32 @@ ProcessGroupHCCL::ProcessGroupHCCL( this->setGroupName(groupName); int32_t hccl_event_timeout = c10_npu::option::OptionsManager::GetHCCLEventTimeout(); int32_t hccl_exec_timeout = c10_npu::option::OptionsManager::GetHCCLExecTimeout(); + if (hccl_exec_timeout < 0) { + hccl_exec_timeout = defaultExecTimeout; + } + if (hccl_event_timeout > 0) { - if (hccl_exec_timeout < 0) { - if (hccl_event_timeout < defaultExecTimeout) { - TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than the default 
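Note: the `hcclDeterministic` change above gates on the installed CANN version exactly once per process via a function-local static. A small sketch of that gate, with the repo's `IsGteCANNVersion` helper stubbed out so the snippet stands alone:

```cpp
// Sketch only: evaluate an environment-dependent predicate once and reuse it.
#include <cstdint>
#include <string>

static bool IsGteCANNVersionStub(const std::string& /*version*/, const std::string& /*module*/)
{
    return true;  // placeholder: the real helper inspects the installed CANN package
}

uint32_t PickDeterministicConfig(bool deterministicState)
{
    // Function-local static: the version check runs once, on first use.
    static const bool cannSupportsAuto = []() {
        return IsGteCANNVersionStub("8.2.RC1", "CANN");
    }();
    // 0xffffffff defers the choice to newer HCCL; older HCCL still receives an
    // explicit 0/1 derived from the torch_npu deterministic flag.
    return cannSupportsAuto ? 0xffffffffU : (deterministicState ? 1U : 0U);
}
```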
value of HCCL_EXEC_TIMEOUT:", defaultExecTimeout, "."); - } - kOpWaitTimeout = hccl_event_timeout; + kOpWaitTimeout = static_cast(hccl_event_timeout); + if (hccl_event_timeout <= hccl_exec_timeout) { + TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than or equal to the value of HCCL_EXEC_TIMEOUT:", hccl_exec_timeout, "."); } else if (hccl_exec_timeout == 0) { - kOpWaitTimeout = 0; - TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than the value of HCCL_EXEC_TIMEOUT:", hccl_exec_timeout, ", so set op wait timeout to never timeout."); - } else { - kOpWaitTimeout = hccl_event_timeout; - if (hccl_event_timeout < hccl_exec_timeout) { - TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than the value of HCCL_EXEC_TIMEOUT:", hccl_exec_timeout, "."); - } + TORCH_NPU_WARN_ONCE("The value of HCCL_EXEC_TIMEOUT was set to 0(never timeout), so it is bigger than the value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, "."); } - } - if (hccl_event_timeout == 0) { + } else if (hccl_event_timeout == 0) { kOpWaitTimeout = 0; - } - if (hccl_event_timeout < 0) { + } else { if (hccl_exec_timeout == 0) { kOpWaitTimeout = 0; - } - if (hccl_exec_timeout > 0 && hccl_exec_timeout > kOpWaitTimeout) { - kOpWaitTimeout = hccl_exec_timeout + kOpWaitTimeoutOffset; - if (kOpWaitTimeout <= hccl_exec_timeout) { + } else { + kOpWaitTimeout = static_cast(hccl_exec_timeout) + kOpWaitTimeoutOffset; + if (kOpWaitTimeout <= static_cast(hccl_exec_timeout)) { kOpWaitTimeout = UINT_MAX; } } } + ASCEND_LOGI("Set op wait timeout to %d.", kOpWaitTimeout); - NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpWaitTimeout(kOpWaitTimeout)); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSetOpWaitTimeout(kOpWaitTimeout)); logPrefix_ = createLogPrefix(); if (options_->global_ranks_in_group.empty()) { numRanks_ = size_; @@ -935,7 +949,7 @@ ProcessGroupHCCL::ProcessGroupHCCL( PrefixStore *prefixStore = dynamic_cast(store_.get()); globalStore_ = prefixStore ? prefixStore->getUnderlyingNonPrefixStore() : store_; - char* blockingWait = getenv(HCCL_BLOCKING_WAIT); + const char* blockingWait = getenv(HCCL_BLOCKING_WAIT); try { if (blockingWait != nullptr) { auto val = std::stoi(blockingWait); @@ -976,22 +990,23 @@ ProcessGroupHCCL::ProcessGroupHCCL( #ifdef ENABLE_HCCL_ERROR_CHECKING if (asyncErrorHandling_ == TearDown) { - if (hccl_exec_timeout > 0) { - if ((hccl_exec_timeout * 1000) > (options_->timeout).count()) { - TORCH_NPU_WARN("The HCCL execution timeout ", hccl_exec_timeout * 1000, "ms is bigger than watchdog timeout ", - (options_->timeout).count(), "ms which is set by init_process_group! The plog may not be recorded."); + if ((options_->timeout).count() != DEFAULT_TIMEOUT) { + if ((options_->timeout).count() <= hccl_exec_timeout * 1000) { + TORCH_NPU_WARN("The watchdog timeout ", (options_->timeout).count(), "ms(which is set by init_process_group) is less than or equal to HCCL execution timeout ", + hccl_exec_timeout * 1000, "ms! The plog may not be recorded."); + } else if (hccl_exec_timeout == 0) { + TORCH_NPU_WARN("The HCCL execution timeout was set to 0(never timeout), so it is bigger than watchdog timeout ", + (options_->timeout).count(), "ms which is set by init_process_group! The plog may not be recorded. 
You can disable watchdog by 'export HCCL_ASYNC_ERROR_HANDLING=0'."); } - } else if (hccl_exec_timeout == 0) { - TORCH_NPU_WARN("The HCCL execution timeout was set to never timeout, so it is bigger than watchdog timeout ", - (options_->timeout).count(), "ms which is set by init_process_group! The plog may not be recorded. You can disable watchdog by 'export HCCL_ASYNC_ERROR_HANDLING=0'."); } else { - if ((options_->timeout).count() == DEFAULT_TIMEOUT) { - // Only when the timeout is default, we will change it. - options_->timeout = std::chrono::milliseconds(DEFAULT_TIMEOUT * 2); - } - if ((options_->timeout).count() < DEFAULT_TIMEOUT) { - TORCH_NPU_WARN("The HCCL execution timeout 1800000ms is bigger than watchdog timeout ", - (options_->timeout).count(), "ms which is set by init_process_group! The plog may not be recorded."); + if (hccl_exec_timeout == 0) { + options_->timeout = std::chrono::milliseconds(LLONG_MAX); + } else { + long long watchdog_timeout = (static_cast(hccl_exec_timeout) + 1800) * 1000; + if (watchdog_timeout <= static_cast(hccl_exec_timeout) * 1000) { + watchdog_timeout = LLONG_MAX; + } + options_->timeout = std::chrono::milliseconds(watchdog_timeout); } } } @@ -1085,7 +1100,7 @@ void ProcessGroupHCCL::waitForFutureOrTimeout( std::future &fut, const std::chrono::milliseconds &timeOutMilSec, const std::string &futDescription, - bool throwException) + bool throwException) const { std::string errorMsg; TORCH_CHECK(fut.valid(), "Expected a valid future"); @@ -1362,8 +1377,8 @@ void ProcessGroupHCCL::heartbeatMonitor() // somewhere else to avoid the deadlock. std::unique_lock lock(monitorMutex_); if (monitorWakeUpCV_.wait_for(lock, - std::chrono::milliseconds(monitorPollInterval), - [&]{ return terminateHeartbeatMonitorThread_.load(); })) { + std::chrono::milliseconds(monitorPollInterval), + [&]{ return terminateHeartbeatMonitorThread_.load(); })) { // For the normal complete or user interception, monitorWakeUpCV_ // will get notified, we early return and exit heartbeatMonitor. return; @@ -1390,9 +1405,9 @@ void ProcessGroupHCCL::heartbeatMonitor() "Received a dump signal from this local rank and will ", "start to dump the debug info. ", "Last enqueued HCCL work: ", - pgStatus_.lastEnqueuedSeq, + pgStatus_->lastEnqueuedSeq, ", last completed HCCL work: ", - pgStatus_.lastCompletedSeq, + pgStatus_->lastCompletedSeq, "."); exitMsg = c10::str( "ProcessGroupHCCL's watchdog detected an exception from the local rank. ", @@ -1408,7 +1423,7 @@ void ProcessGroupHCCL::heartbeatMonitor() // we haven't polled for `heartbeat_timeout` seconds and there haven't // any work added or removed for `watchdog_timeout` seconds. if (computeDeltaMS(lastWorkListUpdateTime_, currentTime) >= kWatchdogThreadSleepMillis && - computeDeltaMS(lastTimePollStore, currentTime) >= coordCheckIntervalMilSec_) { + computeDeltaMS(lastTimePollStore, currentTime) >= coordCheckIntervalMilSec_ && !hasGlobalDumped) { lastTimePollStore = currentTime; // Wrap globalStore_->check() in a try-catch block to avoid crashing if // the store is not available. @@ -1449,9 +1464,9 @@ void ProcessGroupHCCL::heartbeatMonitor() timeOutRank, ", and will start to dump the debug info. 
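Note: the rewritten constructor logic above derives the op-wait timeout from `HCCL_EVENT_TIMEOUT` and `HCCL_EXEC_TIMEOUT`, treating 0 as "never time out" and guarding the addition against wrap-around. A condensed sketch of just that derivation, using the constants visible in the diff (1836 s default exec timeout, 30 s offset) and dropping the warning messages:

```cpp
// Sketch only: derive the ACL op-wait timeout from the two HCCL env timeouts.
#include <climits>
#include <cstdint>

uint32_t DeriveOpWaitTimeout(int32_t eventTimeout, int32_t execTimeout)
{
    constexpr uint32_t kOffset = 30U;       // kOpWaitTimeoutOffset
    constexpr int32_t kDefaultExec = 1836;  // defaultExecTimeout
    if (execTimeout < 0) {
        execTimeout = kDefaultExec;         // unset -> default exec timeout
    }
    if (eventTimeout > 0) {
        return static_cast<uint32_t>(eventTimeout);   // explicit event timeout wins
    }
    if (eventTimeout == 0 || execTimeout == 0) {
        return 0;                           // 0 means "never time out"
    }
    uint32_t waitTimeout = static_cast<uint32_t>(execTimeout) + kOffset;
    if (waitTimeout <= static_cast<uint32_t>(execTimeout)) {
        waitTimeout = UINT_MAX;             // guard against unsigned wrap-around
    }
    return waitTimeout;
}
```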
", "Last enqueued HCCL work: ", - pgStatus_.lastEnqueuedSeq, + pgStatus_->lastEnqueuedSeq, ", last completed HCCL work: ", - pgStatus_.lastCompletedSeq, + pgStatus_->lastCompletedSeq, "."); exitMsg = c10::str( "ProcessGroupHCCL's watchdog detected a dump signal from rank ", @@ -1464,6 +1479,7 @@ void ProcessGroupHCCL::heartbeatMonitor() "bugs in the communications library (e.g. HCCL), etc. We tried our best to ", "dump the debug info into the storage to help you debug the issue."); dumpTraceAndResetStatus(); + hasGlobalDumped = true; } } } @@ -1573,9 +1589,8 @@ void ProcessGroupHCCL::heartbeatMonitor() void ProcessGroupHCCL::hcclCommWatchdog() { + c10_npu::SetThreadType(c10_npu::ThreadType::WATCHDOG_THREAD); try { - c10_npu::SetThreadName(c10_npu::ThreadType::hcclCommWatchdogThread); - VLOG(2) << "[Rank " << rank_ << "] HCCL watchdog thread started!"; if (monitorThreadEnabled_.load()) { hcclHeartbeatMonitorThread_ = std::thread(&ProcessGroupHCCL::heartbeatMonitor, this); @@ -1718,12 +1733,23 @@ void ProcessGroupHCCL::workCleanupLoop() work.checkAndSetException(); work.checkDispatch(); bool exec_timeout = work.checkExec(); - if (exec_timeout) { - if (!shouldDump_.load()) { + if (dumpOnException_ && exec_timeout) { + try { + auto rank = globalRank(); + auto vec = std::vector( + reinterpret_cast(&rank), + reinterpret_cast(&rank) + sizeof(rank)); + globalStore_->set(std::string(EXCEPTION_DUMP), vec); + if (!shouldDump_.load()) { + LOG(ERROR) << logPrefix() + << "First watchdog exec timeout to set the dump signal."; + } + shouldDump_.store(true); + } catch (const std::exception &e) { LOG(ERROR) << logPrefix() - << "First watchdog exec timeout to set the dump signal."; + << "Failed to set exec timeout dump signal in tcpstore. " + << "Error: " << e.what(); } - shouldDump_.store(true); } bool timedOut = work.checkTimeout(); @@ -1782,6 +1808,17 @@ void ProcessGroupHCCL::workCleanupLoop() } } + // a work could be started but not completed, so we should not update + // lastStartedSeq and lastStartedOpName if the work state is checked + // multiple times after the start + if (monitorThreadEnabled_.load() && pgStatus_->lastStartedSeq < static_cast(work.seq_) && + work.isStarted()) { + pgStatus_->lastStartedSeq = static_cast(work.seq_); + pgStatus_->lastStartedWorkName = opTypeToString(work.opType_); + pgStatus_->lastStartedNumelIn = work.numelIn_; + pgStatus_->lastStartedNumelOut = work.numelOut_; + } + // Clean up completed work if (work.isCompleted()) { if (*(work.is_dispatched) && work.is_reported) { @@ -1789,14 +1826,18 @@ void ProcessGroupHCCL::workCleanupLoop() work.is_reported = false; } if (status_save_enable) { - refreshStatusInfo(work, "end"); // Update Statusinfo,but not write into the map + is_refreshed = refreshStatusInfo(work, "end"); // Update Statusinfo,but not write into the map } + pgStatus_->lastCompletedSeq = static_cast(work.seq_); + pgStatus_->lastCompletedWorkName = opTypeToString(work.opType_); + pgStatus_->lastCompletedNumelIn = work.numelIn_; + pgStatus_->lastCompletedNumelOut = work.numelOut_; HCCLTraceBuffer::get()->retire_id(work.trace_id_, true); it = workMetaList_.erase(it); c10_npu::NPUGraph::dec_pending_event_queries(); } else { if (status_save_enable && work.isStarted()) { - refreshStatusInfo(work, "start"); // Update Statusinfo,but not write into the map + is_refreshed = refreshStatusInfo(work, "start"); // Update Statusinfo,but not write into the map } // Increment the iterator if the current WorkHCCL object is not // completed. 
@@ -1809,6 +1850,10 @@ void ProcessGroupHCCL::workCleanupLoop() } } + if (status_save_enable && is_refreshed) { + updateStatusOutput(); + } + if (recordflag && recordHcclStatus(status_save_path)) { lastrecordtime = std::chrono::steady_clock::now(); } @@ -1955,8 +2000,11 @@ void ProcessGroupHCCL::recordDataVol(std::string opName, const std::string dataV outfile.close(); } -void ProcessGroupHCCL::refreshStatusInfo(ProcessGroupHCCL::WorkHCCL work, std::string status) +bool ProcessGroupHCCL::refreshStatusInfo(ProcessGroupHCCL::WorkHCCL work, std::string status) { + if (StatusInfo.seq == work.seq_ && StatusInfo.status == status) { + return false; + } StatusInfo.seq = work.seq_; StatusInfo.pgId = options_->group_id; StatusInfo.opType = opTypeToString(work.opType_); @@ -1969,19 +2017,21 @@ void ProcessGroupHCCL::refreshStatusInfo(ProcessGroupHCCL::WorkHCCL work, std::s StatusInfo.commIds = "all"; } StatusInfo.status = status; + return true; } void ProcessGroupHCCL::updateStatusOutput() { + std::unique_lock lock(StatusMapmutex_); if (!StatusInfo.pgId.empty()) { StatusOutput_[options_->group_id] = StatusInfo; } + is_refreshed = false; } bool ProcessGroupHCCL::recordHcclStatus(const std::string path, bool end, bool error) { std::unique_lock lock(StatusMapmutex_); - updateStatusOutput(); if (!options_->global_ranks_in_group.empty() && !error) { return true; } else if (!StatusOutput_.empty()) { @@ -2023,7 +2073,7 @@ bool ProcessGroupHCCL::recordHcclStatus(const std::string path, bool end, bool e outfile << ", {"; } outfile << "\"seq\":" << info->second.seq << ", \"op_type\":\"" << info->second.opType; - outfile << "\", \"pg_id\":" << info->second.pgId << ", \"comm_ids\":\"" << info->second.commIds; + outfile << "\", \"pg_id\":\"" << info->second.pgId << "\", \"comm_ids\":\"" << info->second.commIds; outfile << "\", \"status\":\""<< info->second.status << "\"}"; first_op = false; } @@ -2115,6 +2165,7 @@ void ProcessGroupHCCL::createHCCLComm( broadcastMasterID(&hcclID, isSingleP2POp, devicesKey, p2pRank); c10_npu::OptionalNPUGuard npuGuard; + auto startTime = std::chrono::steady_clock::now(); for (size_t i = 0; i < devices.size(); ++i) { int numRanks = getSize(); int rank = getRank() * static_cast(devices.size()) + static_cast(i); @@ -2131,12 +2182,15 @@ void ProcessGroupHCCL::createHCCLComm( config = createHcclCommConfigWithOptions(); hcclComms[i] = HCCLComm::create_config(numRanks, rank, hcclID, &config); } + hcclComms[i]->hcclCommType = static_cast(HcclCommType::DEFAULT); break; case HcclCommType::P2P: // P2P not support set hcclCommName numRanks = 2; rank = p2pRank; getHcclCommConfig(&config, true); hcclComms[i] = HCCLComm::create_config(numRanks, rank, hcclID, &config); + hcclComms[i]->hcclCommType = static_cast(HcclCommType::P2P); + hcclComms[i]->p2pPeer = getP2pPeer(); break; default: throw std::runtime_error( @@ -2147,6 +2201,10 @@ void ProcessGroupHCCL::createHCCLComm( // Creates the HCCL streams streamVal.push_back(getNPUStreamByCurrentType(devices[i].index())); } + auto endTime = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast(endTime - startTime); + logger->info("Create hccl comm by hcclCommInitRootInfoConfig success, group id is %s, commType is %d, use %d ms.", + options_->group_id.c_str(), static_cast(commType), timeElapsed.count()); } bool ProcessGroupHCCL::createHCCLCommEx( @@ -2195,6 +2253,7 @@ bool ProcessGroupHCCL::createHCCLCommEx( auto endTime = std::chrono::steady_clock::now(); auto timeElapsed = std::chrono::duration_cast(endTime - startTime); 
ASCEND_LOGI("Create global hccl comm with ranktable success, take %d milliseconds", timeElapsed.count()); + logger->info("Create global hccl comm with ranktable success, take %d milliseconds", timeElapsed.count()); return true; } @@ -2250,6 +2309,8 @@ bool ProcessGroupHCCL::createHCCLCommEx( auto subTimeElapsed = std::chrono::duration_cast(subEndTime - subStartTime); ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig success, group id is %s, subCommId is %llu, use %d ms.", options_->group_id.c_str(), hcclid, subTimeElapsed.count()); + logger->info("Create sub hccl comm by hcclCreateSubCommConfig success, group id is %s, subCommId is %llu, use %d ms.", + options_->group_id.c_str(), hcclid, subTimeElapsed.count()); return true; } @@ -2357,7 +2418,7 @@ int64_t ProcessGroupHCCL::getStreamId(bool p2p, int peer) TORCH_CHECK(peer >= 0, "In p2p scenarios, the passed 'dst rank id' is error.", DIST_ERROR(ErrCode::PARAM)); key = getKeySendRecv(rank_, peer); } - if ((!hcclStreams_.count(key)) || hcclStreams_[key].empty()) { + if ((hcclStreams_.count(key) == 0) || hcclStreams_[key].empty()) { return -1; } return hcclStreams_[key][0].id(); @@ -2609,6 +2670,8 @@ c10::intrusive_ptr ProcessGroupHCCL::initWork( outputs, desyncDebug_ ? &((*(r->hcclStartEvents_))[0]) : nullptr, &((*(r->hcclEndEvents_))[0]), + options_->timeout, + pgStatus_, isP2P); } return r; @@ -2638,6 +2701,11 @@ void ProcessGroupHCCL::workEnqueue(c10::intrusive_ptrlastEnqueuedSeq = work->seq_; + pgStatus_->lastEnqueuedWorkName = opTypeToString(work->opType_); + pgStatus_->lastEnqueuedNumelIn = work->numelIn_; + pgStatus_->lastEnqueuedNumelOut = work->numelOut_; } } @@ -2690,6 +2758,93 @@ void ProcessGroupHCCL::resumeHcclComm(int device_id) ASCEND_LOGI("resumeHcclComm success, group id is %s.", options_->group_id.c_str()); } +bool ProcessGroupHCCL::setCommWorkingDevNic( + const HcclComm& comm, + int nranks, + std::vector& ranks, + std::vector& useBackup, + int rankid, + int hcclCommType, + int p2pPeer) +{ + HcclComm sendComm = comm; + uint32_t sendnRank = 0; + std::vector sendRanks; + std::vector sendUseBackup; + if (hcclCommType == 1) { + int p2pRank = rankid <= p2pPeer ? 0 : 1; + bool isSendRecvSelf = rank_ == p2pPeer; + int p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank; + for (int i = 0; i < nranks; i++) { + if (ranks[i] == rankid) { + sendRanks.push_back(p2pRank); + sendUseBackup.push_back(useBackup[i]); + sendnRank++; + } + if (ranks[i] == p2pTargetRank) { + sendRanks.push_back(p2pTargetRank); + sendUseBackup.push_back(useBackup[i]); + sendnRank++; + } + } + } else { + for (int i = 0; i < nranks; i++) { + uint32_t localrank = 0; + for (uint32_t val : groupRanks()) { + if (ranks[i] == val) { + sendRanks.push_back(localrank); + sendUseBackup.push_back(useBackup[i]); + sendnRank++; + break; + } + localrank++; + } + } + } + if (sendnRank == 0) { + return true; + } + bool useBackupArr[sendUseBackup.size()]; + uint32_t sendRanksArr[sendRanks.size()]; + for (size_t i = 0; i < sendnRank; i++) { + useBackupArr[i] = sendUseBackup[i]; + sendRanksArr[i] = sendRanks[i]; + } + auto ret = hcclCommWorkingDevNicSet(sendComm, sendRanksArr, useBackupArr, sendnRank); + if (ret != HCCL_SUCCESS) { + ASCEND_LOGI("Fail to hcclCommWorkingDevNicSet"); + return false; + } + return true; +} + +bool ProcessGroupHCCL::setSwitchNicComm(int rankid, int nranks, std::vector& ranks, std::vector& useBackup) +{ + if (!hcclCommWorkingDevNicSetExist()) { + ASCEND_LOGI("The hcclCommWorkingDevNicSet does not exist. 
Skip it."); + return true; + } + at::Device device = getDeviceForRank(rankid); + std::vector devices = {device}; + auto key = getKeyFromDevices(devices); + { + std::lock_guard lock(mutex_); + if (devHCCLCommMap_.find(key) != devHCCLCommMap_.end()) { + auto& hcclComms = devHCCLCommMap_[key]; + for (auto& hcclComm : hcclComms) { + HcclComm comm = hcclComm->getHcclComm(); + bool result = setCommWorkingDevNic(comm, nranks, ranks, useBackup, rankid, hcclComm->hcclCommType, hcclComm->p2pPeer); + if (!result) { + return false; + } + } + } else { + return true; + } + } + ASCEND_LOGI("Succeed to hcclCommWorkingDevNicSet"); + return true; +} void ProcessGroupHCCL::clearWorkMetaList() { @@ -2766,7 +2921,7 @@ std::string ProcessGroupHCCL::getHcclCommName(int rankid, bool init_comm) return std::string(commName); } -std::string ProcessGroupHCCL::getHcclCommNameWithoutInit(std::vector>& hcclComms) +std::string ProcessGroupHCCL::getHcclCommNameWithoutInit(std::vector>& hcclComms) const { TORCH_CHECK(hcclComms.size() == 1, "expect hcclComms.size() = 1, but hcclComms.size() = ", hcclComms.size(), DIST_ERROR(ErrCode::VALUE)); @@ -2863,34 +3018,17 @@ void ProcessGroupHCCL::silenceCheck(at::Tensor &input, c10d::OpType opType) return; } } - if (c10_npu::opapi::IsExistAclnnSilentCheckV2()) { - at::Tensor val = at::norm(input, std::numeric_limits::infinity()).pow(2).view(-1); - at::Tensor max; - if (silenceCheckCache_.find(opType) == silenceCheckCache_.end()) { - at::Tensor stepTensor = at::zeros({1}, input.options().dtype(at::kLong)); - at::Tensor avg = input.detach().pow(2).max().view(-1); - max = avg; - silenceCheckCache_.emplace(opType, std::make_pair(std::move(stepTensor), std::move(avg))); - } else { - max = val; - } - static double beta1 = 0.99; - op_plugin::_npu_silent_check_v3(val, input, silenceCheckCache_[opType].first, max, silenceCheckCache_[opType].second, - c10_npu::option::OptionsManager::GetSilenceUpperThresh().first, c10_npu::option::OptionsManager::GetSilenceUpperThresh().second, - beta1, static_cast(c10_npu::option::OptionsManager::GetSilenceCheckFlag())); - } else { - if (silenceCheckCache_.find(opType) == silenceCheckCache_.end()) { - at::Tensor stepTensor = at::zeros({1}, input.options().dtype(at::kLong)); - at::Tensor cacheTensor = at::zeros({3}, input.options().dtype(at::kFloat)); - silenceCheckCache_.emplace(opType, std::make_pair(std::move(stepTensor), std::move(cacheTensor))); - } - at::Tensor val = at::norm(input); - static double min_steps = 100.0; - op_plugin::_npu_silent_check_v2(val, input, silenceCheckCache_[opType].second, silenceCheckCache_[opType].first, min_steps, - c10_npu::option::OptionsManager::GetSilenceUpperThresh().first, c10_npu::option::OptionsManager::GetSilenceSigmaThresh().first, - c10_npu::option::OptionsManager::GetSilenceUpperThresh().second, c10_npu::option::OptionsManager::GetSilenceSigmaThresh().second, - static_cast(c10_npu::option::OptionsManager::GetSilenceCheckFlag())); + if (silenceCheckCache_.find(opType) == silenceCheckCache_.end()) { + at::Tensor stepTensor = at::zeros({1}, input.options().dtype(at::kLong)); + at::Tensor cacheTensor = at::zeros({3}, input.options().dtype(at::kFloat)); + silenceCheckCache_.emplace(opType, std::make_pair(std::move(stepTensor), std::move(cacheTensor))); } + at::Tensor val = at::norm(input); + static double min_steps = 100.0; + op_plugin::_npu_silent_check_v2(val, input, silenceCheckCache_[opType].second, silenceCheckCache_[opType].first, min_steps, + c10_npu::option::OptionsManager::GetSilenceUpperThresh().first, 
c10_npu::option::OptionsManager::GetSilenceSigmaThresh().first, + c10_npu::option::OptionsManager::GetSilenceUpperThresh().second, c10_npu::option::OptionsManager::GetSilenceSigmaThresh().second, + static_cast(c10_npu::option::OptionsManager::GetSilenceCheckFlag())); } HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() @@ -2898,9 +3036,11 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() HcclCommConfig config; getHcclCommConfig(&config); - // update group name in hccl comm config - std::string groupName = getGroupName(); - torch_npu::toolkit::profiler::Utils::safe_strcpy_s(config.hcclCommName, groupName.c_str(), COMM_NAME_MAX_LENGTH); + if (isHcclFeatureSupported(HcclCommConfigCapability::HCCL_COMM_CONFIG_COMM_NAME)) { + // Update group name in hccl comm config when this capability is supported. + std::string groupName = getGroupName(); + torch_npu::toolkit::profiler::Utils::safe_strcpy_s(config.hcclCommName, groupName.c_str(), COMM_NAME_MAX_LENGTH); + } if (options_->hccl_config.empty()) { return config; @@ -2916,13 +3056,13 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() if (options_->hccl_config.find("group_name") != options_->hccl_config.end()) { if (std::holds_alternative(options_->hccl_config["group_name"])) { - auto groupName = std::get(options_->hccl_config["group_name"]); - uint32_t udiLength = groupName.length(); - if (groupName.length() >= UDI_MAX_LENGTH) { + auto hcclGroupName = std::get(options_->hccl_config["group_name"]); + uint32_t udiLength = hcclGroupName.length(); + if (hcclGroupName.length() >= UDI_MAX_LENGTH) { udiLength = UDI_MAX_LENGTH - 1; TORCH_NPU_WARN("The length of group_name has exceeded the limit UDI_MAX_LENGTH which will be truncated to UDI_MAX_LENGTH - 1."); } - strncpy(config.hcclUdi, groupName.c_str(), udiLength); + strncpy(config.hcclUdi, hcclGroupName.c_str(), udiLength); config.hcclUdi[udiLength] = '\0'; } else { TORCH_CHECK(false, "Value type of group_name should be string.", DIST_ERROR(ErrCode::TYPE)); @@ -3015,7 +3155,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( for (auto tensor:inputs) { dataVol += tensor.storage().nbytes(); } - char* global_rank = getenv("RANK"); + const char* global_rank = getenv("RANK"); TORCH_CHECK(global_rank != nullptr, "Unable to fetch global rank for NSLB.", DIST_ERROR(ErrCode::NOT_FOUND)); recordDataVol(opTypeToString(opType), std::to_string(dataVol), strtol(global_rank, nullptr, 10), hcclComms); } @@ -3085,7 +3225,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( // to avoid to much task pushed to the stream, leading to stream overflow // insert sync point fluxLimit(key, i) c10_npu::NPUStream& hcclStream = hcclStreams[i]; - hcclUs startut = TIME_NOW(); + hcclUs startut = std::chrono::steady_clock::now(); HCCL_CHECK_ERROR(fn(inputs[i], outputs[i], hcclComms[i]->getHcclComm(), hcclStream, work->is_dispatched), opTypeToString(opType).c_str()); if (c10_npu::option::OptionsManager::GetMultiStreamMemoryReuse() == c10_npu::option::ERASE_RECORD_STREAM) { work->recorded_outputs_.push_back( @@ -3113,6 +3253,16 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( work->blockingWait_ = blockingWait_; work->opTimeout_ = options_->timeout; work->store_ = store_; + // Record size info for debug. 
We only record the size on the first device as + // multi-device per process is deprecated + work->numelIn_ = 0; + work->numelOut_ = 0; + for (const auto& input : inputs) { + work->numelIn_ += input.numel(); + } + for (const auto& output : outputs) { + work->numelOut_ += output.numel(); + } c10_npu::NPUGraph::inc_pending_event_queries(); if (asyncErrorHandling_ != NoHandling && capture_status == c10_npu::CaptureStatus::None) { workEnqueue(work); @@ -3176,7 +3326,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collectiveCoalesced( for (auto tensor:inputs) { dataVol += tensor.storage().nbytes(); } - char* global_rank = getenv("RANK"); + const char* global_rank = getenv("RANK"); TORCH_CHECK(global_rank != nullptr, "Unable to fetch global rank for NSLB.", DIST_ERROR(ErrCode::NOT_FOUND)); recordDataVol(opTypeToString(opType), std::to_string(dataVol), strtol(global_rank, nullptr, 10), hcclComms); } @@ -3246,7 +3396,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collectiveCoalesced( // to avoid to much task pushed to the stream, leading to stream overflow // insert sync point fluxLimit(key, i) c10_npu::NPUStream& hcclStream = hcclStreams[0]; - hcclUs startut = TIME_NOW(); + hcclUs startut = std::chrono::steady_clock::now(); HCCL_CHECK_ERROR(fn(inputs[i], outputs[i], hcclComms[0]->getHcclComm(), hcclStream, work->is_dispatched), opTypeToString(opType).c_str()); if (c10_npu::option::OptionsManager::GetMultiStreamMemoryReuse() == c10_npu::option::ERASE_RECORD_STREAM) { work->recorded_outputs_.push_back( @@ -3273,6 +3423,10 @@ c10::intrusive_ptr ProcessGroupHCCL::collectiveCoalesced( work->blockingWait_ = blockingWait_; work->opTimeout_ = options_->timeout; work->store_ = store_; + // Record size info for debug. We only record the size on the first device as + // multi-device per process is deprecated + work->numelIn_ = inputs[0].numel(); + work->numelOut_ = outputs[0].numel(); c10_npu::NPUGraph::inc_pending_event_queries(); if (asyncErrorHandling_ != NoHandling && capture_status == c10_npu::CaptureStatus::None) { workEnqueue(work); @@ -3306,6 +3460,7 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( p2pRank = rank_ <= peer ? 0 : 1; isSendRecvSelf = rank_ == peer; p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank; + setP2pPeer(peer); hcclComms = getHCCLComm(key, devices, HcclCommType::P2P, nullptr, p2pRank); } else { p2pTargetRank = peer; @@ -3357,7 +3512,7 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( for (auto tensor : tensors) { dataVol += tensor.storage().nbytes(); } - char* global_rank = getenv("RANK"); + const char* global_rank = getenv("RANK"); TORCH_CHECK(global_rank != nullptr, "Unable to fetch global rank for NSLB.", DIST_ERROR(ErrCode::NOT_FOUND)); recordDataVol(opTypeToString(opType), std::to_string(dataVol), strtol(global_rank, nullptr, 10), hcclComms); @@ -3425,7 +3580,7 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( // to avoid to much task pushed to the stream, leading to stream overflow // insert sync point fluxLimit(key, i) c10_npu::NPUStream& hcclStream = hcclStreams_[key][i]; - hcclUs startut = TIME_NOW(); + hcclUs startut = std::chrono::steady_clock::now(); HCCL_CHECK_ERROR(fn(tensors[i], hcclComms[i]->getHcclComm(), hcclStream, work->is_dispatched, p2pTargetRank), opTypeToString(opType).c_str()); } } @@ -3450,6 +3605,9 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( work->blockingWait_ = blockingWait_; work->opTimeout_ = options_->timeout; work->store_ = store_; + // Record size info for debug. 
We only record the size on the first device + // as multi-device per process is deprecated + work->numelIn_ = work->numelOut_ = tensors[i].numel(); } c10_npu::NPUGraph::inc_pending_event_queries(); @@ -3529,7 +3687,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclAllreduce", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclAllreduce", hccl_call); return HCCL_SUCCESS; }, @@ -3605,7 +3763,7 @@ c10::intrusive_ptr ProcessGroupHCCL::batch_isend_irecv( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclBatchSendRecv", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclBatchSendRecv", hccl_call); return HCCL_SUCCESS; }, [&](std::vector& hcclStreams, c10::intrusive_ptr&) { @@ -3653,7 +3811,7 @@ c10::intrusive_ptr ProcessGroupHCCL::broadcast( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclBroadcast", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclBroadcast", hccl_call); return HCCL_SUCCESS; }, @@ -3771,7 +3929,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclReduce", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclReduce", hccl_call); return HCCL_SUCCESS; }, @@ -3831,7 +3989,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclReduce", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclReduce", hccl_call); return HCCL_SUCCESS; }, @@ -3862,7 +4020,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( } constexpr int64_t ADDRESS_ALIGNMENT_BYTE = 512; -at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) +at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) const { at::Tensor inter_tensors = at::reshape(tensors, {1, tensors.numel()}); if (tensors.element_size() == 0) { @@ -3908,7 +4066,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base_uneven( check_split_sizes(inputSplitSizes, inputTensor, size_); int inputSize = static_cast(inputSplitSizes.size()); - int inputRowSize = static_cast(inputTensor.size(0) ? inputTensor.numel() / inputTensor.size(0) : 1); + int inputRowSize = static_cast(inputTensor.size(0) != 0 ? inputTensor.numel() / inputTensor.size(0) : 1); std::vector inputCounts; std::vector inputSpl; inputSpl.push_back(0); @@ -3962,7 +4120,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base_uneven( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclReduceScatterV", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclReduceScatterV", hccl_call); return HCCL_SUCCESS; }, @@ -3996,7 +4154,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base_uneven( check_split_sizes(outputSplitSizes, outputTensor, size_); int outputSize = static_cast(outputSplitSizes.size()); - int outputRowSize = static_cast(outputTensor.size(0) ? outputTensor.numel() / outputTensor.size(0) : 1); + int outputRowSize = static_cast(outputTensor.size(0) != 0 ? 
outputTensor.numel() / outputTensor.size(0) : 1); std::vector outputCounts; std::vector outputSpl; outputSpl.push_back(0); @@ -4047,7 +4205,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base_uneven( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclAllGatherV", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclAllGatherV", hccl_call); return HCCL_SUCCESS; }, @@ -4110,7 +4268,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclAllgather", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclAllgather", hccl_call); return HCCL_SUCCESS; }, @@ -4197,7 +4355,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclAllGatherV", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclAllGatherV", hccl_call); return HCCL_SUCCESS; }, @@ -4353,7 +4511,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather_togather( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclAllGather", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclAllGather", hccl_call); return HCCL_SUCCESS; }, @@ -4404,7 +4562,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclAllGather", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclAllGather", hccl_call); return HCCL_SUCCESS; }, @@ -4452,7 +4610,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclReduceScatter", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclReduceScatter", hccl_call); return HCCL_SUCCESS; }, @@ -4543,7 +4701,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclReduceScatterV", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclReduceScatterV", hccl_call); return HCCL_SUCCESS; }, @@ -4568,6 +4726,12 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( at::Tensor output_tensor_reshape = at::reshape(outputFlattened[i], outputTensors[i].sizes()); outputTensors[i].copy_(output_tensor_reshape, true); } + if (opts.reduceOp == c10d::ReduceOp::AVG) { + c10_npu::NPUStreamGuard guard(hcclStreams[0]); + for (auto& tensor : outputTensors) { + tensor.div_(getSize()); + } + } }, c10d::OpType::REDUCE_SCATTER); } else { @@ -4642,7 +4806,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclReduceScatter", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclReduceScatter", hccl_call); return HCCL_SUCCESS; }, @@ -4818,7 +4982,7 @@ c10::intrusive_ptr ProcessGroupHCCL::scatter( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclScatter", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclScatter", hccl_call); return HCCL_SUCCESS; }, @@ -4866,11 +5030,11 @@ c10::intrusive_ptr ProcessGroupHCCL::send(std::vector& t torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclSend", numel, hcclType, comm, streamId, -1, dst_rank), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); - auto hccl_result = HcclSend(inputDataPtr, numel, hcclType, dst_rank, comm, stream.stream(false)); + auto hccl_result = HcclSend(inputDataPtr, numel, hcclType, 
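Note: the reduce_scatter path above now emulates `ReduceOp::AVG` by dividing each output tensor by the group size after the collective; in the real code this runs under an `NPUStreamGuard` on the collective's stream, assuming the underlying HCCL op performed a SUM. The core of that post-step, reduced to a sketch:

```cpp
// Sketch only: AVG = SUM followed by an in-place divide by the group size.
#include <ATen/ATen.h>
#include <vector>

void AverageAfterSum(std::vector<at::Tensor>& outputs, int64_t groupSize)
{
    for (auto& tensor : outputs) {
        tensor.div_(groupSize);  // element-wise, in place, on the current stream
    }
}
```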
static_cast(dst_rank), comm, stream.stream(false)); *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclSend", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclSend", hccl_call); return HCCL_SUCCESS; }, @@ -4901,11 +5065,11 @@ c10::intrusive_ptr ProcessGroupHCCL::recv(std::vector& t torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclRecv", numel, hcclType, comm, streamId, src_rank, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); - auto hccl_result = HcclRecv(outputDataPtr, numel, hcclType, src_rank, comm, stream.stream(false)); + auto hccl_result = HcclRecv(outputDataPtr, numel, hcclType, static_cast(src_rank), comm, stream.stream(false)); *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclRecv", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclRecv", hccl_call); return HCCL_SUCCESS; }, @@ -5011,7 +5175,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclAlltoAll", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclAlltoAll", hccl_call); return HCCL_SUCCESS; }, [&](std::vector&, c10::intrusive_ptr&) {}, @@ -5051,8 +5215,8 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( int inputSize = static_cast(inputSplitSizes.size()); int outSize = static_cast(outputSplitSizes.size()); - int inputRowSize = static_cast(inputTensor.size(0) ? inputTensor.numel() / inputTensor.size(0) : 1); - int outputRowSize = static_cast(outputTensor.size(0) ? outputTensor.numel() / outputTensor.size(0) : 1); + int inputRowSize = static_cast(inputTensor.size(0) != 0 ? inputTensor.numel() / inputTensor.size(0) : 1); + int outputRowSize = static_cast(outputTensor.size(0) != 0 ? outputTensor.numel() / outputTensor.size(0) : 1); std::vector inputCounts; std::vector inputSpl; std::vector outputCounts; @@ -5113,7 +5277,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclAlltoAllV", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclAlltoAllV", hccl_call); return HCCL_SUCCESS; }, @@ -5240,7 +5404,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall( *is_dispatched = true; return hccl_result; }; - at_npu::native::OpCommand::RunOpApi("HcclAlltoAllV", hccl_call); + at_npu::native::OpCommand::RunOpApiV2("HcclAlltoAllV", hccl_call); return HCCL_SUCCESS; }, diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index a6f7f02b2c8b28a09ecc2682b2fb1c138adf800e..4021373b52b42290db011dc93094df4784e99842 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -69,6 +69,37 @@ static std::vector TORCH_HCCL_HEARTBEAT_TIMEOUT_SEC = { static std::vector TORCH_HCCL_COORD_CHECK_MILSEC = { "TORCH_HCCL_COORD_CHECK_MILSEC"}; +// A struct to hold the latest status of the process group. 
+struct ProcessGroupStatus { + // the sequential number of the last collective enqueued into workMetaList_ + // This is useful for indentifying a rank that has not join a collective + // initialized to be -1 to indicate no collective has been enqueued + int64_t lastEnqueuedSeq{-1}; + // the sequential number of the last collective started as the kernel + int64_t lastStartedSeq{-1}; + // the sequential number of the last colletive completed marked by + // the watchdog thread + // initialized to be -1 to indicate no collective has been completed + int64_t lastCompletedSeq{-1}; + + // the name of the last collective enqueued into workMetaList_ + std::string lastEnqueuedWorkName; + // the name of the last collective started as the kernel + std::string lastStartedWorkName; + // the name of the last collective completed + std::string lastCompletedWorkName; + + // the sizes of the last work enqueued + size_t lastEnqueuedNumelIn; + size_t lastEnqueuedNumelOut; + // the sizes of the last work completed + size_t lastCompletedNumelIn; + size_t lastCompletedNumelOut; + // the sizes of the last work started + size_t lastStartedNumelIn; + size_t lastStartedNumelOut; +}; + struct DumpPipe { DumpPipe(int rank) { @@ -177,7 +208,7 @@ enum class WatchdogStatus { // // // Now continue on other work in the current stream. -class ProcessGroupHCCL : public c10d::Backend { +class C10_NPU_API ProcessGroupHCCL : public c10d::Backend { public: class WorkHCCL : public c10d::Work, public std::enable_shared_from_this { public: @@ -283,6 +314,11 @@ public: // This will be used by desync debug. bool startTraceUpdated_{false}; + // Record collective sizes for debug. We only record the size on the first + // device as multi-device per process is deprecated + size_t numelIn_ = -1; + size_t numelOut_ = -1; + // Wrapper method for the static checkForHCCLErrors which can be overridden // for tests. virtual std::exception_ptr checkForHCCLErrors( @@ -342,10 +378,10 @@ public: // return intrusive_ptr of the object static c10::intrusive_ptr create( - bool is_high_priority_stream = false, + bool _is_high_priority_stream = false, std::chrono::milliseconds timeout = kNoTimeout) { - return c10::make_intrusive(is_high_priority_stream); + return c10::make_intrusive(_is_high_priority_stream); } std::unordered_map> hccl_config; @@ -359,34 +395,6 @@ public: std::string group_id; }; - // A struct to hold the latest status of the process group. 
- struct ProcessGroupStatus { - // the sequential number of the last collective enqueued into workMetaList_ - // This is useful for indentifying a rank that has not join a collective - // initialized to be -1 to indicate no collective has been enqueued - int64_t lastEnqueuedSeq{-1}; - // the sequential number of the last collective started as the kernel - int64_t lastStartedSeq{-1}; - // the sequential number of the last colletive completed marked by - // the watchdog thread - // initialized to be -1 to indicate no collective has been completed - int64_t lastCompletedSeq{-1}; - - // the name of the last collective enqueued into workMetaList_ - std::string lastEnqueuedWorkName; - // the name of the last collective started as the kernel - std::string lastStartedWorkName; - // the name of the last collective completed - std::string lastCompletedWorkName; - - // the sizes of the last work enqueued - size_t lastEnqueuedNumelIn; - size_t lastEnqueuedNumelOut; - // the sizes of the last work completed - size_t lastCompletedNumelIn; - size_t lastCompletedNumelOut; - }; - // If you wish to create multiple process groups, each with a potentially // different rank and size, you can do so by passing a new store instance // to each one. If you have only a single store object, you can @@ -455,7 +463,7 @@ public: std::vector& tensors, std::vector remote_rank_list); - at::Tensor byte_alignment(at::Tensor& tensors); + at::Tensor byte_alignment(at::Tensor& tensors) const; c10::intrusive_ptr _reduce_scatter_base_uneven( at::Tensor& outputTensor, @@ -563,6 +571,17 @@ public: void resumeHcclComm(int device_id); + bool setCommWorkingDevNic( + const HcclComm& comm, + int nranks, + std::vector& ranks, + std::vector& useBackup, + int rankid, + int hcclCommType, + int p2pPeer); + + bool setSwitchNicComm(int rankid, int nranks, std::vector& ranks, std::vector& useBackup); + void setWatchdogStatus(int status); void clearWorkMetaList(); @@ -579,7 +598,7 @@ public: void abortAndClearHcclComm(c10::optional abortReason); - std::string getHcclCommNameWithoutInit(std::vector>& hcclComms); + std::string getHcclCommNameWithoutInit(std::vector>& hcclComms) const; // Return the global ranks of a PG const std::vector& groupRanks() const; @@ -644,7 +663,17 @@ protected: { return pg_desc_; } - + + void setP2pPeer(int newPeer) + { + peer_ = newPeer; + } + + const int getP2pPeer() const + { + return peer_; + } + // In the timeout case and we will dump debug info such as the NCCL flight // recorder to storage. Down the road, if we have more complicated or blocking // operations, we might need to use a side thread to do it. @@ -668,7 +697,7 @@ protected: std::future& fut, const std::chrono::milliseconds& timeOutMilSec, const std::string& futDescription, - bool throwException = false); + bool throwException = false) const; static const int64_t kWatchdogThreadSleepMillis; @@ -742,7 +771,7 @@ protected: int hcclTraceBufferSize_; // We gate the heartbeat monitor thread so that we can roll it out gradually. - std::atomic monitorThreadEnabled_; + static std::atomic monitorThreadEnabled_; // Monitor thread which checks the heartbeat of Watchdog thread. // If the monitor thread finds there is no heartbeat, it will dump debug info @@ -848,6 +877,8 @@ protected: // timeout and hccl errors. 
bool dumpOnException_; + bool hasGlobalDumped = false; + // the perfdump path static std::string perfdumppath; @@ -888,9 +919,11 @@ protected: std::string pg_name_; std::string pg_desc_; + int peer_; + std::exception_ptr watchDogException_ = nullptr; - ProcessGroupStatus pgStatus_; + std::shared_ptr pgStatus_ = std::make_shared(); struct StatusStruct { uint64_t seq = 0; @@ -902,7 +935,9 @@ protected: StatusStruct StatusInfo; - void refreshStatusInfo(ProcessGroupHCCL::WorkHCCL work, std::string status); + bool refreshStatusInfo(ProcessGroupHCCL::WorkHCCL work, std::string status); + + bool is_refreshed = false; static std::unordered_map StatusOutput_; @@ -1057,6 +1092,13 @@ TORCH_API std::string dump_hccl_trace( bool includeStackTraces, bool onlyActive); +// Dumps the HCCL comm traces and additional information about the Process +// Group in JSON formatted string. +// We don't include stack traces in JSON format as it is far too much data. +TORCH_API std::string dump_hccl_trace_json( + bool includeCollectives, + bool onlyActive); + // Gets a mutable reference to a global optional function.Heartbeat Monitor // will use this function to dump traces, if available. Inside fbcode, we // store a function here that uses an internal tool for process tracing @@ -1065,7 +1107,7 @@ TORCH_API c10::optional ProcessGroupLCCL::WorkLCCL::result() return *outputs_; } -void ProcessGroupLCCL::WorkLCCL::checkAndThrowException() +void ProcessGroupLCCL::WorkLCCL::checkAndThrowException() const { // Set the appropriate exception if found. checkAndSetException(); @@ -115,7 +115,7 @@ void ProcessGroupLCCL::WorkLCCL::checkAndThrowException() } } -void ProcessGroupLCCL::WorkLCCL::checkAndSetException() +void ProcessGroupLCCL::WorkLCCL::checkAndSetException() const { if (exception()) { // We already have an exception. @@ -309,7 +309,7 @@ c10::intrusive_ptr ProcessGroupLCCL::allreduce(std::vector ProcessGroupLCCL::allgather(std::vector &, c10::intrusive_ptr &) { @@ -385,7 +385,7 @@ c10::intrusive_ptr ProcessGroupLCCL::broadcast(std::vector ProcessGroupLCCL::reduce_scatter(std::vector &lcclStreams, c10::intrusive_ptr &work) { diff --git a/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp index df4e5cd73f870c26507a959d5188b8d757185150..a26eb8f9f9082e2469b5f5854bda44fec81608b7 100644 --- a/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp @@ -72,10 +72,10 @@ public: void synchronizeInternal(std::chrono::milliseconds timeout); // Checks for LCCL errors and sets an appropriate exception_ptr. - void checkAndSetException(); + void checkAndSetException() const; // Checks for LCCL errors and throws an appropriate exception. - void checkAndThrowException(); + void checkAndThrowException() const; // Just checks whether NPU execution has completed, without modifying // exception_ptr. 
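The reduce_scatter hunk above adds client-side averaging: when `c10d::ReduceOp::AVG` is requested, HCCL reduces as usual and each output tensor is then divided by `getSize()` on the HCCL stream. A minimal usage sketch from the Python side (illustrative, not taken from the patch), assuming a launcher that provides `MASTER_ADDR`, `MASTER_PORT`, `RANK` and `WORLD_SIZE`, with arbitrary tensor shapes:

```python
# Minimal sketch: exercising ReduceOp.AVG with reduce_scatter on the HCCL
# backend. Assumes MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE come from the
# launcher; shapes and values are illustrative only.
import os

import torch
import torch.distributed as dist
import torch_npu  # registers the "npu" device and the "hccl" backend

rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
torch_npu.npu.set_device(rank % torch_npu.npu.device_count())
dist.init_process_group("hccl", rank=rank, world_size=world_size)

# Each rank contributes one chunk per rank; with AVG the reduced chunk is
# divided by the group size, matching the tensor.div_(getSize()) step above.
inputs = [torch.full((4,), float(rank), device="npu") for _ in range(world_size)]
output = torch.empty(4, device="npu")
dist.reduce_scatter(output, inputs, op=dist.ReduceOp.AVG)

dist.destroy_process_group()
```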
diff --git a/torch_npu/csrc/distributed/StoreClient.cpp b/torch_npu/csrc/distributed/StoreClient.cpp index 8be1fd58318b64af4387403fd242119224b6a03a..70edef8978ace54c83e319fade83dca3a9ac06b0 100644 --- a/torch_npu/csrc/distributed/StoreClient.cpp +++ b/torch_npu/csrc/distributed/StoreClient.cpp @@ -25,8 +25,8 @@ #include #include "c10/util/Logging.h" -#include "StoreClient.hpp" #include "StoreMessagePacker.hpp" +#include "StoreClient.hpp" namespace c10d { namespace torch_npu { @@ -38,27 +38,26 @@ Client::Client(const std::string localSocketPath, const std::chrono::millisecond : localSocketPath_ { localSocketPath }, socketFd_(-1), timeout_{ timeout } {} -int Client::Connect() noexcept +int Client::TryConnectCore(const ::addrinfo &addr) noexcept { - socketFd_ = socket(AF_INET, SOCK_STREAM, 0); + socketFd_ = ::socket(addr.ai_family, addr.ai_socktype, addr.ai_protocol); if (socketFd_ < 0) { LOG(ERROR) << "create tcp client socket failed " << errno << " : " << strerror(errno); return -1; } + auto ret = SetReceiveTimeout(timeout_); if (ret < 0) { LOG(ERROR) << "set socket timeout failed. " << errno << " : " << strerror(errno); + close(socketFd_); + socketFd_ = -1; return -1; } - struct sockaddr_in servAddr {}; - servAddr.sin_family = AF_INET; - servAddr.sin_port = htons(port_); - servAddr.sin_addr.s_addr = inet_addr(host_.c_str()); int lastError = 0; - auto endTime = std::chrono::steady_clock::now() + std::chrono::minutes(1); + auto endTime = std::chrono::steady_clock::now() + timeout_; while (std::chrono::steady_clock::now() < endTime) { - ret = connect(socketFd_, reinterpret_cast(&servAddr), sizeof(servAddr)); + ret = connect(socketFd_, addr.ai_addr, addr.ai_addrlen); if (ret == 0) { return 0; } @@ -79,21 +78,62 @@ int Client::Connect() noexcept } } + close(socketFd_); + socketFd_ = -1; + return -1; +} + +int Client::TryConnect(int family) noexcept +{ + struct addrinfo hints = {0}; + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + + ::addrinfo* result = nullptr; + int r = ::getaddrinfo(host_.c_str(), std::to_string(port_).c_str(), &hints, &result); + if (r != 0) { + LOG(ERROR) << "getaddrinfo failed " << errno << " : " << strerror(errno); + return -1; + } + + for (::addrinfo* addr = result; addr != nullptr; addr = addr->ai_next) { + int ret = TryConnectCore(*addr); + if (ret == 0) { + return 0; + } + } + return -1; +} + +int Client::Connect() noexcept +{ + auto ret = TryConnect(AF_INET); + if (ret >= 0) { + return 0; + } + + ret = TryConnect(AF_INET6); + if (ret >= 0) { + return 0; + } return -1; } int Client::Close() noexcept { shutdown(socketFd_, SHUT_RDWR); - auto ret = close(socketFd_); - if (ret == 0) { - socketFd_ = -1; - return 0; + if (socketFd_ >= 0) { + auto ret = close(socketFd_); + if (ret == 0) { + socketFd_ = -1; + return 0; + } + LOG(ERROR) << "close socket to server(" << host_ << ":" << port_ << ") failed " << errno << " : " << + strerror(errno); + return ret; } - LOG(ERROR) << "close socket to server(" << host_ << ":" << port_ << ") failed " << errno << " : " << - strerror(errno); - return ret; + return 0; } int Client::LocalConnect() noexcept @@ -120,7 +160,7 @@ int Client::LocalConnect() noexcept servAddr.sun_path[sizeof(servAddr.sun_path) - 1] = '\0'; int lastError = 0; - auto endTime = std::chrono::steady_clock::now() + std::chrono::minutes(1); + auto endTime = std::chrono::steady_clock::now() + timeout_; while (std::chrono::steady_clock::now() < endTime) { ret = connect(socketFd_, reinterpret_cast(&servAddr), sizeof(servAddr)); if (ret == 0) { @@ -228,7 
+268,7 @@ int Client::SetReceiveTimeout(const std::chrono::milliseconds &value) const noex return ret; } -int Client::GetSocketFd() noexcept +int Client::GetSocketFd() const noexcept { return socketFd_; } diff --git a/torch_npu/csrc/distributed/StoreClient.hpp b/torch_npu/csrc/distributed/StoreClient.hpp index 869c217f0f63d2c3b2843e289eba2843223c743c..9631242cbeb55e85b95a2ece6bd2f7041af8a671 100644 --- a/torch_npu/csrc/distributed/StoreClient.hpp +++ b/torch_npu/csrc/distributed/StoreClient.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include "StoreMessagePacker.hpp" @@ -29,12 +30,14 @@ public: explicit Client(const std::string host, uint16_t port, const std::chrono::milliseconds timeout) noexcept; explicit Client(const std::string localSocketPath, const std::chrono::milliseconds timeout) noexcept; int Connect() noexcept; + int TryConnect(int family) noexcept; + int TryConnectCore(const ::addrinfo &addr) noexcept; int Close() noexcept; int LocalConnect() noexcept; int LocalClose() noexcept; int SyncCall(const StoreMessage &request, StoreMessage &response) noexcept; int SetReceiveTimeout(const std::chrono::milliseconds &value) const noexcept; - int GetSocketFd() noexcept; + int GetSocketFd() const noexcept; private: const std::string localSocketPath_{}; const std::string host_{}; diff --git a/torch_npu/csrc/distributed/StoreMessagePacker.cpp b/torch_npu/csrc/distributed/StoreMessagePacker.cpp index 0ff08c8d95f08a43032cba4895fb7021749c98f7..4cb1b67083bc8673db062ead1da94ed8d95f6fc0 100644 --- a/torch_npu/csrc/distributed/StoreMessagePacker.cpp +++ b/torch_npu/csrc/distributed/StoreMessagePacker.cpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "torch_npu/csrc/core/npu/NPUException.h" #include "StoreMessagePacker.hpp" namespace c10d { @@ -86,6 +87,7 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess } auto ptr = buffer.data(); + auto ptr_end = ptr + buffer.size(); auto totalSize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); @@ -97,22 +99,26 @@ int64_t StoreMessagePacker::Unpack(const std::vector &buffer, StoreMess auto keyCount = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); - message.keys.reserve(keyCount); for (auto i = 0UL; i < keyCount; i++) { auto keySize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); message.keys.emplace_back(reinterpret_cast(ptr), keySize); ptr += keySize; + if (ptr > ptr_end) { + break; + } } auto valueCount = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); - message.values.reserve(valueCount); for (auto i = 0UL; i < valueCount; i++) { auto valueSize = *reinterpret_cast(ptr); ptr += sizeof(uint64_t); message.values.emplace_back(ptr, ptr + valueSize); ptr += valueSize; + if (ptr > ptr_end) { + break; + } } return static_cast(totalSize); diff --git a/torch_npu/csrc/distributed/TraceUtils.h b/torch_npu/csrc/distributed/TraceUtils.h index 4956845eb8c9631b6bfaf7ef7ee4ddaaea03b59b..b770ae98648bcb74020c7635586fe30e06dd423d 100644 --- a/torch_npu/csrc/distributed/TraceUtils.h +++ b/torch_npu/csrc/distributed/TraceUtils.h @@ -11,9 +11,11 @@ #include "torch_npu/csrc/core/npu/NPUEvent.h" #include "torch_npu/csrc/distributed/HCCLUtils.hpp" +#include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" #include #include +#include #include #include #include @@ -21,37 +23,46 @@ namespace c10d_npu { - static c10::IValue entries_key = "entries"; - static c10::IValue hccl_comm_key = "hccl_comm_state"; - static c10::IValue version_key = "version"; - // 
Update whenever changing contents or formatting of the dump - // (minor when adding fields, major when changing existing fields) - static c10::IValue version_val = "2.1"; - static c10::IValue pg_config_key = "pg_config"; - static c10::IValue record_id_key = "record_id"; - static c10::IValue pg_id_key = "pg_id"; - static c10::IValue pg_name_key = "process_group"; - static c10::IValue collective_seq_id_key = "collective_seq_id"; - static c10::IValue p2p_seq_id_key = "p2p_seq_id"; - static c10::IValue is_p2p_key = "is_p2p"; - static c10::IValue op_id_key = "op_id"; - static c10::IValue profiling_name_key = "profiling_name"; - static c10::IValue input_sizes_key = "input_sizes"; - static c10::IValue input_dtypes_key = "input_dtypes"; - static c10::IValue output_sizes_key = "output_sizes"; - static c10::IValue output_dtypes_key = "output_dtypes"; - static c10::IValue time_created_key = "time_created_ns"; - static c10::IValue duration_key = "duration_ms"; - - static c10::IValue frames_key = "frames"; - static c10::IValue state_key = "state"; - static c10::IValue line_key = "line"; - static c10::IValue name_key = "name"; - static c10::IValue filename_key = "filename"; - static c10::IValue retired_key = "retired"; - static c10::IValue time_discovered_started_key = "time_discovered_started_ns"; - static c10::IValue time_discovered_completed_key = - "time_discovered_completed_ns"; +#define DEFINE_CONSTANT(name, value) \ + static c10::IValue name = value; \ + static std::string name##_str = value; +// Update whenever changing contents or formatting of the dump +// (minor when adding fields, major when changing existing fields) +// Also update both JSON and Pickle dumps to make use of the newly defined +// field(s). +DEFINE_CONSTANT(version_val, "2.4") +DEFINE_CONSTANT(entries_key, "entries") +DEFINE_CONSTANT(hccl_comm_key, "hccl_comm_state") +DEFINE_CONSTANT(version_key, "version") +DEFINE_CONSTANT(pg_config_key, "pg_config") +DEFINE_CONSTANT(pg_status_key, "pg_status") +DEFINE_CONSTANT(record_id_key, "record_id") +DEFINE_CONSTANT(pg_id_key, "pg_id") +DEFINE_CONSTANT(pg_name_key, "process_group") +DEFINE_CONSTANT(collective_seq_id_key, "collective_seq_id") +DEFINE_CONSTANT(p2p_seq_id_key, "p2p_seq_id") +DEFINE_CONSTANT(is_p2p_key, "is_p2p") +DEFINE_CONSTANT(op_id_key, "op_id") +DEFINE_CONSTANT(profiling_name_key, "profiling_name") +DEFINE_CONSTANT(input_sizes_key, "input_sizes") +DEFINE_CONSTANT(input_dtypes_key, "input_dtypes") +DEFINE_CONSTANT(output_sizes_key, "output_sizes") +DEFINE_CONSTANT(output_dtypes_key, "output_dtypes") +DEFINE_CONSTANT(time_created_key, "time_created_ns") +DEFINE_CONSTANT(duration_key, "duration_ms") +DEFINE_CONSTANT(timeout_key, "timeout_ms") +DEFINE_CONSTANT(frames_key, "frames") +DEFINE_CONSTANT(state_key, "state") +DEFINE_CONSTANT(line_key, "line") +DEFINE_CONSTANT(name_key, "name") +DEFINE_CONSTANT(filename_key, "filename") +DEFINE_CONSTANT(retired_key, "retired") +DEFINE_CONSTANT(time_discovered_started_key, "time_discovered_started_ns") +DEFINE_CONSTANT(time_discovered_completed_key, "time_discovered_completed_ns") +DEFINE_CONSTANT(completed_state, "completed") +DEFINE_CONSTANT(scheduled_state, "scheduled") +DEFINE_CONSTANT(started_state, "started") +#undef DEFINE_CONSTANT /* Trace Utils Related to TORCH_HCCL_DESYNC_DEBUG */ @@ -124,7 +135,7 @@ namespace c10d_npu { std::string report = "\n\t - To our best knowledge, the lagging/dead/mismatched ranks " "that caused the desync are:"; - if (startRanks.size()) { + if (startRanks.size() != 0) { report += c10::str( "\n\t 
- [", ranksToString(startRanks), @@ -132,7 +143,7 @@ namespace c10d_npu { lagSeq, " (count from 1)"); } - if (endRanks.size()) { + if (endRanks.size() != 0) { report += c10::str( "\n\t [", ranksToString(endRanks), @@ -165,7 +176,7 @@ namespace c10d_npu { } } - if (collectivesStart.size()) { + if (collectivesStart.size() != 0) { report += c10::str("\n\t #", seq, " started ranks:"); for (auto &mapPair : collectivesStart) { report += c10::str( @@ -175,7 +186,7 @@ namespace c10d_npu { mapPair.first); } } - if (collectivesEnd.size()) { + if (collectivesEnd.size() != 0) { report += c10::str("\n\t #", seq, " finished ranks:"); for (auto &mapPair : collectivesEnd) { report += c10::str( @@ -205,52 +216,6 @@ namespace c10d_npu { return hcclStartEvent.elapsed_time(hcclEndEvent); } - DebugInfoWriter::~DebugInfoWriter() = default; - - void DebugInfoWriter::write(const std::string &hcclTrace) - { - // Open a file for writing. The ios::binary flag is used to write data as - // binary. - std::ofstream file(filename_, std::ios::binary); - - // Check if the file was opened successfully. - if (!file.is_open()) { - LOG(ERROR) << "Error opening file for writing HCCLPG debug info: " - << filename_; - return; - } - - file.write(hcclTrace.data(), hcclTrace.size()); - LOG(INFO) << "Finished writing HCCLPG debug info to " << filename_; - } - - DebugInfoWriter &DebugInfoWriter::getWriter(int rank) - { - if (writer_ == nullptr) { - std::string fileNamePrefix = getCvarString( - {"TORCH_HCCL_DEBUG_INFO_TEMP_FILE"}, "/tmp/hccl_trace_rank_"); - // Using std::unique_ptr here to auto-delete the writer object - // when the pointer itself is destroyed. - std::unique_ptr writerPtr( - new DebugInfoWriter(fileNamePrefix, rank)); - DebugInfoWriter::registerWriter(std::move(writerPtr)); - } - return *writer_; - } - - void DebugInfoWriter::registerWriter(std::unique_ptr writer) - { - TORCH_CHECK_WITH( - DistBackendError, - hasWriterRegistered_.load() == false, - "debugInfoWriter already registered"); - hasWriterRegistered_.store(true); - writer_ = std::move(writer); - } - - std::unique_ptr DebugInfoWriter::writer_ = nullptr; - std::atomic DebugInfoWriter::hasWriterRegistered_(false); - inline std::string pickle_str(const c10::IValue &v) { std::vector result; @@ -324,7 +289,7 @@ namespace c10d_npu { } HCCLTraceBuffer() { - max_entries_ = getCvarInt({"TORCH_HCCL_TRACE_BUFFER_SIZE"}, 0); + max_entries_ = static_cast(getCvarInt({"TORCH_HCCL_TRACE_BUFFER_SIZE"}, 0)); capture_cpp_stack_ = getCvarBool({"TORCH_HCCL_TRACE_CPP_STACK"}, false); enabled_ = max_entries_ > 0; } @@ -358,6 +323,9 @@ namespace c10d_npu { // was 'enqueued'- not necessarily started c10::time_t time_created_; + // configured timeout for this entry + c10::time_t timeout_ms_; + // Is this a P2P event? bool isP2P_; @@ -375,9 +343,9 @@ namespace c10d_npu { std::optional time_discovered_completed_; // size information for input/output tensors - c10::SmallVector input_dims_; + c10::SmallVector input_dims_; std::vector input_dtypes_; - c10::SmallVector output_dims_; + c10::SmallVector output_dims_; std::vector output_dtypes_; c10::SmallVector sizes_; // flattened from inputs, outputs bool retired_ = false; // is this work entry no longer in the workMetaList_? 
@@ -391,6 +359,7 @@ namespace c10d_npu { size_t max_entries_ = 0; size_t next_ = 0; size_t id_ = 0; + std::map> all_pg_status_ = {}; std::map, std::vector> pg_name_to_ranks_ = {}; @@ -405,11 +374,17 @@ namespace c10d_npu { const std::vector &outputs, Event *start, Event *end, + std::chrono::milliseconds timeout_ms, + std::shared_ptr pg_status, bool isP2P) { if (!enabled_) { return c10::nullopt; } + if (all_pg_status_.find(pg_id) == all_pg_status_.end()) { + // Current pg_status is not in FR. + all_pg_status_[pg_id] = std::move(pg_status); + } auto traceback = torch_npu::CapturedTraceback::gather(true, true, capture_cpp_stack_); std::lock_guard guard(mutex_); @@ -426,19 +401,20 @@ namespace c10d_npu { std::move(start), std::move(end), c10::getTime(), + timeout_ms.count(), isP2P}; for (const auto &input : inputs) { c10::IntArrayRef sizes = input.sizes(); te.input_dtypes_.push_back(input.dtype().toScalarType()); - te.input_dims_.push_back(sizes.size()); + te.input_dims_.push_back(static_cast(sizes.size())); te.sizes_.insert(te.sizes_.end(), sizes.begin(), sizes.end()); } for (const auto &output : outputs) { c10::IntArrayRef sizes = output.sizes(); te.output_dtypes_.push_back(output.dtype().toScalarType()); - te.output_dims_.push_back(sizes.size()); + te.output_dims_.push_back(static_cast(sizes.size())); te.sizes_.insert(te.sizes_.end(), sizes.begin(), sizes.end()); } @@ -464,7 +440,7 @@ namespace c10d_npu { pg_name_to_ranks_[pg_name] = ranks; } - void update_state(Entry &r) + void update_state(Entry &r) const { if (r.start_ != nullptr) { bool started = r.start_->query(); @@ -589,7 +565,7 @@ namespace c10d_npu { if (includeStacktraces) { auto &tb = stracebacks.tracebacks.at(i); auto frames = new_list(); - for (int64_t frame : tb) { + for (uint64_t frame : tb) { frames.push_back(all_frames.at(frame)); } dict.insert(frames_key, frames); @@ -608,7 +584,7 @@ namespace c10d_npu { } auto it = e.sizes_.begin(); - auto read_sizes = [&](const c10::SmallVector &dims) { + auto read_sizes = [&](const c10::SmallVector &dims) { auto sizes = new_list(); for (auto dim : dims) { auto arg_sizes = new_list(); @@ -654,6 +630,7 @@ namespace c10d_npu { ? 
int64_t(*e.time_discovered_completed_) : c10::IValue()); dict.insert(retired_key, e.retired_); + dict.insert(timeout_key, e.timeout_ms_); dict.insert(is_p2p_key, e.isP2P_); entries.push_back(dict); @@ -675,6 +652,140 @@ namespace c10d_npu { return pg_config; } + const std::map> getPgConfigJson() + { + std::map> result; + for (const auto& [pg_name, ranks] : pg_name_to_ranks_) { + auto pg_info = std::map(); + pg_info["name"] = std::get<0>(pg_name); + pg_info["desc"] = std::get<1>(pg_name); + pg_info["ranks"] = ranks_str(ranks); + result.emplace(std::get<0>(pg_name), pg_info); + } + return result; + } + + const c10::Dict getPgStatus() + { + auto all_pg_status = new_dict(); + for (const auto& [pg_id, status] : all_pg_status_) { + auto pg_status = new_dict(); + pg_status.insert("last_enqueued_collective", status->lastEnqueuedSeq); + pg_status.insert("last_started_collective", status->lastStartedSeq); + pg_status.insert("last_completed_collective", status->lastCompletedSeq); + all_pg_status.insert(std::to_string(pg_id), pg_status); + } + return all_pg_status; + } + + const std::map> getPgStatusJson() + { + std::map> result; + for (const auto& [pg_id, status] : all_pg_status_) { + auto pg_status = std::map(); + pg_status["last_enqueued_collective"] = + std::to_string(status->lastEnqueuedSeq); + pg_status["last_started_collective"] = + std::to_string(status->lastStartedSeq); + pg_status["last_completed_collective"] = + std::to_string(status->lastCompletedSeq); + result[std::to_string(pg_id)] = pg_status; + } + return result; + } + + std::string dump_json( + const c10::optional>>& hcclDumpMap, + bool includeCollectives, + bool onlyActive) + { + using json = nlohmann::json; + json result; + result[version_key_str] = version_val_str; + result[pg_config_key_str] = getPgConfigJson(); + result[pg_status_key_str] = getPgStatusJson(); + + // collective trace + if (includeCollectives) { + std::list entries; + for (auto& e : dump_entries()) { + json j; + if (onlyActive && e.time_discovered_completed_.has_value()) { + continue; + } + j[record_id_key_str] = int64_t(e.id_); + j[pg_id_key_str] = int64_t(e.pg_id_); + j[pg_name_key_str] = e.pg_name_; + j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_); + j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_); + j[op_id_key_str] = int64_t(e.op_id_); + j[profiling_name_key_str] = e.profiling_name_; + j[time_created_key_str] = int64_t(e.time_created_); + if (e.duration_) { + j[duration_key_str] = *e.duration_; + } + auto it = e.sizes_.begin(); + auto read_sizes = [&](const c10::SmallVector& dims) { + auto sizes = std::list>(); + for (auto dim : dims) { + auto arg_sizes = std::list(); + for (auto i : c10::irange(dim)) { + (void)i; + arg_sizes.push_back(*it++); + } + sizes.push_back(arg_sizes); + } + return sizes; + }; + j[input_sizes_key_str] = read_sizes(e.input_dims_); + std::vector input_dtypes_strs; + input_dtypes_strs.reserve(e.input_dtypes_.size()); + for (const auto& input_dtype : e.input_dtypes_) { + input_dtypes_strs.emplace_back(c10::toString(input_dtype)); + } + j[input_dtypes_key_str] = input_dtypes_strs; + j[output_sizes_key_str] = read_sizes(e.output_dims_); + std::vector output_dtypes_strs; + output_dtypes_strs.reserve(e.output_dtypes_.size()); + for (const auto& output_dtype : e.output_dtypes_) { + output_dtypes_strs.emplace_back(c10::toString(output_dtype)); + } + j[output_dtypes_key_str] = output_dtypes_strs; + if (e.time_discovered_completed_.has_value()) { + j[state_key_str] = completed_state_str; + } else if 
(e.time_discovered_started_.has_value()) { + j[state_key_str] = started_state_str; + } else { + j[state_key_str] = scheduled_state_str; + } + j[time_discovered_started_key_str] = + e.time_discovered_started_.has_value() + ? int64_t(*e.time_discovered_started_) + : 0; + j[time_discovered_completed_key_str] = + e.time_discovered_completed_.has_value() + ? int64_t(*e.time_discovered_completed_) + : 0; + j[retired_key_str] = e.retired_; + j[timeout_key_str] = e.timeout_ms_; + j[is_p2p_key_str] = e.isP2P_; + entries.emplace_back(j); + } + + if (!entries.empty()) { + result[entries_key_str] = entries; + } + } + + if (hcclDumpMap.has_value()) { + result[hccl_comm_key_str] = hcclDumpMap.value(); + } + + return result.dump(); + } + // dump all collectives + hcclDumpMap std::string dump( const c10::optional #include -#include "torch_npu/csrc/distributed/reducer.hpp" #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/framework/utils/OpPreparation.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" +#include "torch_npu/csrc/distributed/reducer.hpp" namespace c10d_npu { namespace { @@ -269,17 +269,17 @@ Reducer::~Reducer() noexcept(false) } } -bool Reducer::dynamic_graph_find_unused() +bool Reducer::dynamic_graph_find_unused() const { return !static_graph_ && find_unused_parameters_; } -bool Reducer::static_graph_first_iteration() +bool Reducer::static_graph_first_iteration() const { return static_graph_ && num_iterations_ == 1; } -bool Reducer::static_graph_after_first_iteration() +bool Reducer::static_graph_after_first_iteration() const { return static_graph_ && num_iterations_ > 1; } @@ -1147,7 +1147,8 @@ void Reducer::initialize_bucket_views( // (see Note: "Gradient Layout Contract" in initialize_buckets). void Reducer::populate_bucket_views_out( Reducer::BucketReplica& replica, - at::Tensor& tensor) { + at::Tensor& tensor) const +{ replica.bucket_views_out.clear(); for (size_t i = 0; i < replica.variables.size(); i++) { const auto& v = replica.variables[i]; @@ -1871,7 +1872,7 @@ void Reducer::set_ddp_runtime_logging_sample_rate(int sample_rate) ddp_runtime_logging_sample_rate_ = sample_rate; } -int Reducer::get_ddp_runtime_logging_sample_rate() +int Reducer::get_ddp_runtime_logging_sample_rate() const { return ddp_runtime_logging_sample_rate_; } @@ -1948,7 +1949,8 @@ struct BucketKey { const c10::Device device; // See torch/csrc/utils/hash.h for dispatch code. - static size_t hash(const BucketKey& key) { + static size_t hash(const BucketKey& key) + { return c10::get_hash(key.type, key.device); } }; diff --git a/torch_npu/csrc/distributed/reducer.hpp b/torch_npu/csrc/distributed/reducer.hpp index d82cd3884cc25325d47031b5597ef66cdcb96c4b..fb202b88600343892f7664c4537023da25b3dbc0 100644 --- a/torch_npu/csrc/distributed/reducer.hpp +++ b/torch_npu/csrc/distributed/reducer.hpp @@ -400,7 +400,7 @@ protected: // This function is called inside `finalize_backward`, it happens only if // DDP communication hook was registered to recreate just bucket_views_out // with the result of `future_work`. - void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor); + void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor) const; // If gradient_as_bucket_view_ is false, after allreduce buckets, // copy bucket results back to grads. 
@@ -459,9 +459,10 @@ protected: VariableLocator() = default; - VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) { - bucket_index = bucket_index_; - intra_bucket_index = intra_bucket_index_; + VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) + { + bucket_index = bucket_index_; + intra_bucket_index = intra_bucket_index_; } }; @@ -490,7 +491,7 @@ protected: void record_backward_comm_start_time(); void record_backward_comm_end_time(); - int get_ddp_runtime_logging_sample_rate(); + int get_ddp_runtime_logging_sample_rate() const; int ddp_runtime_logging_sample_rate_ = kDDPRuntimeLoggingSampleRate; bool is_multi_device_module_ = false; @@ -560,9 +561,9 @@ private: void initialize_local_used_map(); // get current cuda stream const c10::Stream get_current_stream(); - bool dynamic_graph_find_unused(); - bool static_graph_first_iteration(); - bool static_graph_after_first_iteration(); + bool dynamic_graph_find_unused() const; + bool static_graph_first_iteration() const; + bool static_graph_after_first_iteration() const; // comm_hook_ is used to access the DDP communication hook if registered. std::unique_ptr comm_hook_; diff --git a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp index b07493117dc687d21ee27a8e187ea89b47dd0018..da0aba1196326046dc06577e90a70703197c77e0 100644 --- a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -156,7 +156,7 @@ const std::string &TensorPipeAgent::guessAddress() static const std::string uvAddress = []() { tensorpipe_npu::Error error; std::string result; - char *ifnameEnv = std::getenv(kSocketIfnameEnvVar.c_str()); + const char *ifnameEnv = std::getenv(kSocketIfnameEnvVar.c_str()); if (ifnameEnv != nullptr) { std::tie(error, result) = tensorpipe_npu::transport::uv::lookupAddrForIface(ifnameEnv); if (error) { @@ -632,7 +632,7 @@ void TensorPipeAgent::respond(std::shared_ptr &pipe) pipeRead(pipe, [this, pipe](const tensorpipe_npu::Error &error, c10::intrusive_ptr requestMessage, std::vector streams) mutable { if (error) { - if (shuttingDown_) { + if (shuttingDown_.load()) { // This is expected. 
} else { LOG(WARNING) << "RPC agent for " << workerInfo_.name_ diff --git a/torch_npu/csrc/framework/FormatHelper.cpp b/torch_npu/csrc/framework/FormatHelper.cpp index ac1610b020e1ac1155c97400d23223a7402e0d3a..6a92fe5af4d8039b3d0ff9c50e49d1fd5fa30a00 100644 --- a/torch_npu/csrc/framework/FormatHelper.cpp +++ b/torch_npu/csrc/framework/FormatHelper.cpp @@ -154,6 +154,16 @@ bool FormatHelper::IsOpInputBaseFormat(const c10::List return iter == tensors.end(); } +bool FormatHelper::IsOpInputBaseFormat(const c10::optional &tensors) +{ + if (!tensors.has_value()) { + return true; + } + const auto &iter = + std::find_if(tensors.value().begin(), tensors.value().end(), [](const auto &tensor) { return !IsOpInputBaseFormat(tensor); }); + return iter == tensors.value().end(); +} + bool FormatHelper::IsOpInputBaseFormat(const at::TensorList &tensors) { const auto &iter = diff --git a/torch_npu/csrc/framework/FormatHelper.h b/torch_npu/csrc/framework/FormatHelper.h index 7bfe12e4e40d65424953f37a22a87c992a0940f6..ba987e6c725eee8b24bd1c7949d0c271deb2fdc9 100644 --- a/torch_npu/csrc/framework/FormatHelper.h +++ b/torch_npu/csrc/framework/FormatHelper.h @@ -39,6 +39,7 @@ public: static bool IsOpInputBaseFormat(const at::Tensor &tensor); static bool IsOpInputBaseFormat(const c10::optional &tensor); + static bool IsOpInputBaseFormat(const c10::optional &tensors); static bool IsOpInputBaseFormat(const c10::List> &tensors); static bool IsOpInputBaseFormat(const at::TensorList &tensors); static bool IsOpInputBaseFormat(const at::ITensorListRef &tensors); diff --git a/torch_npu/csrc/framework/InferFormat.cpp b/torch_npu/csrc/framework/InferFormat.cpp index c7a9f4e0b1026e086d10e8bf32d8940ccdd42977..13ebf787d33abced4b42395ba77290365ee78b48 100644 --- a/torch_npu/csrc/framework/InferFormat.cpp +++ b/torch_npu/csrc/framework/InferFormat.cpp @@ -59,6 +59,8 @@ aclFormat InferFormat::GuessStorageFormat(const c10::IntArrayRef &size, aclForma { if (format == ACL_FORMAT_FRACTAL_NZ && size.size() < 2) { // scalar scene and rank=1 scene do not support NZ + TORCH_WARN_ONCE("Cannot create tensor with NZ format while dim < 2, " + "tensor will be created with ND format."); return ACL_FORMAT_ND; } diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp index 03b08146593685f0362686c35ba305695a345c74..5f51f9f0a5cfd43afceb87cf6f4552ea61d4e50c 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -126,7 +126,7 @@ void InitializeJitCompilationMode() std::string value_str = GetJitCompileMode(); if (value_str != "") { c10_npu::option::SetOption("jitCompileInit", value_str); - ASCEND_LOGI("Set jitCompileInit option to %s", value_str); + ASCEND_LOGI("Set jitCompileInit option to %s", value_str.c_str()); } else { c10_npu::option::SetOption("jitCompileInit", "disable"); ASCEND_LOGI("Set jitCompileInit option to default value: disable"); diff --git a/torch_npu/csrc/framework/NPUDefine.cpp b/torch_npu/csrc/framework/NPUDefine.cpp index d8f9172a51c350ce29b9ef80e44a46ca6a6d4cd9..2bf90fffaea9e0f502fab76707bab66632b198b9 100644 --- a/torch_npu/csrc/framework/NPUDefine.cpp +++ b/torch_npu/csrc/framework/NPUDefine.cpp @@ -44,7 +44,19 @@ void ExecuteParas::Release() void ExecuteParasOpApi::Copy(ExecuteParasOpApi &other) { strncpy(this->opType, other.opType, sizeof(ExecuteParasOpApi::opType) - 1); - this->customHandler = other.customHandler; + this->customHandler = std::move(other.customHandler); + } + + void 
ExecuteParasOpApi::Copy(ExecuteParasOpApiV2 &other) + { + static const auto max_len = sizeof(ExecuteParasOpApi::opType); + auto len = other.opName->length(); + if (len + 1 < max_len) { + other.opName->copy(this->opType, len + 1); + } else { + other.opName->copy(this->opType, max_len - 1); + } + this->customHandler = std::move(*(other.customHandler)); } NPUStatus DestroyAclParams(ACL_PARAMS ¶ms) diff --git a/torch_npu/csrc/framework/NPUDefine.h b/torch_npu/csrc/framework/NPUDefine.h index 20dd4f14f740f64e3967e191883a3fdb68bb74a9..291bcf9be92c7cb995bcf939315464704e052d17 100644 --- a/torch_npu/csrc/framework/NPUDefine.h +++ b/torch_npu/csrc/framework/NPUDefine.h @@ -68,6 +68,13 @@ struct ExecuteParas { PROCESS_FUNC customHandler; }; +struct ExecuteParasOpApiV2 { + using PROCESS_FUNC = std::function; + std::string *opName; + PROCESS_FUNC *customHandler; + ExecuteParasOpApiV2() = default; +}; + struct ExecuteParasOpApi { using PROCESS_FUNC = std::function; char opType[100]{}; @@ -75,6 +82,7 @@ struct ExecuteParasOpApi { ExecuteParasOpApi() = default; void Release(); void Copy(ExecuteParasOpApi &other); + void Copy(ExecuteParasOpApiV2 &other); }; NPUStatus DestroyAclParams(ACL_PARAMS ¶ms); diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 01b894f0d0bd6cdf5eb4a6f9b1e63345e7820e34..c67d7f7f0b7e87ca5b3304f6a6ef99f8c5753f31 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -13,6 +13,7 @@ #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" +#include "torch_npu/csrc/core/npu/NPUAffinityController.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" #endif @@ -122,9 +123,10 @@ OpCommand& OpCommand::Output(at::Tensor &output, const string &descName, void OpCommand::Run() { + c10_npu::SetOpDispatch(); // Check for npu graph - if (c10_npu::is_stream_capturing.load() && aclCmd->CheckCustomHandlerNull()) { - c10_npu::assertNotCapturing("Cannot run aclop operators"); + if (aclCmd->CheckCustomHandlerNull()) { + c10_npu::assertNotCapturingAclop(aclCmd->GetName()); } aclCmd->SetEnginePriority(); @@ -169,6 +171,7 @@ void OpCommand::Run() void OpCommand::RunOpApi(const string &op_name, PROC_FUNC func, bool sync) { + c10_npu::SetOpDispatch(); #ifndef BUILD_LIBTORCH const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); #endif @@ -212,6 +215,48 @@ void OpCommand::RunOpApi(const string &op_name, PROC_FUNC func, bool sync) } } +void OpCommand::RunOpApiV2(const string &op_name, const PROC_FUNC &func, bool sync) +{ + c10_npu::SetOpDispatch(); +#ifndef BUILD_LIBTORCH + const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); +#endif + auto stream = c10_npu::getCurrentNPUStream(); + if (!stream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { + RECORD_FUNCTION(op_name, std::vector({})); +#ifndef BUILD_LIBTORCH + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, op_name); +#endif + ExecuteParasOpApiV2 execParams; + execParams.opName = const_cast(&op_name); + execParams.customHandler = const_cast(&func); + + c10_npu::queue::QueueParas params(c10_npu::queue::EXECUTE_OPAPI_V2, sizeof(ExecuteParasOpApiV2), &execParams); + c10_npu::enCurrentNPUStream(¶ms); +#ifndef BUILD_LIBTORCH + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, op_name, params.correlation_id); +#endif + } else { +#ifndef 
BUILD_LIBTORCH + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuAclStartExecution(op_name); + } +#endif + OpCommandImpl::RunOpApi(op_name, func); + if (c10_npu::option::OptionsManager::CheckBlockingEnable()) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(stream)); + } +#ifndef BUILD_LIBTORCH + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuAclFinishExecution(op_name); + } +#endif + } + if (sync) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(stream)); + } +} + OpCommand& OpCommand::Sync(c10::SmallVector &index) { sync_index = index; diff --git a/torch_npu/csrc/framework/OpCommand.h b/torch_npu/csrc/framework/OpCommand.h index 9144b5ddba0af9b8e2b376aa0a831c604cbd861e..e60617077c976b0109a37b72c33254a25333a095 100644 --- a/torch_npu/csrc/framework/OpCommand.h +++ b/torch_npu/csrc/framework/OpCommand.h @@ -124,6 +124,7 @@ public: OpCommand& Sync(); static void RunOpApi(const string &op_name, PROC_FUNC func, bool sync = false); + static void RunOpApiV2(const string &op_name, const PROC_FUNC &func, bool sync = false); private: OpCommand& AddTensorInput(at::Tensor &tensor, diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index ddead0eeebf0daf49f0aa5075c858f63ea26cc2d..6bf40ae9a837b4fdd6c18c63bb9ddee541289a10 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -8,7 +8,7 @@ #include "torch_npu/csrc/distributed/HCCLUtils.hpp" #include "torch_npu/csrc/framework/OpCmdHelper.h" #include "torch_npu/csrc/framework/OpParamMaker.h" -#include "torch_npu/csrc/framework/aoe/AoeUtils.h" +#include "torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h" #include "torch_npu/csrc/framework/interface/HcclInterface.h" #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" @@ -313,6 +313,16 @@ void printErrorLog(ExecuteParas *cur_paras) } } +bool ContainsAny(const std::string& str, std::initializer_list patterns) +{ + for (const auto& pattern : patterns) { + if (str.find(pattern) != std::string::npos) { + return true; + } + } + return false; +} + int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) { auto cur_paras = static_cast(in->paramVal); @@ -325,9 +335,8 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) try { ret = cur_paras->customHandler(); } catch (std::exception &e) { - if (std::string(e.what()).find(DEVICE_TASK_ABORT) != std::string::npos || - std::string(e.what()).find(DEVICE_MEM_ERROR) != std::string::npos || - std::string(e.what()).find(DEVICE_HBM_ECC_ERROR) != std::string::npos) { + if (ContainsAny(std::string(e.what()), {DEVICE_TASK_ABORT, DEVICE_MEM_ERROR, DEVICE_HBM_ECC_ERROR, + SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR, HCCL_OP_RETRY_FAILED})) { ret = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); } else { ret = ACL_ERROR_INVALID_PARAM; @@ -335,8 +344,7 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) } ASCEND_LOGE("Custom hand error:%s", e.what()); } - if (ret != ACL_ERROR_NONE && ret != ACL_ERROR_RT_DEVICE_TASK_ABORT && ret != ACL_ERROR_RT_DEVICE_MEM_ERROR && - ret != ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR) { + if (ret != ACL_ERROR_NONE) { ASCEND_LOGE("Custom hand fail! 
name=%s, ret=0x%#x", cur_paras->opType, ret); } return ret; @@ -412,9 +420,8 @@ int ExecFuncOpApi(c10_npu::queue::QueueParas *in, aclrtStream stream) try { ret = cur_paras->customHandler(); } catch (std::exception &e) { - if (std::string(e.what()).find(DEVICE_TASK_ABORT) != std::string::npos || - std::string(e.what()).find(DEVICE_MEM_ERROR) != std::string::npos || - std::string(e.what()).find(DEVICE_HBM_ECC_ERROR) != std::string::npos) { + if (ContainsAny(std::string(e.what()), {DEVICE_TASK_ABORT, DEVICE_MEM_ERROR, DEVICE_HBM_ECC_ERROR, + SUSPECT_DEVICE_MEM_ERROR, HCCS_LINK_ERROR, HCCL_OP_RETRY_FAILED})) { ret = c10_npu::acl::AclrtPeekAtLastError(ACL_RT_THREAD_LEVEL); } else { ret = ACL_ERROR_INVALID_PARAM; @@ -422,8 +429,7 @@ int ExecFuncOpApi(c10_npu::queue::QueueParas *in, aclrtStream stream) } ASCEND_LOGE("Custom hand error:%s", e.what()); } - if (ret != ACL_ERROR_NONE && ret != ACL_ERROR_RT_DEVICE_TASK_ABORT && ret != ACL_ERROR_RT_DEVICE_MEM_ERROR && - ret != ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR) { + if (ret != ACL_ERROR_NONE) { ASCEND_LOGE("Custom hand fail! name=%s, ret=0x%#x", cur_paras->opType, ret); } return ret; @@ -502,7 +508,7 @@ int LazyDestroyEventFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) } ASCEND_LOGE("LazyDestroy error! ret = %d, eventAllocatorType = %d", ret, cur_paras->eventAllocatorType); } - ASCEND_LOGI("Event: LazyDestroyEventFunc dequeue is successfully executed, event=%p", cur_paras->event); + ASCEND_LOGD("Event: LazyDestroyEventFunc dequeue is successfully executed, event=%p", cur_paras->event); return ret; } @@ -519,14 +525,24 @@ void CopyFunc(void *dst, void *src) (static_cast(dstPtr->paramVal))->~ExecuteParas(); } + const bool is_opapi_v2 = (srcPtr->paramType == c10_npu::queue::EXECUTE_OPAPI_V2); dstPtr->paramStream = srcPtr->paramStream; - dstPtr->paramType = srcPtr->paramType; + if (is_opapi_v2) { + dstPtr->paramType = c10_npu::queue::EXECUTE_OPAPI; + } else { + dstPtr->paramType = srcPtr->paramType; + } dstPtr->paramLen = srcPtr->paramLen; dstPtr->correlation_id = srcPtr->correlation_id; if (dstPtr->paramType == c10_npu::queue::EXECUTE_OPAPI) { new (dstPtr->paramVal) ExecuteParasOpApi(); - (static_cast(dstPtr->paramVal)) - ->Copy(*(static_cast(srcPtr->paramVal))); + if (is_opapi_v2) { + (static_cast(dstPtr->paramVal)) + ->Copy(*(static_cast(srcPtr->paramVal))); + } else { + (static_cast(dstPtr->paramVal)) + ->Copy(*(static_cast(srcPtr->paramVal))); + } } else if (srcPtr->paramType == c10_npu::queue::COMPILE_AND_EXECUTE) { new (dstPtr->paramVal) ExecuteParas(); (static_cast(dstPtr->paramVal))->Copy(*(static_cast(srcPtr->paramVal))); @@ -586,7 +602,10 @@ void CopyReleaseParamFunc(void *dst, void *src) auto srcPtr = static_cast(src); dstPtr->paramType = srcPtr->paramType; dstPtr->paramVal = static_cast(dst) + sizeof(c10_npu::queue::QueueParas); - if (srcPtr->paramType == c10_npu::queue::COMPILE_AND_EXECUTE) { + if (srcPtr->paramType == c10_npu::queue::EXECUTE_OPAPI) { + (static_cast(dstPtr->paramVal))->Copy(*(static_cast(srcPtr->paramVal))); + (static_cast(srcPtr->paramVal))->Release(); + } else if (srcPtr->paramType == c10_npu::queue::COMPILE_AND_EXECUTE) { (static_cast(dstPtr->paramVal))->CopyEx(*(static_cast(srcPtr->paramVal))); (static_cast(srcPtr->paramVal))->hostMemory.clear(); } diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index 24c17fc919b32ecafcbd0ea4ac89ebe0127741e2..054f88849c8ab30da00572a6337c42d28b4ddeeb 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ 
b/torch_npu/csrc/framework/OpParamMaker.h @@ -244,10 +244,11 @@ public: // export op execute params void ExportParams(ExecuteParas ¶ms) { + static const auto max_len = sizeof(ExecuteParas::opType); if (opName.length() + 1 < sizeof(ExecuteParas::opType)) { opName.copy(params.opType, opName.length() + 1); } else { - opName.copy(params.opType, sizeof(ExecuteParas::opType) - 1); + opName.copy(params.opType, max_len - 1); } params.attr = execParam.attr; // make params diff --git a/torch_npu/csrc/framework/aoe/AoeUtils.cpp b/torch_npu/csrc/framework/aoe/AoeDumpGraphManager.cpp similarity index 64% rename from torch_npu/csrc/framework/aoe/AoeUtils.cpp rename to torch_npu/csrc/framework/aoe/AoeDumpGraphManager.cpp index f5f141116614c7e3f5eeb507f3510408236d7949..676edada7b01e4eb3b11da91c9643c67d4525ff5 100644 --- a/torch_npu/csrc/framework/aoe/AoeUtils.cpp +++ b/torch_npu/csrc/framework/aoe/AoeDumpGraphManager.cpp @@ -1,45 +1,52 @@ #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" -#include "torch_npu/csrc/framework/aoe/AoeUtils.h" +#include "torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h" namespace at_npu { namespace native { namespace aoe { -void AoeDumpGraphManager::SetDumpGraphPath(const std::string& dump_path) { +void AoeDumpGraphManager::SetDumpGraphPath(const std::string& dump_path) +{ autotune_graphdumppath = dump_path; } -std::string AoeDumpGraphManager::GetDumpGraphPath() const { +std::string AoeDumpGraphManager::GetDumpGraphPath() const +{ return autotune_graphdumppath; } -aclGraphDumpOption* AoeDumpGraphManager::CreateGraphDumpOption() { +aclGraphDumpOption* AoeDumpGraphManager::CreateGraphDumpOption() +{ AclGraphDumpOption = AclCreateGraphDumpOpt(); return AclGraphDumpOption; } -void AoeDumpGraphManager::DestropyGraphDumpOption() { +void AoeDumpGraphManager::DestropyGraphDumpOption() +{ AclDestroyGraphDumpOpt(AclGraphDumpOption); - AclGraphDumpOption = NULL; + AclGraphDumpOption = nullptr; } -void AoeDumpGraphManager::EnableAoe() { +void AoeDumpGraphManager::EnableAoe() +{ aoe_enable = true; } -bool AoeDumpGraphManager::IsAoeEnabled() const { +bool AoeDumpGraphManager::IsAoeEnabled() const +{ return aoe_enable; } -bool AoeDumpGraphManager::IsInWhitelist(const std::string &opName) const { - if (white_list_.find(opName) != white_list_.end()) - { +bool AoeDumpGraphManager::IsInWhitelist(const std::string &opName) const +{ + if (white_list_.find(opName) != white_list_.end()) { return true; } return false; } -AoeDumpGraphManager& aoe_manager() { +AoeDumpGraphManager& aoe_manager() +{ static AoeDumpGraphManager instance; return instance; } diff --git a/torch_npu/csrc/framework/aoe/AoeUtils.h b/torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h similarity index 98% rename from torch_npu/csrc/framework/aoe/AoeUtils.h rename to torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h index 54bac615b3b9e613523e50037f263b8de9ceea97..cae67e2d595ec78ef6d07594534a553c6f1c3a01 100644 --- a/torch_npu/csrc/framework/aoe/AoeUtils.h +++ b/torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h @@ -24,7 +24,7 @@ public: bool aoe_enable = false; // to save graph for autotune, default path is ./ std::string autotune_graphdumppath = "./"; - aclGraphDumpOption* AclGraphDumpOption = NULL; + aclGraphDumpOption* AclGraphDumpOption = nullptr; std::unordered_set white_list_ = { "Abs", "AccumulateNV2", diff --git a/torch_npu/csrc/framework/autograd/FunctionsManual.cpp b/torch_npu/csrc/framework/autograd/FunctionsManual.cpp index 
7bba1b6d8804eb5868aee2a1eb72f46c92ccc91f..92084e5252280e6a633dd608aad863c89d182ac2 100644 --- a/torch_npu/csrc/framework/autograd/FunctionsManual.cpp +++ b/torch_npu/csrc/framework/autograd/FunctionsManual.cpp @@ -19,41 +19,48 @@ using at::IntArrayRef; using at::TensorList; using at::areAnyTensorSubclassLike; -Tensor apply_loss_reduction(const Tensor& unreduced, int64_t reduction) { - if (reduction == at::Reduction::Mean) { - return unreduced.mean(); - } else if (reduction == at::Reduction::Sum) { - return unreduced.sum(); - } - return unreduced; +Tensor apply_loss_reduction(const Tensor& unreduced, int64_t reduction) +{ + if (reduction == at::Reduction::Mean) { + return unreduced.mean(); + } else if (reduction == at::Reduction::Sum) { + return unreduced.sum(); + } + return unreduced; } bool any_variable_defined(const variable_list& variables) { - for (const auto& variable : variables) { - if (variable.defined()) { - return true; + for (const auto& variable : variables) { + if (variable.defined()) { + return true; + } } - } - return false; + return false; } bool isDefined(const c10::optional& t) { - return t.has_value() && t->defined(); + return t.has_value() && t->defined(); } -Tensor toNonOptTensor(const c10::optional& t) { - return t.has_value() ? *t : Tensor(); +Tensor toNonOptTensor(const c10::optional& t) +{ + return t.has_value() ? *t : Tensor(); } -Tensor toNonOptFwGrad(const c10::optional& t) { - return (t.has_value() && t->defined()) ? t->_fw_grad(/* level */ 0) : Tensor(); +Tensor toNonOptFwGrad(const c10::optional& t) +{ + // 0: level 0 + return (t.has_value() && t->defined()) ? t->_fw_grad(0) : Tensor(); } -Tensor toNonOptPrimal(const c10::optional& t) { - return (t.has_value() && t->defined()) ? t->_fw_primal(/* level */ 0) : Tensor(); +Tensor toNonOptPrimal(const c10::optional& t) +{ + // 0: level 0 + return (t.has_value() && t->defined()) ? 
t->_fw_primal(0) : Tensor(); } -void copy_range(variable_list& out, IndexRange range, const Tensor& t) { +void copy_range(variable_list& out, IndexRange range, const Tensor& t) +{ AT_ASSERT(range.second <= out.size(), OPS_ERROR(ErrCode::PARAM)); AT_ASSERTM(range.second - range.first == 1, "inconsistent range for Tensor output", @@ -71,34 +78,37 @@ void copy_range(variable_list& out, IndexRange range, at::ArrayRef t) { template T not_implemented_base(const char* name, const char* reason) { - std::string msg = c10::str("the derivative for '", name, "' is not implemented."); - if (strlen(reason) > 0) { - msg = c10::str(msg, " ", reason); - }; - TORCH_CHECK_NOT_IMPLEMENTED(false, msg); + std::string msg = c10::str("the derivative for '", name, "' is not implemented."); + if (strlen(reason) > 0) { + msg = c10::str(msg, " ", reason); + }; + TORCH_CHECK_NOT_IMPLEMENTED(false, msg); } -Tensor not_implemented(const char* name, const char* reason) { - return not_implemented_base(name, reason); +Tensor not_implemented(const char* name, const char* reason) +{ + return not_implemented_base(name, reason); } -std::vector not_implemented_list(const char* name, const char* reason) { - return not_implemented_base>(name, reason); +std::vector not_implemented_list(const char* name, const char* reason) +{ + return not_implemented_base>(name, reason); } -Tensor maybe_multiply(const Tensor& t, const Scalar& s) { - bool is_one = false; - if (s.isFloatingPoint()) { - is_one = s.toSymFloat() == 1; - } else if (s.isIntegral(true)) { - is_one = s.toSymInt() == 1; - } - - if (is_one) { - return t; - } else { - return t * s; - } +Tensor maybe_multiply(const Tensor& t, const Scalar& s) +{ + bool is_one = false; + if (s.isFloatingPoint()) { + is_one = s.toSymFloat() == 1; + } else if (s.isIntegral(true)) { + is_one = s.toSymInt() == 1; + } + + if (is_one) { + return t; + } else { + return t * s; + } } } // namespace details diff --git a/torch_npu/csrc/framework/autograd/FunctionsManual.h b/torch_npu/csrc/framework/autograd/FunctionsManual.h index 522edbdb32c3d7622dcaf7979f035751f78c9570..495a97eb2bf390f2cb2b06935b1c632dca487b77 100644 --- a/torch_npu/csrc/framework/autograd/FunctionsManual.h +++ b/torch_npu/csrc/framework/autograd/FunctionsManual.h @@ -20,13 +20,13 @@ namespace details { // A simple way to imperatively compute index ranges for slots // that have been flattened struct IndexRangeGenerator { - IndexRange range(size_t range_size) { - i += range_size; - return {i - range_size, i}; - } - size_t size() { return i; } - private: - size_t i = 0; + IndexRange range(size_t range_size) { + i += range_size; + return {i - range_size, i}; + } + size_t size() const { return i; } + private: + size_t i = 0; }; Tensor toNonOptFwGrad(const c10::optional& t); diff --git a/torch_npu/csrc/framework/autograd/VariableTypeManual.cpp b/torch_npu/csrc/framework/autograd/VariableTypeManual.cpp index 28be6051d675c232037804b5185aecb2eb7a2049..a81a5c2b942088bc85239cf3f57d86971613f838 100644 --- a/torch_npu/csrc/framework/autograd/VariableTypeManual.cpp +++ b/torch_npu/csrc/framework/autograd/VariableTypeManual.cpp @@ -22,93 +22,104 @@ namespace at_npu { namespace autograd { namespace VariableType { -std::vector allTypesForBackends(at::ArrayRef backends) { - std::vector res; - res.reserve(backends.size()); - for (auto p : backends) { - for (const auto s : c10::irange(static_cast(ScalarType::NumOptions))) { - auto& type = getDeprecatedTypeProperties(static_cast(p), static_cast(s)); - res.emplace_back(&type); +std::vector 
allTypesForBackends(at::ArrayRef backends) +{ + std::vector res; + res.reserve(backends.size()); + for (auto p : backends) { + for (const auto s : c10::irange(static_cast(ScalarType::NumOptions))) { + auto& type = getDeprecatedTypeProperties(static_cast(p), static_cast(s)); + res.emplace_back(&type); + } } - } - return res; + return res; } -C10_EXPORT std::vector allCPUTypes() { - return allTypesForBackends({ Backend::CPU, Backend::SparseCPU }); +C10_EXPORT std::vector allCPUTypes() +{ + return allTypesForBackends({ Backend::CPU, Backend::SparseCPU }); } namespace { -const Variable& checked_cast_variable(const Tensor& t, const char* name, int pos) { - if (!t.defined()) { - AT_ERROR("Expected a proper Tensor but got None (or an undefined Tensor in C++) ", - "for argument #", pos, " '", name, "'"); - } - return t; +const Variable& checked_cast_variable(const Tensor& t, const char* name, int pos) +{ + if (!t.defined()) { + AT_ERROR("Expected a proper Tensor but got None (or an undefined Tensor in C++) ", + "for argument #", pos, " '", name, "'"); + } + return t; } -Variable& checked_cast_variable(Tensor& t, const char* name, int pos) { - if (!t.defined()) { - AT_ERROR("Expected a proper Tensor but got None (or an undefined Tensor in C++) ", - "for argument #", pos, " '", name, "'"); - } - return t; +Variable& checked_cast_variable(Tensor& t, const char* name, int pos) +{ + if (!t.defined()) { + AT_ERROR("Expected a proper Tensor but got None (or an undefined Tensor in C++) ", + "for argument #", pos, " '", name, "'"); + } + return t; } } // namespace -const Tensor& unpack(const Tensor& t, const char* name, int pos) { - return checked_cast_variable(t, name, pos); +const Tensor& unpack(const Tensor& t, const char* name, int pos) +{ + return checked_cast_variable(t, name, pos); } -Tensor& unpack(Tensor& t, const char* name, int pos) { - return checked_cast_variable(t, name, pos); +Tensor& unpack(Tensor& t, const char* name, int pos) +{ + return checked_cast_variable(t, name, pos); } -Tensor unpack_opt(const Tensor& t, const char* name, int pos) { - if (!t.defined()) { - return Tensor(); - } - return unpack(t, name, pos); +Tensor unpack_opt(const Tensor& t, const char* name, int pos) +{ + if (!t.defined()) { + return Tensor(); + } + return unpack(t, name, pos); } -std::vector unpack(at::TensorList tl, const char* name, int pos) { - std::vector ret(tl.size()); - for (const auto i : c10::irange(tl.size())) { - const auto &t = tl[i]; - if (!t.defined()) { - continue; +std::vector unpack(at::TensorList tl, const char* name, int pos) +{ + std::vector ret(tl.size()); + for (const auto i : c10::irange(tl.size())) { + const auto &t = tl[i]; + if (!t.defined()) { + continue; + } + ret[i] = static_cast(t); } - ret[i] = static_cast(t); - } - return ret; + (void) name; + (void) pos; + return ret; } namespace { // Taken from codegened version -Tensor _fw_primal(c10::DispatchKeySet ks, const Tensor& self, int64_t level) { - auto& self_ = unpack(self, "self", 0); - std::shared_ptr grad_fn; - if (compute_requires_grad(self)) { - grad_fn = std::make_shared(); - grad_fn->set_next_edges(collect_next_edges(self)); - } - - auto result = ([&]() { - at::AutoDispatchBelowAutograd guard; - return at::redispatch::_fw_primal(ks& c10::after_autograd_keyset, self_, level); - })(); - - if (grad_fn) { - set_history(flatten_tensor_args(result), grad_fn); - } - if (isFwGradDefined(self)) { - // Modified from original codegen - // We explicitly want to ignore the forward grad at the given level - TORCH_CHECK(level == 0, "Invalid 
level given to _fw_primal", OPS_ERROR(ErrCode::VALUE)); - // End modified from original codegen - } - return result; +Tensor _fw_primal(c10::DispatchKeySet ks, const Tensor& self, int64_t level) +{ + auto& self_ = unpack(self, "self", 0); + std::shared_ptr grad_fn; + if (compute_requires_grad(self)) { + grad_fn = std::make_shared(); + grad_fn->set_next_edges(collect_next_edges(self)); + } + + auto result = ([&]() { + at::AutoDispatchBelowAutograd guard; + return at::redispatch::_fw_primal(ks& c10::after_autograd_keyset, self_, level); + })(); + + if (grad_fn) { + set_history(flatten_tensor_args(result), grad_fn); + } + if (isFwGradDefined(self)) { + // Modified from original codegen + // We explicitly want to ignore the forward grad at the given level + TORCH_CHECK(level == 0, "Invalid level given to _fw_primal", OPS_ERROR(ErrCode::VALUE)); + // End modified from original codegen + } + return result; } } // namespace diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp index cc134627cf30ec28ae440b676f14ed379681df01..a3992a878ff4e311564b3799fcd28c74e484bffe 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp +++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp @@ -1,6 +1,6 @@ +#include #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" -#include namespace at_npu { namespace native { @@ -10,74 +10,75 @@ OptimizationCases TransContiguous::optCasesAnyFormat = {"reshape", "slice"}; ska::flat_hash_map TransContiguous::cached_contiguous_opt; -ContiguousTensorDesc TransContiguous::GetTensorDescInfo( - const at::Tensor &src, const OptimizationCases &opt_cases) { - auto src_base_info = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); - c10::SmallVector src_size_inferred; - c10::SmallVector src_stride_inferred; - c10::SmallVector src_storage_size_inferred = - src_base_info.storage_sizes_; - if (src.dim() == 0) { - src_size_inferred = {1}; - src_stride_inferred = {1}; - if (src_storage_size_inferred.size() == 0) { - src_storage_size_inferred = {1}; +ContiguousTensorDesc TransContiguous::GetTensorDescInfo(const at::Tensor &src, const OptimizationCases &opt_cases) +{ + auto src_base_info = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); + c10::SmallVector src_size_inferred; + c10::SmallVector src_stride_inferred; + c10::SmallVector src_storage_size_inferred = src_base_info.storage_sizes_; + if (src.dim() == 0) { + src_size_inferred = {1}; + src_stride_inferred = {1}; + if (src_storage_size_inferred.size() == 0) { + src_storage_size_inferred = {1}; + } + } else { + src_size_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.sizes()); + src_stride_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.strides()); + } + ContiguousTensorDesc src_desc = { + src.is_contiguous(), src_size_inferred, + src_stride_inferred, src.storage_offset(), + src_base_info.base_sizes_, src_base_info.base_strides_, + src_storage_size_inferred, src_base_info.base_offset_, + src_base_info.npu_format_, opt_cases}; + if (src_desc.opt_cases_.empty()) { + src_desc.find_match_optimization_cases(); } - } else { - src_size_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.sizes()); - src_stride_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.strides()); - } - ContiguousTensorDesc src_desc = { - src.is_contiguous(), src_size_inferred, - src_stride_inferred, src.storage_offset(), - src_base_info.base_sizes_, 
src_base_info.base_strides_, - src_storage_size_inferred, src_base_info.base_offset_, - src_base_info.npu_format_, opt_cases}; - if (src_desc.opt_cases_.empty()) { - src_desc.find_match_optimization_cases(); - } - return src_desc; + return src_desc; } -bool TransContiguous::CheckClone(const at::Tensor &src, at::Tensor &self) { - // self tensor may not be temporary constructed empty tensor from src, so: - // 1. contiguous storage is needed:storage_offset and numels eq - // 2. full memory copy: size match between src and self - if (StorageDescHelper::OffsetAreMatch(&self) && self.is_contiguous() && - src.sizes().equals(self.sizes()) && - self.sizes().equals(torch_npu::NPUBridge::GetNpuStorageImpl(self)->get_npu_desc().base_sizes_)) { - return true; - } - return false; +bool TransContiguous::CheckClone(const at::Tensor &src, at::Tensor &self) +{ + // self tensor may not be temporary constructed empty tensor from src, so: + // 1. contiguous storage is needed:storage_offset and numels eq + // 2. full memory copy: size match between src and self + if (StorageDescHelper::OffsetAreMatch(&self) && self.is_contiguous() && + src.sizes().equals(self.sizes()) && + self.sizes().equals(torch_npu::NPUBridge::GetNpuStorageImpl(self)->get_npu_desc().base_sizes_)) { + return true; + } + return false; } bool TransContiguous::can_optimize_(ContiguousTensorDesc &tensor_desc) { - for (auto opt_case : tensor_desc.opt_cases_) { - bool res = register_opt::CopyOptRegister::GetInstance()->CanOptimize( - opt_case, tensor_desc); + for (auto opt_case : tensor_desc.opt_cases_) { + bool res = register_opt::CopyOptRegister::GetInstance()->CanOptimize(opt_case, tensor_desc); if (res) { - // refresh patterns to only keep optimized pattern - tensor_desc.opt_cases_.clear(); - tensor_desc.opt_cases_.emplace_back(opt_case); - return true; + // refresh patterns to only keep optimized pattern + tensor_desc.opt_cases_.clear(); + tensor_desc.opt_cases_.emplace_back(opt_case); + return true; + } } - } - return false; + return false; } -bool TransContiguous::CanOptimize(ContiguousTensorDesc &tensor_desc) { - return can_optimize_(tensor_desc); +bool TransContiguous::CanOptimize(ContiguousTensorDesc &tensor_desc) +{ + return can_optimize_(tensor_desc); } -bool TransContiguous::CanOptimize(const at::Tensor &tensor, - const OptimizationCases &opt_cases) { - ContiguousTensorDesc tensor_desc = GetTensorDescInfo(tensor, opt_cases); - return can_optimize_(tensor_desc); +bool TransContiguous::CanOptimize(const at::Tensor &tensor, const OptimizationCases &opt_cases) +{ + ContiguousTensorDesc tensor_desc = GetTensorDescInfo(tensor, opt_cases); + return can_optimize_(tensor_desc); } bool TransContiguous::contiguous_optimize_with_anyformat_( - at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) { - if (!CheckClone(src, self)) { + at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) +{ + if (!CheckClone(src, self)) { return false; } for (auto &opt_case : src_desc.opt_cases_) { @@ -87,97 +88,99 @@ bool TransContiguous::contiguous_optimize_with_anyformat_( return true; } } - return false; + return false; } - size_t GetHash_(const c10::SmallVector& small_vector_size) - { - size_t seed = 0; - for (size_t i = 0; i < small_vector_size.size(); i++) { - seed ^= static_cast(small_vector_size[i]) + (seed << 6) + (seed >> 2); - } - return seed; +size_t GetHash_(const c10::SmallVector& small_vector_size) +{ + size_t seed = 0; + for (size_t i = 0; i < small_vector_size.size(); i++) { + seed ^= 
static_cast(small_vector_size[i]) + (seed << 6) + (seed >> 2); } + return seed; +} + +size_t GetHash_(const ContiguousTensorDesc &src_desc) +{ + size_t hash_src_desc = (GetHash_(src_desc.sizes_)<<52) + + (GetHash_(src_desc.base_sizes_)<<40) + + (GetHash_(src_desc.strides_)<<28) + + (GetHash_(src_desc.base_strides_)<<16) + + (static_cast(src_desc.offset_) << 4) + + src_desc.npu_format_; + return hash_src_desc; +} - size_t GetHash_(const ContiguousTensorDesc &src_desc) - { - size_t hash_src_desc = (GetHash_(src_desc.sizes_)<<52) + - (GetHash_(src_desc.base_sizes_)<<40) + - (GetHash_(src_desc.strides_)<<28) + - (GetHash_(src_desc.base_strides_)<<16) + - (static_cast(src_desc.offset_) << 4) + - src_desc.npu_format_; - return hash_src_desc; +bool equalDesc(const ContiguousTensorDesc &src_desc, const ContiguousTensorDesc &desc_desc) +{ + if (src_desc.sizes_ == desc_desc.sizes_ && + src_desc.base_sizes_ == desc_desc.base_sizes_ && + src_desc.strides_ == desc_desc.strides_ && + src_desc.base_strides_ == desc_desc.base_strides_ && + src_desc.offset_ == desc_desc.offset_ && + src_desc.npu_format_ == desc_desc.npu_format_) { + return true; } + return false; +} - bool equalDesc(const ContiguousTensorDesc &src_desc, const ContiguousTensorDesc &desc_desc) - { - if (src_desc.sizes_ == desc_desc.sizes_ && - src_desc.base_sizes_ == desc_desc.base_sizes_ && - src_desc.strides_ == desc_desc.strides_ && - src_desc.base_strides_ == desc_desc.base_strides_ && - src_desc.offset_ == desc_desc.offset_ && - src_desc.npu_format_ == desc_desc.npu_format_) { - return true; - } +bool TransContiguous::cached_contiguous_optimize_with_anyformat_( + at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) +{ + // No cached, try caching + if (!CheckClone(src, self)) { return false; } - - bool TransContiguous::cached_contiguous_optimize_with_anyformat_( - at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) - { - // No cached, try caching - if (!CheckClone(src, self)) { - return false; - } - src_desc.hash_src_desc = GetHash_(src_desc); - auto it = TransContiguous::cached_contiguous_opt.find(src_desc.hash_src_desc); - if (it != TransContiguous::cached_contiguous_opt.end()) { - // Cached - if (equalDesc(src_desc, it->second.contiguous_tensor_desc)) { - src_desc.cached_contiguous = true; - auto &opt_case = it->second.cached_opt_case; - return register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, - src, src_desc); - } - return contiguous_optimize_with_anyformat_(self, src, src_desc); + src_desc.hash_src_desc = GetHash_(src_desc); + auto it = TransContiguous::cached_contiguous_opt.find(src_desc.hash_src_desc); + if (it != TransContiguous::cached_contiguous_opt.end()) { + // Cached + if (equalDesc(src_desc, it->second.contiguous_tensor_desc)) { + src_desc.cached_contiguous = true; + auto &opt_case = it->second.cached_opt_case; + return register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, + src, src_desc); } + return contiguous_optimize_with_anyformat_(self, src, src_desc); + } - for (auto &opt_case : src_desc.opt_cases_) { - bool res = false; - if (TransContiguous::cached_contiguous_opt.size() >= CachedMaxSize) { - res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, src, src_desc); - } else { - src_desc.cached_contiguous = false; - res = register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, src, src_desc); - } - if (res) { - return true; - } + for (auto &opt_case : src_desc.opt_cases_) { + bool res = false; + if 
(TransContiguous::cached_contiguous_opt.size() >= CachedMaxSize) { + res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, src, src_desc); + } else { + src_desc.cached_contiguous = false; + res = register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, src, src_desc); + } + if (res) { + return true; } - return false; } + return false; +} bool TransContiguous::ContiguousOptimizeWithAnyFormat( at::Tensor &self, const at::Tensor &src, - const OptimizationCases &opt_cases) { - ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); - return contiguous_optimize_with_anyformat_(self, src, src_desc); + const OptimizationCases &opt_cases) +{ + ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); + return contiguous_optimize_with_anyformat_(self, src, src_desc); } c10::optional TransContiguous::ContiguousOptimizeWithAnyFormat( - const at::Tensor &src, const OptimizationCases &opt_cases) { - TORCH_CHECK(src.device().type() == c10::DeviceType::PrivateUse1, - "Expected all tensors to be on the same device. " - "Expected NPU tensor, please check whether the input tensor device is correct.", - OPS_ERROR(ErrCode::TYPE)); - auto self = OpPreparation::ApplyTensorWithFormat( - src.sizes(), src.options(), torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); - ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); - if (cached_contiguous_optimize_with_anyformat_(self, src, src_desc)) { - return self; - } - return c10::nullopt; + const at::Tensor &src, const OptimizationCases &opt_cases) +{ + TORCH_CHECK(src.device().type() == c10::DeviceType::PrivateUse1, + "Expected all tensors to be on the same device. " + "Expected NPU tensor, please check whether the input tensor device is correct.", + OPS_ERROR(ErrCode::TYPE)); + auto self = OpPreparation::ApplyTensorWithFormat( + src.sizes(), src.options(), torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); + ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); + if (cached_contiguous_optimize_with_anyformat_(self, src, src_desc)) { + return self; + } + return c10::nullopt; } bool TransContiguous::ContiguousOptimizeWithBaseFormat( @@ -197,32 +200,32 @@ bool TransContiguous::ContiguousOptimizeWithBaseFormat( } - at::Tensor TransContiguous::view_tensor(const at::Tensor& self, - int64_t offset, - const c10::IntArrayRef& sizes, - const c10::IntArrayRef& strides) - { - at::Tensor self_; - if (self.is_quantized()) { - self_ = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(self.storage()), - self.key_set(), - self.dtype(), - get_qtensorimpl(self)->quantizer()); - } else { - self_ = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(self.storage()), - self.key_set(), - self.dtype()); - } - auto* self_tmp_ = self_.unsafeGetTensorImpl(); - self_tmp_->set_storage_offset(offset); - self_tmp_->set_sizes_and_strides(sizes, strides); - at::namedinference::propagate_names(self_, self); - return self_; +at::Tensor TransContiguous::view_tensor(const at::Tensor& self, + int64_t offset, + const c10::IntArrayRef& sizes, + const c10::IntArrayRef& strides) +{ + at::Tensor self_; + if (self.is_quantized()) { + self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(self.storage()), + self.key_set(), + self.dtype(), + get_qtensorimpl(self)->quantizer()); + } else { + self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(self.storage()), + self.key_set(), + self.dtype()); } + auto* self_tmp_ = 
self_.unsafeGetTensorImpl(); + self_tmp_->set_storage_offset(offset); + self_tmp_->set_sizes_and_strides(sizes, strides); + at::namedinference::propagate_names(self_, self); + return self_; +} } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.h b/torch_npu/csrc/framework/contiguous/ContiguousOpt.h index d7eef999035242e507f3f9a515f03e5601e761e9..e4679c9e30f9994396e13fea48876c7c0868d850 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.h +++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.h @@ -1,10 +1,11 @@ #ifndef __PULGIN_NATIVE_CONTIGUOUS_CONTIGUOUS_OPTIMIZE__ #define __PULGIN_NATIVE_CONTIGUOUS_CONTIGUOUS_OPTIMIZE__ +#include #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/framework/contiguous/contiguous_register.h" #include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include + namespace at_npu { namespace native { @@ -28,37 +29,37 @@ namespace native { class TransContiguous { public: - TransContiguous() {} - virtual ~TransContiguous() {} - static bool CheckClone(const at::Tensor &src, at::Tensor &self); - static ContiguousTensorDesc - GetTensorDescInfo(const at::Tensor &src, - const OptimizationCases &opt_cases = optCasesDefault); - static bool can_optimize_(ContiguousTensorDesc &tensor_desc); - static bool CanOptimize(ContiguousTensorDesc &tensor_desc); - static bool CanOptimize(const at::Tensor &tensor, - const OptimizationCases &opt_cases); - static bool - contiguous_optimize_with_anyformat_(at::Tensor &self, const at::Tensor &src, - ContiguousTensorDesc &src_desc); - static bool ContiguousOptimizeWithAnyFormat( - at::Tensor &self, const at::Tensor &src, - const OptimizationCases &opt_cases = optCasesAnyFormat); - static c10::optional ContiguousOptimizeWithAnyFormat( - const at::Tensor &src, - const OptimizationCases &opt_cases = optCasesAnyFormat); - static bool ContiguousOptimizeWithBaseFormat( - at::Tensor &self, const at::Tensor &src, - const OptimizationCases &opt_cases = optCasesDefault, - bool OpenCombined = true); + TransContiguous() {} + virtual ~TransContiguous() {} + static bool CheckClone(const at::Tensor &src, at::Tensor &self); + static ContiguousTensorDesc + GetTensorDescInfo(const at::Tensor &src, + const OptimizationCases &opt_cases = optCasesDefault); + static bool can_optimize_(ContiguousTensorDesc &tensor_desc); + static bool CanOptimize(ContiguousTensorDesc &tensor_desc); + static bool CanOptimize(const at::Tensor &tensor, + const OptimizationCases &opt_cases); + static bool + contiguous_optimize_with_anyformat_(at::Tensor &self, const at::Tensor &src, + ContiguousTensorDesc &src_desc); + static bool ContiguousOptimizeWithAnyFormat( + at::Tensor &self, const at::Tensor &src, + const OptimizationCases &opt_cases = optCasesAnyFormat); + static c10::optional ContiguousOptimizeWithAnyFormat( + const at::Tensor &src, + const OptimizationCases &opt_cases = optCasesAnyFormat); + static bool ContiguousOptimizeWithBaseFormat( + at::Tensor &self, const at::Tensor &src, + const OptimizationCases &opt_cases = optCasesDefault, + bool OpenCombined = true); static bool cached_contiguous_optimize_with_anyformat_( - at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc); + at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc); static ska::flat_hash_map cached_contiguous_opt; static at::Tensor view_tensor(const at::Tensor& self, int64_t offset, const c10::IntArrayRef& sizes, const c10::IntArrayRef& 
strides); private: - static OptimizationCases optCasesDefault; - static OptimizationCases optCasesAnyFormat; + static OptimizationCases optCasesDefault; + static OptimizationCases optCasesAnyFormat; }; } // namespace native diff --git a/torch_npu/csrc/framework/contiguous/ContiguousUtils.cpp b/torch_npu/csrc/framework/contiguous/ContiguousUtils.cpp index 0ad48916696bb791abd466400274d223faaf758e..7803e36f28b6d21e862fe2847176b1e91cb3d425 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousUtils.cpp +++ b/torch_npu/csrc/framework/contiguous/ContiguousUtils.cpp @@ -4,54 +4,56 @@ namespace at_npu { namespace native { void ContiguousTensorDesc::refresh_contiguous_using_size_and_stride() { - if (c10::multiply_integers(sizes_) == 0) { - is_contiguous_ = true; - } - int64_t infer_axis_size = 1; - for (int64_t dim = static_cast(sizes_.size()) - 1; dim >= 0; dim--) { - if (sizes_[dim] != 1) { - if (strides_[dim] == infer_axis_size) { - infer_axis_size *= sizes_[dim]; - } else { - is_contiguous_ = false; - return; - } + if (c10::multiply_integers(sizes_) == 0) { + is_contiguous_ = true; + } + int64_t infer_axis_size = 1; + for (int64_t dim = static_cast(sizes_.size()) - 1; dim >= 0; dim--) { + if (sizes_[dim] != 1) { + if (strides_[dim] == infer_axis_size) { + infer_axis_size *= sizes_[dim]; + } else { + is_contiguous_ = false; + return; + } + } } - } - is_contiguous_ = true; + is_contiguous_ = true; } void ContiguousTensorDesc::reset_optimization_cases( const OptimizationCases &opt_cases) { - opt_cases_ = opt_cases; -} + opt_cases_ = opt_cases; + } -void ContiguousTensorDesc::add_optimization_case(const std::string &opt_case) { - opt_cases_.emplace_back(opt_case); +void ContiguousTensorDesc::add_optimization_case(const std::string &opt_case) +{ + opt_cases_.emplace_back(opt_case); } -void ContiguousTensorDesc::find_match_optimization_cases() { - for (const auto i : c10::irange(sizes_.size())) { - if (strides_[i] == 0) { - opt_cases_.emplace_back("broadcast"); - return; +void ContiguousTensorDesc::find_match_optimization_cases() +{ + for (const auto i : c10::irange(sizes_.size())) { + if (strides_[i] == 0) { + opt_cases_.emplace_back("broadcast"); + return; + } + } + + for (const auto i : c10::irange(strides_.size() - 1)) { + if (strides_[i] < strides_[i + 1]) { + opt_cases_.emplace_back("permute"); + return; + } } - } - for (const auto i : c10::irange(strides_.size() - 1)) { - if (strides_[i] < strides_[i + 1]) { - opt_cases_.emplace_back("permute"); - return; + // Considering combined-cases, we cannot split slice cases any further. + if (c10::multiply_integers(sizes_) < c10::multiply_integers(base_sizes_)) { + opt_cases_.emplace_back("slice"); + opt_cases_.emplace_back("select"); + opt_cases_.emplace_back("indexing"); + return; } - } - - // Considering combined-cases, we cannot split slice cases any further. 
- if (c10::multiply_integers(sizes_) < c10::multiply_integers(base_sizes_)) { - opt_cases_.emplace_back("slice"); - opt_cases_.emplace_back("select"); - opt_cases_.emplace_back("indexing"); - return; - } } } // namespace native diff --git a/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp b/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp index 48a3d88ea326ba471ecb89d282bc0e51ecb0b0b3..bd29f88e2744c6c155a3b49821914e6d442f1286 100644 --- a/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp +++ b/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp @@ -4,26 +4,28 @@ namespace at_npu { namespace native { -bool can_use_memecpy_for_NZ_format(const ContiguousTensorDesc &tensor_desc) { - int64_t tensor_shape_size = static_cast(tensor_desc.sizes_.size()); - int64_t base_shape_size = static_cast(tensor_desc.base_sizes_.size()); - // No padding&&offset!=0 at the same time. e.g. x(3, 15, 16)[1:] - if (((tensor_desc.sizes_[tensor_shape_size - 1] % 16 != 0) || +bool can_use_memecpy_for_NZ_format(const ContiguousTensorDesc &tensor_desc) +{ + int64_t tensor_shape_size = static_cast(tensor_desc.sizes_.size()); + int64_t base_shape_size = static_cast(tensor_desc.base_sizes_.size()); + // No padding&&offset!=0 at the same time. e.g. x(3, 15, 16)[1:] + if (((tensor_desc.sizes_[tensor_shape_size - 1] % 16 != 0) || (tensor_desc.sizes_[tensor_shape_size - 2] % 16 != 0)) && - tensor_desc.offset_ != 0) { - return false; - } - // Make sure that sizes of last 2 dims don't change - if (tensor_desc.sizes_[tensor_shape_size - 1] != + tensor_desc.offset_ != 0) { + return false; + } + // Make sure that sizes of last 2 dims don't change + if (tensor_desc.sizes_[tensor_shape_size - 1] != tensor_desc.base_sizes_[base_shape_size - 1] || - tensor_desc.sizes_[tensor_shape_size - 2] != + tensor_desc.sizes_[tensor_shape_size - 2] != tensor_desc.base_sizes_[base_shape_size - 2]) { - return false; - } - return true; + return false; + } + return true; } -bool can_use_memcpy_for_other_format(const ContiguousTensorDesc &tensor_desc) { +bool can_use_memcpy_for_other_format(const ContiguousTensorDesc &tensor_desc) +{ // torch.flatten(x) case should be removed if (tensor_desc.sizes_.size() < 2) { return false; @@ -48,36 +50,38 @@ bool can_use_memcpy_for_other_format(const ContiguousTensorDesc &tensor_desc) { } } -bool check_reshape_match(const ContiguousTensorDesc &self_desc, - const ContiguousTensorDesc &src_desc) { - // For all format, both src and self are taken into consideration - if (check_reshape_match(src_desc) && check_reshape_match(self_desc)) { - // tensor numels eqs for self and src tensor. i.e. make sure that storage - // keep same. - if (!(self_desc.sizes_ == src_desc.sizes_)) { - return false; +bool check_reshape_match(const ContiguousTensorDesc &self_desc, const ContiguousTensorDesc &src_desc) +{ + // For all format, both src and self are taken into consideration + if (check_reshape_match(src_desc) && check_reshape_match(self_desc)) { + // tensor numels eqs for self and src tensor. i.e. make sure that storage + // keep same. 
+ if (!(self_desc.sizes_ == src_desc.sizes_)) { + return false; + } + + return true; } + return false; +} +bool check_reshape_match(const ContiguousTensorDesc &tensor_desc) +{ + // (case 1) Reshape tensor should be contiguous + if (!tensor_desc.is_contiguous_) { + return false; + } + // (case2) for other format, sizes at key dims should remain unchanged + if (!FormatHelper::IsBaseFormatType(tensor_desc.npu_format_)) { + return can_use_memcpy_for_other_format(tensor_desc); + } return true; - } - return false; } -bool check_reshape_match(const ContiguousTensorDesc &tensor_desc) { - // (case 1) Reshape tensor should be contiguous - if (!tensor_desc.is_contiguous_) { - return false; - } - // (case2) for other format, sizes at key dims should remain unchanged - if (!FormatHelper::IsBaseFormatType(tensor_desc.npu_format_)) { +bool CanUseMemcpyForOtherFormat(const at::Tensor &tensor) +{ + ContiguousTensorDesc tensor_desc = TransContiguous::GetTensorDescInfo(tensor); return can_use_memcpy_for_other_format(tensor_desc); - } - return true; -} - -bool CanUseMemcpyForOtherFormat(const at::Tensor &tensor) { - ContiguousTensorDesc tensor_desc = TransContiguous::GetTensorDescInfo(tensor); - return can_use_memcpy_for_other_format(tensor_desc); } } // namespace native diff --git a/torch_npu/csrc/framework/contiguous/ReshapeOpt.h b/torch_npu/csrc/framework/contiguous/ReshapeOpt.h index 7a721806ef3013a9d2f0fe8cd8f25ae8b620fba9..018f3853f8e07ecd50f26571a03c58b2a9b0802d 100644 --- a/torch_npu/csrc/framework/contiguous/ReshapeOpt.h +++ b/torch_npu/csrc/framework/contiguous/ReshapeOpt.h @@ -8,15 +8,15 @@ namespace at_npu { namespace native { -bool can_use_memecpy_for_NZ_format(const ContiguousTensorDesc &); -bool can_use_memcpy_for_other_format(const ContiguousTensorDesc &); +bool can_use_memecpy_for_NZ_format(const ContiguousTensorDesc &tensor_desc); +bool can_use_memcpy_for_other_format(const ContiguousTensorDesc &tensor_desc); bool check_reshape_match_flex(const ContiguousTensorDesc &, const ContiguousTensorDesc &); -bool check_reshape_match(const ContiguousTensorDesc &, - const ContiguousTensorDesc &); +bool check_reshape_match(const ContiguousTensorDesc &self_desc, + const ContiguousTensorDesc &src_desc); bool check_reshape_match_flex(const ContiguousTensorDesc &); -bool check_reshape_match(const ContiguousTensorDesc &); -bool CanUseMemcpyForOtherFormat(const at::Tensor &); +bool check_reshape_match(const ContiguousTensorDesc &tensor_desc); +bool CanUseMemcpyForOtherFormat(const at::Tensor &tensor); } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/framework/contiguous/contiguous_register.h b/torch_npu/csrc/framework/contiguous/contiguous_register.h index 2fe9c4636d1fa7a0717c1539672c0d5e5696e181..1e8db7f3769214159c970a1052a1ffbec46a8f9b 100644 --- a/torch_npu/csrc/framework/contiguous/contiguous_register.h +++ b/torch_npu/csrc/framework/contiguous/contiguous_register.h @@ -10,7 +10,6 @@ #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" -#include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" #include "torch_npu/csrc/framework/contiguous/ContiguousUtils.h" namespace at_npu { @@ -18,13 +17,13 @@ namespace native { class ContiguousOpt { public: - ContiguousOpt() {} - virtual ~ContiguousOpt() = default; - virtual bool Optimizer(at::Tensor &self, const at::Tensor &src, + ContiguousOpt() {} + virtual ~ContiguousOpt() = default; + virtual bool Optimizer(at::Tensor &self, const at::Tensor &src, const ContiguousTensorDesc 
&src_desc) = 0; - virtual bool CanOptimizer(const ContiguousTensorDesc &src_desc) { - return false; - } + virtual bool CanOptimizer(const ContiguousTensorDesc &src_desc) { + return false; + } virtual bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, const ContiguousTensorDesc &src_desc) { @@ -35,32 +34,32 @@ public: namespace register_opt { class CopyOptRegister { public: - ~CopyOptRegister() = default; - static CopyOptRegister *GetInstance() { - static CopyOptRegister instance; - return &instance; - } - void Register(std::string &name, ::std::unique_ptr &ptr) { - std::lock_guard lock(mu_); - registry.emplace(name, std::move(ptr)); - } + ~CopyOptRegister() = default; + static CopyOptRegister *GetInstance() { + static CopyOptRegister instance; + return &instance; + } + void Register(std::string &name, ::std::unique_ptr &ptr) { + std::lock_guard lock(mu_); + registry.emplace(name, std::move(ptr)); + } - bool CanOptimize(std::string &name, const ContiguousTensorDesc &src_desc) { - auto itr = registry.find(name); - if (itr != registry.end()) { - return itr->second->CanOptimizer(src_desc); + bool CanOptimize(std::string &name, const ContiguousTensorDesc &src_desc) { + auto itr = registry.find(name); + if (itr != registry.end()) { + return itr->second->CanOptimizer(src_desc); + } + return false; } - return false; - } - bool Run(const std::string &name, at::Tensor &self, const at::Tensor &src, + bool Run(const std::string &name, at::Tensor &self, const at::Tensor &src, const ContiguousTensorDesc &src_desc) { - auto itr = registry.find(name); - if (itr != registry.end()) { - return itr->second->Optimizer(self, src, src_desc); + auto itr = registry.find(name); + if (itr != registry.end()) { + return itr->second->Optimizer(self, src, src_desc); + } + return false; } - return false; - } bool CachedRun(const std::string &name, at::Tensor &self, const at::Tensor &src, const ContiguousTensorDesc &src_desc) @@ -73,17 +72,17 @@ public: } private: - CopyOptRegister() {} - mutable std::mutex mu_; - mutable std::map> registry; + CopyOptRegister() {} + mutable std::mutex mu_; + mutable std::map> registry; }; // class CopyOptRegister class CopyOptBuilder { public: - CopyOptBuilder(std::string name, ::std::unique_ptr &ptr) { - CopyOptRegister::GetInstance()->Register(name, ptr); - } - ~CopyOptBuilder() = default; + CopyOptBuilder(std::string name, ::std::unique_ptr &ptr) { + CopyOptRegister::GetInstance()->Register(name, ptr); + } + ~CopyOptBuilder() = default; }; // class CopyOptBuilder } // namespace register_opt diff --git a/torch_npu/csrc/framework/interface/AclInterface.cpp b/torch_npu/csrc/framework/interface/AclInterface.cpp index 33a1f1bf8ecc73957f17247eb1b38f8e60ec17d1..4f5387eef9cd4f5696520e30f5fd81441446fd07 100644 --- a/torch_npu/csrc/framework/interface/AclInterface.cpp +++ b/torch_npu/csrc/framework/interface/AclInterface.cpp @@ -26,111 +26,120 @@ LOAD_FUNCTION(aclprofFinalize) LOAD_FUNCTION(aclprofCreateConfig) LOAD_FUNCTION(aclprofDestroyConfig) -aclprofStepInfoPtr init_stepinfo() { - typedef aclprofStepInfoPtr(*npdInitFunc)(); - static npdInitFunc func = nullptr; - if (func == nullptr) { - func = (npdInitFunc)GET_FUNC(aclprofCreateStepInfo); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofCreateStepInfo", PROF_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(); - return ret; +aclprofStepInfoPtr init_stepinfo() +{ + typedef aclprofStepInfoPtr(*npdInitFunc)(); + static npdInitFunc func = nullptr; + if (func == nullptr) { + func = (npdInitFunc)GET_FUNC(aclprofCreateStepInfo); 
+ } + TORCH_CHECK(func, "Failed to find function ", "aclprofCreateStepInfo", PROF_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(); + return ret; } -NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo) { - typedef NpdStatus(*npdDestroyFunc)(aclprofStepInfoPtr); - static npdDestroyFunc func = nullptr; - if (func == nullptr) { - func = (npdDestroyFunc)GET_FUNC(aclprofDestroyStepInfo); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofDestroyStepInfo", PROF_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(stepInfo); - return ret; +NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo) +{ + typedef NpdStatus(*npdDestroyFunc)(aclprofStepInfoPtr); + static npdDestroyFunc func = nullptr; + if (func == nullptr) { + func = (npdDestroyFunc)GET_FUNC(aclprofDestroyStepInfo); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofDestroyStepInfo", PROF_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(stepInfo); + return ret; } -NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) { - typedef NpdStatus(*npdStartProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); - static npdStartProfiling func = nullptr; - if (func == nullptr) { - func = (npdStartProfiling)GET_FUNC(aclprofGetStepTimestamp); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofGetStepTimestamp", PROF_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(stepInfo, stepTag, stream); - return ret; +NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) +{ + typedef NpdStatus(*npdStartProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); + static npdStartProfiling func = nullptr; + if (func == nullptr) { + func = (npdStartProfiling)GET_FUNC(aclprofGetStepTimestamp); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofGetStepTimestamp", PROF_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(stepInfo, stepTag, stream); + return ret; } -NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) { - typedef NpdStatus(*npdStopProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); - static npdStopProfiling func = nullptr; - if (func == nullptr) { - func = (npdStopProfiling)GET_FUNC(aclprofGetStepTimestamp); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofGetStepTimestamp", PROF_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(stepInfo, stepTag, stream); - return ret; +NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) +{ + typedef NpdStatus(*npdStopProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); + static npdStopProfiling func = nullptr; + if (func == nullptr) { + func = (npdStopProfiling)GET_FUNC(aclprofGetStepTimestamp); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofGetStepTimestamp", PROF_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(stepInfo, stepTag, stream); + return ret; } const char *AclGetErrMsg() { - typedef const char *(*aclGetErrMsg)(); - static aclGetErrMsg func = nullptr; - if (func == nullptr) { - func = (aclGetErrMsg)GET_FUNC(aclGetRecentErrMsg); - } - if (func != nullptr) { - return func(); - } - return ""; + typedef const char *(*aclGetErrMsg)(); + static aclGetErrMsg func = nullptr; + if (func == nullptr) { + func = (aclGetErrMsg)GET_FUNC(aclGetRecentErrMsg); + } + if (func != nullptr) { + return func(); + } + return ""; } -aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag) { - typedef aclError(*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); - static 
AclrtCreateEventWithFlagFunc func = nullptr; - if (func == nullptr) { - func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventWithFlag); - } - TORCH_CHECK(func, "Failed to find function ", "aclrtCreateEventWithFlag", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(event, flag); +aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag) +{ + typedef aclError(*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); + static AclrtCreateEventWithFlagFunc func = nullptr; + if (func == nullptr) { + func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventWithFlag); + } + TORCH_CHECK(func, "Failed to find function ", "aclrtCreateEventWithFlag", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(event, flag); } -aclError AclProfilingInit(const char *profilerResultPath, size_t length) { - typedef aclError (*AclProfInitFunc) (const char *, size_t); - static AclProfInitFunc func = nullptr; - if (func == nullptr) { - func = (AclProfInitFunc)GET_FUNC(aclprofInit); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofInit", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(profilerResultPath, length); +aclError AclProfilingInit(const char *profilerResultPath, size_t length) +{ + typedef aclError (*AclProfInitFunc) (const char *, size_t); + static AclProfInitFunc func = nullptr; + if (func == nullptr) { + func = (AclProfInitFunc)GET_FUNC(aclprofInit); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofInit", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(profilerResultPath, length); } -aclError AclProfilingStart(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfStartFunc) (const aclprofConfig *); - static AclProfStartFunc func = nullptr; - if (func == nullptr) { - func = (AclProfStartFunc)GET_FUNC(aclprofStart); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofStart", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(profilerConfig); +aclError AclProfilingStart(const aclprofConfig *profilerConfig) +{ + typedef aclError (*AclProfStartFunc) (const aclprofConfig *); + static AclProfStartFunc func = nullptr; + if (func == nullptr) { + func = (AclProfStartFunc)GET_FUNC(aclprofStart); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofStart", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(profilerConfig); } -aclError AclProfilingStop(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfStopFunc) (const aclprofConfig*); - static AclProfStopFunc func = nullptr; - if (func == nullptr) { - func = (AclProfStopFunc)GET_FUNC(aclprofStop); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofStop", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(profilerConfig); +aclError AclProfilingStop(const aclprofConfig *profilerConfig) +{ + typedef aclError (*AclProfStopFunc) (const aclprofConfig*); + static AclProfStopFunc func = nullptr; + if (func == nullptr) { + func = (AclProfStopFunc)GET_FUNC(aclprofStop); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofStop", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(profilerConfig); } -aclError AclProfilingFinalize() { - typedef aclError (*AclProfFinalizeFunc) (); - static AclProfFinalizeFunc func = nullptr; - if (func == nullptr) { - func = (AclProfFinalizeFunc)GET_FUNC(aclprofFinalize); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofFinalize", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(); +aclError AclProfilingFinalize() +{ + typedef aclError (*AclProfFinalizeFunc) (); + static AclProfFinalizeFunc func = nullptr; + if (func == nullptr) { + func = 
(AclProfFinalizeFunc)GET_FUNC(aclprofFinalize); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofFinalize", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(); } aclprofConfig *AclProfilingCreateConfig( @@ -138,25 +147,27 @@ aclprofConfig *AclProfilingCreateConfig( uint32_t deviceNums, aclprofAicoreMetrics aicoreMetrics, aclprofAicoreEvents *aicoreEvents, - uint64_t dataTypeConfig) { - typedef aclprofConfig *(*AclProfCreateConfigFunc) \ - (uint32_t *, uint32_t, aclprofAicoreMetrics, const aclprofAicoreEvents *, uint64_t); - static AclProfCreateConfigFunc func = nullptr; - if (func == nullptr) { - func = (AclProfCreateConfigFunc)GET_FUNC(aclprofCreateConfig); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofCreateConfig", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(deviceIdList, deviceNums, aicoreMetrics, aicoreEvents, dataTypeConfig); + uint64_t dataTypeConfig) +{ + typedef aclprofConfig *(*AclProfCreateConfigFunc) \ + (uint32_t *, uint32_t, aclprofAicoreMetrics, const aclprofAicoreEvents *, uint64_t); + static AclProfCreateConfigFunc func = nullptr; + if (func == nullptr) { + func = (AclProfCreateConfigFunc)GET_FUNC(aclprofCreateConfig); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofCreateConfig", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(deviceIdList, deviceNums, aicoreMetrics, aicoreEvents, dataTypeConfig); } -aclError AclProfilingDestroyConfig(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfDestroyConfigFunc) (const aclprofConfig *); - static AclProfDestroyConfigFunc func = nullptr; - if (func == nullptr) { - func = (AclProfDestroyConfigFunc)GET_FUNC(aclprofDestroyConfig); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofDestroyConfig", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(profilerConfig); +aclError AclProfilingDestroyConfig(const aclprofConfig *profilerConfig) +{ + typedef aclError (*AclProfDestroyConfigFunc) (const aclprofConfig *); + static AclProfDestroyConfigFunc func = nullptr; + if (func == nullptr) { + func = (AclProfDestroyConfigFunc)GET_FUNC(aclprofDestroyConfig); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofDestroyConfig", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(profilerConfig); } #undef LOAD_ASCEND_DUMP_FUNCTION @@ -171,8 +182,9 @@ REGISTER_LIBRARY(libascend_dump) LOAD_ASCEND_DUMP_FUNCTION(aclopStartDumpArgs) LOAD_ASCEND_DUMP_FUNCTION(aclopStopDumpArgs) -aclError AclopStartDumpArgs(uint32_t dumpType, const char *path) { - typedef aclError(*AclopStartDumpArgsFunc)(uint32_t, const char *); +aclError AclopStartDumpArgs(uint32_t dumpType, const char *path) +{ + typedef aclError(*AclopStartDumpArgsFunc)(uint32_t, const char *); static AclopStartDumpArgsFunc func = nullptr; if (func == nullptr) { func = (AclopStartDumpArgsFunc)GET_ASCEND_DUMP_FUNC(aclopStartDumpArgs); @@ -184,8 +196,9 @@ aclError AclopStartDumpArgs(uint32_t dumpType, const char *path) { return func(dumpType, path); } -aclError AclopStopDumpArgs(uint32_t dumpType) { - typedef aclError(*AclopStopDumpArgsFunc)(uint32_t); +aclError AclopStopDumpArgs(uint32_t dumpType) +{ + typedef aclError(*AclopStopDumpArgsFunc)(uint32_t); static AclopStopDumpArgsFunc func = nullptr; if (func == nullptr) { func = (AclopStopDumpArgsFunc)GET_ASCEND_DUMP_FUNC(aclopStopDumpArgs); diff --git a/torch_npu/csrc/framework/interface/AclOpCompileInterface.cpp b/torch_npu/csrc/framework/interface/AclOpCompileInterface.cpp index eab43131f633c3274bc846999907510977c566f9..0f32ff630daddd93e0d5234a3132dcd3e72b29d0 100644 --- 
a/torch_npu/csrc/framework/interface/AclOpCompileInterface.cpp +++ b/torch_npu/csrc/framework/interface/AclOpCompileInterface.cpp @@ -3,8 +3,8 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/FunctionLoader.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" -#include "third_party/acl/inc/acl/acl_base.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" +#include "third_party/acl/inc/acl/acl_base.h" namespace at_npu { @@ -28,7 +28,8 @@ namespace at_npu LOAD_FUNCTION(aclrtCtxSetSysParamOpt) LOAD_FUNCTION(aclrtSetSysParamOpt) -aclError AclSetCompileopt(aclCompileOpt opt, const char *value) { +aclError AclSetCompileopt(aclCompileOpt opt, const char *value) +{ bool ge_init_disable = c10_npu::option::OptionsManager::CheckGeInitDisable(); if (ge_init_disable) { return ACL_ERROR_NONE; @@ -43,120 +44,127 @@ aclError AclSetCompileopt(aclCompileOpt opt, const char *value) { return ret; } -c10::optional AclGetCompileoptSize(aclCompileOpt opt) { - typedef aclError (*aclGetCompileoptSizeFunc)(aclCompileOpt opt); - static aclGetCompileoptSizeFunc func = nullptr; - if (func == nullptr) { - func = (aclGetCompileoptSizeFunc)GET_FUNC(aclGetCompileoptSize); - } - if (func == nullptr) { - return c10::nullopt; - } else { - return func(opt); - } +c10::optional AclGetCompileoptSize(aclCompileOpt opt) +{ + typedef aclError (*aclGetCompileoptSizeFunc)(aclCompileOpt opt); + static aclGetCompileoptSizeFunc func = nullptr; + if (func == nullptr) { + func = (aclGetCompileoptSizeFunc)GET_FUNC(aclGetCompileoptSize); + } + if (func == nullptr) { + return c10::nullopt; + } else { + return func(opt); + } } -aclError AclGetCompileopt(aclCompileOpt opt, char *value, size_t length) { - typedef aclError (*aclGetCompileoptFunc)(aclCompileOpt opt, char *value, size_t length); - static aclGetCompileoptFunc func = nullptr; - if (func == nullptr) { - func = (aclGetCompileoptFunc)GET_FUNC(aclGetCompileopt); - } - if (func == nullptr) { - return ACL_ERROR_GE_FAILURE; - } else { - return func(opt, value, length); - } +aclError AclGetCompileopt(aclCompileOpt opt, char *value, size_t length) +{ + typedef aclError (*aclGetCompileoptFunc)(aclCompileOpt opt, char *value, size_t length); + static aclGetCompileoptFunc func = nullptr; + if (func == nullptr) { + func = (aclGetCompileoptFunc)GET_FUNC(aclGetCompileopt); + } + if (func == nullptr) { + return ACL_ERROR_GE_FAILURE; + } else { + return func(opt, value, length); + } } aclError AclGenGraphAndDumpForOp(const char *opType, int numInputs, const aclTensorDesc *const inputDesc[], const aclDataBuffer *const inputs[], int numOutputs, const aclTensorDesc *const outputDesc[], aclDataBuffer *const outputs[], const aclopAttr *attr, aclopEngineType engineType, const char *graphDumpPath, - aclGraphDumpOption* graphdumpOpt) { - typedef aclError(*AclGenGraphAndDumpForOpFunc)(const char *, int, - const aclTensorDesc *const [], const aclDataBuffer *const [], - int, const aclTensorDesc *const [], aclDataBuffer *const [], - const aclopAttr *, aclopEngineType, const char *, aclGraphDumpOption*); - static AclGenGraphAndDumpForOpFunc func = nullptr; - if (func == nullptr) { - func = (AclGenGraphAndDumpForOpFunc)GET_FUNC(aclGenGraphAndDumpForOp); - } - TORCH_CHECK(func, "Failed to find function ", "aclGenGraphAndDumpForOp", PTA_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(opType, numInputs, inputDesc, inputs, numOutputs, - outputDesc, outputs, attr, engineType, graphDumpPath, graphdumpOpt); - return ret; + aclGraphDumpOption* 
graphdumpOpt) +{ + typedef aclError(*AclGenGraphAndDumpForOpFunc)(const char *, int, + const aclTensorDesc *const [], const aclDataBuffer *const [], + int, const aclTensorDesc *const [], aclDataBuffer *const [], + const aclopAttr *, aclopEngineType, const char *, aclGraphDumpOption*); + static AclGenGraphAndDumpForOpFunc func = nullptr; + if (func == nullptr) { + func = (AclGenGraphAndDumpForOpFunc)GET_FUNC(aclGenGraphAndDumpForOp); + } + TORCH_CHECK(func, "Failed to find function ", "aclGenGraphAndDumpForOp", PTA_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(opType, numInputs, inputDesc, inputs, numOutputs, + outputDesc, outputs, attr, engineType, graphDumpPath, graphdumpOpt); + return ret; } -aclGraphDumpOption* AclCreateGraphDumpOpt() { - typedef aclGraphDumpOption*(*AclCreateGraphDumpOptFunc)(); - static AclCreateGraphDumpOptFunc func = nullptr; - if (func == nullptr) { - func = (AclCreateGraphDumpOptFunc)GET_FUNC(aclCreateGraphDumpOpt); - } - TORCH_CHECK(func, "Failed to find function ", "aclCreateGraphDumpOpt", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(); +aclGraphDumpOption* AclCreateGraphDumpOpt() +{ + typedef aclGraphDumpOption*(*AclCreateGraphDumpOptFunc)(); + static AclCreateGraphDumpOptFunc func = nullptr; + if (func == nullptr) { + func = (AclCreateGraphDumpOptFunc)GET_FUNC(aclCreateGraphDumpOpt); + } + TORCH_CHECK(func, "Failed to find function ", "aclCreateGraphDumpOpt", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(); } -aclError AclDestroyGraphDumpOpt(aclGraphDumpOption* aclGraphDumpOpt) { - typedef aclError(*AclDestroyGraphDumpOptFunc)(aclGraphDumpOption*); - static AclDestroyGraphDumpOptFunc func = nullptr; - if (func == nullptr) { - func = (AclDestroyGraphDumpOptFunc)GET_FUNC(aclDestroyGraphDumpOpt); - } - TORCH_CHECK(func, "Failed to find function ", "aclDestroyGraphDumpOpt", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(aclGraphDumpOpt); +aclError AclDestroyGraphDumpOpt(aclGraphDumpOption* aclGraphDumpOpt) +{ + typedef aclError(*AclDestroyGraphDumpOptFunc)(aclGraphDumpOption*); + static AclDestroyGraphDumpOptFunc func = nullptr; + if (func == nullptr) { + func = (AclDestroyGraphDumpOptFunc)GET_FUNC(aclDestroyGraphDumpOpt); + } + TORCH_CHECK(func, "Failed to find function ", "aclDestroyGraphDumpOpt", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(aclGraphDumpOpt); } aclError AclopCompileAndExecuteV2(const char *opType, int numInputs, aclTensorDesc *inputDesc[], aclDataBuffer *inputs[], int numOutputs, aclTensorDesc *outputDesc[], aclDataBuffer *outputs[], aclopAttr *attr, aclopEngineType engineType, aclopCompileType compileFlag, - const char *opPath, aclrtStream stream) { - typedef aclError(*AclopCompileAndExecuteV2Func)(const char *, - int, aclTensorDesc * [], aclDataBuffer * [], - int, aclTensorDesc * [], aclDataBuffer * [], - aclopAttr *, aclopEngineType, aclopCompileType, - const char *, aclrtStream); - static AclopCompileAndExecuteV2Func func = nullptr; - if (func == nullptr) { - func = (AclopCompileAndExecuteV2Func)GET_FUNC(aclopCompileAndExecuteV2); - } - TORCH_CHECK(func, "Failed to find function ", "aclopCompileAndExecuteV2", PTA_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(opType, numInputs, inputDesc, inputs, numOutputs, - outputDesc, outputs, attr, engineType, compileFlag, opPath, stream); - return ret; + const char *opPath, aclrtStream stream) +{ + typedef aclError(*AclopCompileAndExecuteV2Func)(const char *, + int, aclTensorDesc * [], aclDataBuffer * [], + int, aclTensorDesc * [], aclDataBuffer * [], + aclopAttr *, aclopEngineType, aclopCompileType, + 
const char *, aclrtStream); + static AclopCompileAndExecuteV2Func func = nullptr; + if (func == nullptr) { + func = (AclopCompileAndExecuteV2Func)GET_FUNC(aclopCompileAndExecuteV2); + } + TORCH_CHECK(func, "Failed to find function ", "aclopCompileAndExecuteV2", PTA_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(opType, numInputs, inputDesc, inputs, numOutputs, + outputDesc, outputs, attr, engineType, compileFlag, opPath, stream); + return ret; } -aclError AclrtCtxSetSysParamOpt(aclSysParamOpt opt, int64_t value) { - typedef aclError (*AclrtCtxSetSysParamOptFunc)(aclSysParamOpt opt, int64_t value); - static AclrtCtxSetSysParamOptFunc func = nullptr; - if (func == nullptr) { - func = (AclrtCtxSetSysParamOptFunc)GET_FUNC(aclrtCtxSetSysParamOpt); - } - if (func == nullptr) { - TORCH_WARN("Failed to find this aclrtCtxSetSysParamOpt function!"); - return ACL_ERROR_NONE; - } - auto ret = func(opt, value); - return ret; +aclError AclrtCtxSetSysParamOpt(aclSysParamOpt opt, int64_t value) +{ + typedef aclError (*AclrtCtxSetSysParamOptFunc)(aclSysParamOpt opt, int64_t value); + static AclrtCtxSetSysParamOptFunc func = nullptr; + if (func == nullptr) { + func = (AclrtCtxSetSysParamOptFunc)GET_FUNC(aclrtCtxSetSysParamOpt); + } + if (func == nullptr) { + TORCH_WARN("Failed to find this aclrtCtxSetSysParamOpt function!"); + return ACL_ERROR_NONE; + } + auto ret = func(opt, value); + return ret; } aclError AclrtSetSysParamOpt(aclSysParamOpt opt, int64_t value) { - typedef aclError (*AclrtSetSysParamOptFunc)(aclSysParamOpt opt, int64_t value); - static AclrtSetSysParamOptFunc func = nullptr; - if (func == nullptr) - { - func = (AclrtSetSysParamOptFunc)GET_FUNC(aclrtSetSysParamOpt); - } - if (func == nullptr) - { - TORCH_WARN("Failed to find this aclrtSetSysParamOpt function!"); - return ACL_ERROR_NONE; - } - auto ret = func(opt, value); - return ret; + typedef aclError (*AclrtSetSysParamOptFunc)(aclSysParamOpt opt, int64_t value); + static AclrtSetSysParamOptFunc func = nullptr; + if (func == nullptr) + { + func = (AclrtSetSysParamOptFunc)GET_FUNC(aclrtSetSysParamOpt); + } + if (func == nullptr) + { + TORCH_WARN("Failed to find this aclrtSetSysParamOpt function!"); + return ACL_ERROR_NONE; + } + auto ret = func(opt, value); + return ret; } } // namespace native diff --git a/torch_npu/csrc/framework/interface/AclOpCompileInterface.h b/torch_npu/csrc/framework/interface/AclOpCompileInterface.h index bdde820d5eb08ea4fffe62feecf143883ba68ac2..14d89d2235e20adfefb7e340c4e1eb0ee1abf889 100644 --- a/torch_npu/csrc/framework/interface/AclOpCompileInterface.h +++ b/torch_npu/csrc/framework/interface/AclOpCompileInterface.h @@ -10,7 +10,8 @@ namespace native { * @ingroup AscendCL * @brief an interface set compile flag * - * @param flag [IN] flag: ACL_OPCOMPILE_DEFAULT represent static compile while ACL_OPCOMPILE_FUZZ represent dynamic compile + * @param flag [IN] flag: ACL_OPCOMPILE_DEFAULT represent static compile + while ACL_OPCOMPILE_FUZZ represent dynamic compile * * @retval ACL_ERROR_NONE The function is successfully executed. 
* @retval OtherValues Failure diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index cf12ff84810673905139316fd2b99c0b965e6f61..577f8ef58b0d4176343bb92847d16615c9f69bb8 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -5,7 +5,7 @@ #include "torch_npu/csrc/framework/utils/ForceJitCompileList.h" #include "torch_npu/csrc/framework/utils/ForceAclnnList.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" -#include "torch_npu/csrc/framework/aoe/AoeUtils.h" +#include "torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h" #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" @@ -13,17 +13,19 @@ namespace at_npu { namespace native { namespace env { -void ValidPathCheck(const std::string& file_path) { - char abs_path[PATH_MAX] = {'\0'}; - if (realpath(file_path.c_str(), abs_path) == nullptr) { - TORCH_CHECK(0, "configPath path Fails, path ", (char*)file_path.c_str(), PTA_ERROR(ErrCode::PARAM)); - } +void ValidPathCheck(const std::string& file_path) +{ + char abs_path[PATH_MAX] = {'\0'}; + if (realpath(file_path.c_str(), abs_path) == nullptr) { + TORCH_CHECK(0, "configPath path Fails, path ", (char*)file_path.c_str(), PTA_ERROR(ErrCode::PARAM)); + } } -REGISTER_OPTION_HOOK(autotune, [](const std::string& val) { - if (val == "enable") { - at_npu::native::aoe::aoe_manager().EnableAoe(); - } +REGISTER_OPTION_HOOK(autotune, [](const std::string& val) +{ + if (val == "enable") { + at_npu::native::aoe::aoe_manager().EnableAoe(); + } }) REGISTER_OPTION_HOOK(autotunegraphdumppath, [](const std::string& val) { @@ -45,11 +47,10 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) -auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); - REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { + auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); if (acl_op_init_mode == 0) { NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); } else if (GET_OPTION_WITH_CACHE(isJitDisable) != ("disable" == val)) { @@ -93,24 +94,25 @@ REGISTER_OPTION_HOOK(ACL_PRECISION_MODE, [](const std::string &val) { NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_PRECISION_MODE, val.c_str())); }) -bool IsAllowFP32ToFP16() { - // For Ascend910B1 and subsequent device, the default precision mode is must_keep_origin_dtype, - // and the default value for others is allow_fp32_to_fp16. - bool is_allow_fp32_to_fp16 = c10_npu::GetSocVersion() < c10_npu::SocVersion::Ascend910B1; +bool IsAllowFP32ToFP16() +{ + // For Ascend910B1 and subsequent device, the default precision mode is must_keep_origin_dtype, + // and the default value for others is allow_fp32_to_fp16. 
+ bool is_allow_fp32_to_fp16 = c10_npu::GetSocVersion() < c10_npu::SocVersion::Ascend910B1; - static const std::string precision_mode = "ACL_PRECISION_MODE"; - auto precision_mode_val = c10_npu::option::GetOption(precision_mode); - if (precision_mode_val.has_value()) { + static const std::string precision_mode = "ACL_PRECISION_MODE"; + auto precision_mode_val = c10_npu::option::GetOption(precision_mode); + if (precision_mode_val.has_value()) { if (precision_mode_val.value() == "must_keep_origin_dtype") { - is_allow_fp32_to_fp16 = false; + is_allow_fp32_to_fp16 = false; } else if (precision_mode_val.value() == "allow_fp32_to_fp16") { - is_allow_fp32_to_fp16 = true; + is_allow_fp32_to_fp16 = true; } else { - ASCEND_LOGW("Unsupported precision mode value, using default value according to soc version."); + ASCEND_LOGW("Unsupported precision mode value, using default value according to soc version."); + } } - } - return is_allow_fp32_to_fp16; + return is_allow_fp32_to_fp16; } REGISTER_OPTION_HOOK(ACL_OP_SELECT_IMPL_MODE, [](const std::string &val) { @@ -135,7 +137,8 @@ REGISTER_OPTION_HOOK(OP_HOOK_ENABLE, [](const std::string &val) { SET_OPTION_WITH_CACHE(isOpHookEnable, "enable" == val); }) -bool CheckOpHookEnable() { +bool CheckOpHookEnable() +{ return GET_OPTION_WITH_CACHE(isOpHookEnable); } diff --git a/torch_npu/csrc/framework/interface/HcclInterface.cpp b/torch_npu/csrc/framework/interface/HcclInterface.cpp index 19525ea62abd36839745a8fb14a74bb86cd92397..3b6b6eabcc2e34ad659795cd7a74d55d8c1f4079 100644 --- a/torch_npu/csrc/framework/interface/HcclInterface.cpp +++ b/torch_npu/csrc/framework/interface/HcclInterface.cpp @@ -24,7 +24,8 @@ extern HcclResult HcclSetConfig(HcclConfig config, HcclConfigValue configValue) func = (HcclSetConfigFunc)GET_FUNC(HcclSetConfig); } if (func == nullptr) { - TORCH_NPU_WARN("Failed to find this HcclSetConfig function, get real hccl config, need to upgrade hccl version!"); + TORCH_NPU_WARN( + "Failed to find this HcclSetConfig function, get real hccl config, need to upgrade hccl version!"); return HcclResult::HCCL_SUCCESS; } return func(config, configValue); diff --git a/torch_npu/csrc/framework/interface/HcclInterface.h b/torch_npu/csrc/framework/interface/HcclInterface.h index b2cb3fd35d9190a06a630b4fe5c89f250475d9a9..e4a0ed585e78b24624a7c7b410150c1fa2dbe52a 100644 --- a/torch_npu/csrc/framework/interface/HcclInterface.h +++ b/torch_npu/csrc/framework/interface/HcclInterface.h @@ -1,3 +1,6 @@ +#ifndef __PLUGIN_NATIVE_NPU_INTERFACE_HCCLINTERFACE__ +#define __PLUGIN_NATIVE_NPU_INTERFACE_HCCLINTERFACE__ + #include "third_party/hccl/inc/hccl/hccl.h" namespace at_npu { @@ -16,4 +19,6 @@ extern HcclResult HcclSetConfig(HcclConfig config, HcclConfigValue configValue); } // namespace hccl } // namespace native -} // namespace at_npu \ No newline at end of file +} // namespace at_npu + +#endif diff --git a/torch_npu/csrc/framework/interface/LibAscendHal.cpp b/torch_npu/csrc/framework/interface/LibAscendHal.cpp index 536821cd48ff160f68660ea6b96756fe3d4dfca0..dba95bd4e867c4ac05f5c6ae95eaadf38184143f 100644 --- a/torch_npu/csrc/framework/interface/LibAscendHal.cpp +++ b/torch_npu/csrc/framework/interface/LibAscendHal.cpp @@ -22,7 +22,8 @@ constexpr uint32_t ERR_FREQ = 0; constexpr uint32_t ERR_VER = 0; constexpr uint32_t FREQ_CONFIG = 24; -int64_t getFreq() { +int64_t getFreq() +{ using getReqFun = int32_t (*)(uint32_t, int32_t, int32_t, int64_t*); static getReqFun getFreqInfo = nullptr; if (getFreqInfo == nullptr) { @@ -39,7 +40,8 @@ int64_t getFreq() { return 
ERR_FREQ; } -int64_t getVer() { +int64_t getVer() +{ using getReqFun = int32_t (*)(int32_t*); static getReqFun getVerInfo = nullptr; if (getVerInfo == nullptr) { @@ -57,7 +59,8 @@ int64_t getVer() { return ver; } -bool isSyscntEnable() { +bool isSyscntEnable() +{ constexpr int32_t supportVersion = 0x071905; return getVer() >= supportVersion && getFreq() != ERR_FREQ; } diff --git a/torch_npu/csrc/framework/interface/LibAscendHal.h b/torch_npu/csrc/framework/interface/LibAscendHal.h index 09bf24a0e176002e83ac24879f3c240268327aa7..73aae03c4998e72ba4bd29ec0d5e3305670fa6d2 100644 --- a/torch_npu/csrc/framework/interface/LibAscendHal.h +++ b/torch_npu/csrc/framework/interface/LibAscendHal.h @@ -3,8 +3,8 @@ namespace at_npu { namespace native { -int64_t getFreq(); -bool isSyscntEnable(); + int64_t getFreq(); + bool isSyscntEnable(); } // namespace native } // namespace torchat_npu_npu #endif diff --git a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp index a36fbb8d506b5e07e70d1a56139240a58500e8ee..f2dcf6994486795755513eb0ab82b66d06260887 100644 --- a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp +++ b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp @@ -1,5 +1,5 @@ -#include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/framework/interface/MsProfilerInterface.h" +#include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/FunctionLoader.h" #include "third_party/acl/inc/acl/acl_prof.h" @@ -18,8 +18,23 @@ REGISTER_LIBRARY(libmsprofiler) LOAD_FUNCTION(aclprofWarmup) LOAD_FUNCTION(aclprofSetConfig) LOAD_FUNCTION(aclprofGetSupportedFeatures) +LOAD_FUNCTION(aclprofGetSupportedFeaturesV2) +LOAD_FUNCTION(aclprofRegisterDeviceCallback) LOAD_FUNCTION(aclprofMarkEx) +aclError AclProfilingRegisterDeviceCallback() +{ + typedef aclError (*AclProfRegisterDeviceCallbackFunc)(); + static AclProfRegisterDeviceCallbackFunc func = nullptr; + if (func == nullptr) { + func = (AclProfRegisterDeviceCallbackFunc)GET_FUNC(aclprofRegisterDeviceCallback); + if (func == nullptr) { + return ACL_ERROR_PROF_MODULES_UNSUPPORTED; + } + } + return func(); +} + aclError AclProfilingWarmup(const aclprofConfig *profilerConfig) { typedef aclError (*AclProfWarmupFunc)(const aclprofConfig *); @@ -52,12 +67,16 @@ aclError AclprofGetSupportedFeatures(size_t* featuresSize, void** featuresData) typedef aclError(*AclprofGetSupportedFeaturesFunc)(size_t*, void**); static AclprofGetSupportedFeaturesFunc func = nullptr; if (func == nullptr) { - func = (AclprofGetSupportedFeaturesFunc)GET_FUNC(aclprofGetSupportedFeatures); + func = (AclprofGetSupportedFeaturesFunc)GET_FUNC(aclprofGetSupportedFeaturesV2); if (func == nullptr) { - return ACL_ERROR_PROF_MODULES_UNSUPPORTED; + func = (AclprofGetSupportedFeaturesFunc)GET_FUNC(aclprofGetSupportedFeatures); } } - return func(featuresSize, featuresData); + + if (func != nullptr) { + return func(featuresSize, featuresData); + } + return ACL_ERROR_PROF_MODULES_UNSUPPORTED; } aclError AclProfilingMarkEx(const char *msg, size_t msgLen, aclrtStream stream) diff --git a/torch_npu/csrc/framework/interface/MsProfilerInterface.h b/torch_npu/csrc/framework/interface/MsProfilerInterface.h index b06ca001e6522b8e5f2fdceb96bdf5cf45f3b49a..d049a0559371041b0320fdbc8acd7d659052ccfd 100644 --- a/torch_npu/csrc/framework/interface/MsProfilerInterface.h +++ b/torch_npu/csrc/framework/interface/MsProfilerInterface.h @@ -7,6 +7,8 @@ namespace at_npu { namespace native { 
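+// Resolves aclprofRegisterDeviceCallback from libmsprofiler at runtime and invokes it; returns ACL_ERROR_PROF_MODULES_UNSUPPORTED when the symbol is not exported.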
+aclError AclProfilingRegisterDeviceCallback(); + aclError AclProfilingWarmup(const aclprofConfig *profilerConfig); aclError AclprofSetConfig(aclprofConfigType configType, const char* config, size_t configLength); diff --git a/torch_npu/csrc/framework/interface/MstxInterface.cpp b/torch_npu/csrc/framework/interface/MstxInterface.cpp index 4024a63e270aaa410f27c6afbee3b2a2d75be840..b46dec9cc4eed88f99d26ba6b39809694dbee3fa 100644 --- a/torch_npu/csrc/framework/interface/MstxInterface.cpp +++ b/torch_npu/csrc/framework/interface/MstxInterface.cpp @@ -1,5 +1,5 @@ -#include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/framework/interface/MstxInterface.h" +#include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/FunctionLoader.h" #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/toolkit/profiler/common/utils.h" diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index e54c951dc1b3e79ce777eb3da4376ac7768104ee..daa278a8e1563868751a892f16985b21d61e9d49 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" @@ -267,6 +268,15 @@ void NpuUtils::check_1d(const at::Tensor &t, const char *arg, const char *fn) OPS_ERROR(ErrCode::PARAM)); } +bool NpuUtils::setFilePermissions(int fd, mode_t mode) +{ + if (fchmod(fd, mode) == -1) { + ASCEND_LOGI("Failed to set permissions."); + return false; + } + return true; +} + #ifndef BUILD_LIBTORCH void NpuUtils::ProfReportMarkDataToNpuProfiler(uint32_t category, const std::string &data, uint64_t correlation_id) { diff --git a/torch_npu/csrc/framework/utils/NpuUtils.h b/torch_npu/csrc/framework/utils/NpuUtils.h index 0a2d63c267b2b8c59512c4ad2bc02d56aa681d0c..2f58120539eb52176bea03580251e55cb58caf48 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.h +++ b/torch_npu/csrc/framework/utils/NpuUtils.h @@ -46,6 +46,7 @@ public: static bool check_5d_5d_match(const at::Tensor &tensor); static bool IsOomError(aclError ret, int index); static void check_1d(const at::Tensor &t, const char *arg, const char *fn); + static bool setFilePermissions(int fd, mode_t mode); #ifndef BUILD_LIBTORCH static void ProfReportMarkDataToNpuProfiler(uint32_t category, const std::string &data, uint64_t correlation_id = 0); diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index c8d9922f1ad8f70229036fbfd1b1bd96bf8a3099..20f357c654b94a8c618ab339f68a68eeed8b67b6 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -294,7 +294,8 @@ at::Tensor OpPreparation::apply_tensor_with_sizes(c10::IntArrayRef sizes, const options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), - format); + format, + c10::nullopt); } void OpPreparation::CheckOut(const std::initializer_list &inputs, at::Tensor &output, at::Tensor dst) @@ -491,7 +492,8 @@ at::Tensor OpPreparation::ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10 options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), - format); + format, + c10::nullopt); } void OpPreparation::CheckMemory(const std::initializer_list &inputs, diff --git a/torch_npu/csrc/logging/LogContext.cpp b/torch_npu/csrc/logging/LogContext.cpp index 
a05a0594219fa0a1d1de1378b3f25a138ddaa2e1..c39a05381e7aaef9c89e1a9e1384c942a6ed17cf 100644 --- a/torch_npu/csrc/logging/LogContext.cpp +++ b/torch_npu/csrc/logging/LogContext.cpp @@ -54,7 +54,7 @@ void LogContext::setLogs(const std::unordered_map& qnameLevels } } -std::shared_ptr LogContext::getLogger(const std::string& name) +std::shared_ptr LogContext::getLogger(const std::string& name) noexcept { std::lock_guard lock(mutex_); auto iter = loggers_.find(name); diff --git a/torch_npu/csrc/logging/LogContext.h b/torch_npu/csrc/logging/LogContext.h index f0bdd6be575d550e4566427ce65865dbfc2686b5..c45d32104e1d2fbc1df18eef11d9c8e819307fce 100644 --- a/torch_npu/csrc/logging/LogContext.h +++ b/torch_npu/csrc/logging/LogContext.h @@ -16,7 +16,7 @@ public: ~LogContext() = default; - std::shared_ptr getLogger(const std::string& name = ""); + std::shared_ptr getLogger(const std::string& name = "") noexcept; static LogContext& GetInstance(); void setLogs(const std::unordered_map& qnameLevels); diff --git a/torch_npu/csrc/logging/Logger.cpp b/torch_npu/csrc/logging/Logger.cpp index 385d11f6afb47483f0e01f32de56cc5205d78eb1..eaab8bc004e588f27ad9f8f022dc3c6d72ff7611 100644 --- a/torch_npu/csrc/logging/Logger.cpp +++ b/torch_npu/csrc/logging/Logger.cpp @@ -22,6 +22,11 @@ void Logger::setAllowLevel(LoggingLevel level) allow_level_ = level; } +LoggingLevel Logger::getAllowLevel() +{ + return allow_level_; +} + void Logger::setQName(const std::string& qname) { qname_ = qname; @@ -52,13 +57,15 @@ void Logger::log(LoggingLevel level, const char* format, va_list args) long nowMs = ts.tv_nsec / 1000000; auto rank = c10_npu::option::OptionsManager::GetRankId(); + std::ostringstream oss; if (rank != -1) { - std::cerr << "[rank:" << rank << "]:[" << timeBuffer << ":" << std::setfill('0') << std::setw(3) << nowMs << "] " << name_ << - ": [" << LoggingLevelNames[level] << "] " << buffer << std::endl; - } else { - std::cerr << "[" << timeBuffer << ":" << std::setfill('0') << std::setw(3) << nowMs << "] " << name_ << ": [" << - LoggingLevelNames[level] << "] " << buffer << std::endl; + oss << "[rank:" << rank << "]:"; } + oss << "[" << timeBuffer << ":" << std::setfill('0') << std::setw(3) << nowMs << "] " << name_ << ": [" << + LoggingLevelNames[level] << "] " << buffer << std::endl; + std::string s = oss.str(); + std::cerr.write(s.c_str(), s.size()); + std::cerr.flush(); } void Logger::debug(const char* format, ...) 
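The Logger.cpp hunk above also changes how a formatted record reaches stderr: instead of two branched `std::cerr << ...` chains, the whole line (including the optional `[rank:N]:` prefix) is assembled in a `std::ostringstream` and emitted through one `write()` followed by `flush()`. A plausible rationale, not stated in the diff, is that a single pre-formatted write keeps records from different ranks or threads from interleaving mid-line. The sketch below reproduces only that pattern in isolation; the helper name `emit_log` and its parameters are illustrative and not part of torch_npu.

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Minimal sketch of the "format once, write once" pattern used by Logger::log:
// the record is fully assembled in an ostringstream, then handed to std::cerr
// in a single write() so concurrent writers do not interleave fragments.
void emit_log(int rank, const std::string& name, const std::string& level, const std::string& msg)
{
    std::ostringstream oss;
    if (rank != -1) {
        oss << "[rank:" << rank << "]:";
    }
    oss << name << ": [" << level << "] " << msg << '\n';
    const std::string line = oss.str();
    std::cerr.write(line.c_str(), static_cast<std::streamsize>(line.size()));
    std::cerr.flush();
}

int main()
{
    emit_log(0, "torch_npu.logger", "INFO", "single-write logging sketch");
    emit_log(-1, "torch_npu.logger", "WARN", "rank prefix omitted when rank is -1");
    return 0;
}
```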
diff --git a/torch_npu/csrc/logging/Logger.h b/torch_npu/csrc/logging/Logger.h index 6c08c6c379e36602d2ffc941b27f9676af584e4a..1734a7c7bebbf574860c4675bee52ec039ce3d16 100644 --- a/torch_npu/csrc/logging/Logger.h +++ b/torch_npu/csrc/logging/Logger.h @@ -20,6 +20,7 @@ public: Logger(const std::string &name) : name_(name) {}; ~Logger() = default; + LoggingLevel getAllowLevel(); void setAllowLevel(LoggingLevel level); void setQName(const std::string& qname); std::string getQName(); diff --git a/torch_npu/csrc/npu/Event.cpp b/torch_npu/csrc/npu/Event.cpp index 653b46849465767861378135d602a8d8fb8378b4..99fc0d381f3b05a28bc0e5c14df6fc41bd1889fe 100644 --- a/torch_npu/csrc/npu/Event.cpp +++ b/torch_npu/csrc/npu/Event.cpp @@ -1,13 +1,16 @@ +#include "torch_npu/csrc/npu/Event.h" + #include #include #include #include -#include "torch_npu/csrc/core/npu/NPUGuard.h" #include +#include "torch_npu/csrc/core/npu/NPUGuard.h" -#include "torch_npu/csrc/npu/Event.h" #include "torch_npu/csrc/npu/Stream.h" +#define ACL_EVENT_DEFAULT 0x0000000Eu + PyObject *THNPEventClass = nullptr; static PyObject* THNPEvent_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) @@ -16,10 +19,11 @@ static PyObject* THNPEvent_pynew(PyTypeObject *type, PyObject *args, PyObject *k unsigned char enable_timing = 0; unsigned char blocking = 0; unsigned char interprocess = 0; + unsigned char external = 0; - constexpr const char* kwlist[] = {"enable_timing", "blocking", "interprocess", nullptr}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|bbb", const_cast(kwlist), - &enable_timing, &blocking, &interprocess)) { + constexpr const char* kwlist[] = {"enable_timing", "blocking", "interprocess", "graph_external", nullptr}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|bbbb", const_cast(kwlist), + &enable_timing, &blocking, &interprocess, &external)) { return nullptr; } @@ -36,6 +40,9 @@ static PyObject* THNPEvent_pynew(PyTypeObject *type, PyObject *args, PyObject *k } else { flags = enable_timing ? 
ACL_EVENT_TIME_LINE : ACL_EVENT_DEFAULT; } + if (external) { + flags = ACL_EVENT_EXTERNAL; + } new (&self->npu_event) c10_npu::NPUEvent(flags); return (PyObject *)ptr.release(); @@ -120,6 +127,18 @@ static PyObject* THNPEvent_synchronize(THNPEvent *self, PyObject *noargs) END_HANDLE_TH_ERRORS } +static PyObject* THNPEvent_reset(THNPEvent *self, THNPStream *stream) +{ + HANDLE_TH_ERRORS + { + pybind11::gil_scoped_release no_gil; + self->npu_event.reset(stream->npu_stream); + ASCEND_LOGI("Event: reset api is successfully executed, event=%p", self->npu_event.event()); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + static struct PyGetSetDef THNPEvent_properties[] = { {"device", (getter)THNPEvent_get_device, nullptr, nullptr, nullptr}, {"npu_event", (getter)THNPEvent_get_npu_event, nullptr, nullptr, nullptr}, @@ -133,6 +152,7 @@ static PyMethodDef THNPEvent_methods[] = { {(char*)"elapsed_time", (PyCFunction)THNPEvent_elapsed_time, METH_O, nullptr}, {(char*)"recorded_time", (PyCFunction)THNPEvent_recorded_time, METH_NOARGS, nullptr}, {(char*)"synchronize", (PyCFunction)THNPEvent_synchronize, METH_NOARGS, nullptr}, + {(char*)"reset", (PyCFunction)THNPEvent_reset, METH_O, nullptr}, {nullptr} }; @@ -143,37 +163,37 @@ PyTypeObject THNPEventType = { 0, /* tp_itemsize */ (destructor)THNPEvent_dealloc, /* tp_dealloc */ 0, /* tp_vectorcall_offset */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + nullptr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ nullptr, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ THNPEvent_methods, /* tp_methods */ - 0, /* tp_members */ + nullptr, /* tp_members */ THNPEvent_properties, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ THNPEvent_pynew, /* tp_new */ }; diff --git a/torch_npu/csrc/npu/Event.h b/torch_npu/csrc/npu/Event.h index 674a2ef29ba5fb0c23ed2cdd1488964153726f51..b5b8074a16d3138391da53ed18025665277f8e75 100644 --- a/torch_npu/csrc/npu/Event.h +++ b/torch_npu/csrc/npu/Event.h @@ -1,9 +1,9 @@ #ifndef THNP_EVENT_INC #define THNP_EVENT_INC +#include #include "torch_npu/csrc/core/npu/NPUMacros.h" #include "torch_npu/csrc/core/npu/NPUEvent.h" -#include struct THNPEvent { PyObject_HEAD diff --git a/torch_npu/csrc/npu/Graph.cpp b/torch_npu/csrc/npu/Graph.cpp index 27e917474076fbd8614c23461875c993fd98b28b..c8d30cfa448b07e00d7671ff5e6aa7169686ee60 100644 --- a/torch_npu/csrc/npu/Graph.cpp +++ b/torch_npu/csrc/npu/Graph.cpp @@ -7,6 +7,7 @@ #include 
"torch_npu/csrc/core/npu/NPUGraph.h" #include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" +#include "torch_npu/csrc/npu/Stream.h" template using shared_ptr_class_ = py::class_>; @@ -16,7 +17,26 @@ void TORCH_NPU_API THNPGraph_init(PyObject* module) { // but CI linter and some builds prefer "module". auto torch_N_m = py::handle(module).cast(); - torch_N_m.def("_graph_pool_handle", &c10_npu::graph_pool_handle); + py::class_(torch_N_m, "_NPUTaskGroupHandle") + .def_readonly("task_group", &c10_npu::NPUTaskGroupHandle::task_group); + + torch_N_m.def("_graph_pool_handle", &c10_npu::graph_pool_handle) + .def("_graph_task_group_begin", [](py::object py_stream) { + auto stream = (*py_stream).ptr(); + c10_npu::graph_task_group_begin(THNPUtils_PyObject_to_NPUStream(stream)); + }) + .def("_graph_task_group_end", [](py::object py_stream) { + auto stream = (*py_stream).ptr(); + return c10_npu::graph_task_group_end(THNPUtils_PyObject_to_NPUStream(stream)); + }) + .def("_graph_task_update_begin", [](py::object py_stream, c10_npu::NPUTaskGroupHandle handle) { + auto stream = (*py_stream).ptr(); + c10_npu::graph_task_update_begin(THNPUtils_PyObject_to_NPUStream(stream), handle); + }) + .def("_graph_task_update_end", [](py::object py_stream) { + auto stream = (*py_stream).ptr(); + c10_npu::graph_task_update_end(THNPUtils_PyObject_to_NPUStream(stream)); + }); shared_ptr_class_(torch_N_m, "_NPUGraph") .def(py::init<>()) @@ -25,15 +45,15 @@ void TORCH_NPU_API THNPGraph_init(PyObject* module) { [](c10_npu::NPUGraph& self, std::optional pool_opt, std::string capture_error_mode) { - aclmdlCaptureMode capture_mode; + aclmdlRICaptureMode capture_mode; c10_npu::MempoolId_t pool = pool_opt.has_value() ? pool_opt.value() : c10_npu::MempoolId_t{0, 0}; if (capture_error_mode == "global") { - capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_GLOBAL; + capture_mode = aclmdlRICaptureMode::ACL_MODEL_RI_CAPTURE_MODE_GLOBAL; } else if (capture_error_mode == "thread_local") { - capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_THREAD_LOCAL; + capture_mode = aclmdlRICaptureMode::ACL_MODEL_RI_CAPTURE_MODE_THREAD_LOCAL; } else if (capture_error_mode == "relaxed") { - capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_RELAXED; + capture_mode = aclmdlRICaptureMode::ACL_MODEL_RI_CAPTURE_MODE_RELAXED; } else { TORCH_CHECK( false, diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index db19bb7679c2201cf9774fc1cf79580cfb124dd8..de106feda21bb635f84b1cb1489e749df4864ee5 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -33,19 +33,20 @@ #include "torch_npu/csrc/core/OverflowUtils.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/npu/DataParallelComm.h" -#include "torch_npu/csrc/npu/Module.h" #include "torch_npu/csrc/npu/NPUPluggableAllocator.h" #include "torch_npu/csrc/npu/Stream.h" +#include "torch_npu/csrc/npu/Module.h" #include "torch_npu/csrc/npu/memory_snapshot.h" #include "torch_npu/csrc/npu/Stress_detect.h" #include "torch_npu/csrc/aten/python_functions.h" #include "torch_npu/csrc/utils/LazyInit.h" #include "third_party/acl/inc/acl/acl.h" #include "torch_npu/csrc/profiler/python/combined_traceback.h" -#include "torch_npu/csrc/profiler/msprof_tx.h" #include "torch_npu/csrc/core/npu/interface/OpInterface.h" #include "torch_npu/csrc/core/npu/GetCANNInfo.h" +#include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" #include "op_plugin/utils/custom_functions/opapi/FFTCommonOpApi.h" +#include 
"torch_npu/csrc/aten/common/from_blob.h" struct NPUDeviceProp { std::string name; @@ -488,19 +489,16 @@ void RegisterNpuPluggableAllocator(PyObject* module) false); return c10::Storage(storage_impl); }); -} - -PyObject* THNPModule_msTxMark(PyObject* self, PyObject* args) -{ - HANDLE_TH_ERRORS - const char *input_string; - if (!PyArg_ParseTuple(args, "s", &input_string)) { - return NULL; - } - torch_npu::profiler::Mark(input_string); - - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS + m.def( + "_weak_ref_tensor", + [](const at::Tensor& t) { + void* data_ptr = t.data_ptr(); + std::vector sizes = t.sizes().vec(); + std::vector strides = t.strides().vec(); + auto options = t.options(); + auto new_tensor = at_npu::native::from_blob(data_ptr, sizes, strides, options); + return new_tensor; + }); } static PyObject* THNPModule_initExtension(PyObject* self, PyObject* noargs) @@ -579,10 +577,13 @@ PyObject* THNPModule_stopDevice_wrap(PyObject* self, PyObject* arg) HANDLE_TH_ERRORS int device = THPUtils_unpackLong(arg); setDefaultStreamsStatus(device, c10_npu::RepoStatus::STOP_EXIT); - c10_npu::acl::AclrtDeviceTaskAbort(device); - ASCEND_LOGI("NPU stop device success, device is %d.", device); - - Py_RETURN_NONE; + int ret = c10_npu::acl::AclrtDeviceTaskAbort(device); + ASCEND_LOGI("NPU stop device success, device is %d, ret is %d.", device, ret); + if (ret == 0) { + return PyLong_FromLong(0); + } else { + return PyLong_FromLong(1); + } END_HANDLE_TH_ERRORS } @@ -620,6 +621,22 @@ PyObject* THNPModule_check_uce_in_memory_wrap(PyObject* self, PyObject* arg) END_HANDLE_TH_ERRORS } +PyObject* THNPModule_get_uce_addr_wrap(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + auto memUceInfo_ = c10_npu::get_mem_uce_info(); + + py::list result; + for (size_t i = 0; i < memUceInfo_.retSize; ++i) { + py::dict data; + data["ptr"] = reinterpret_cast(memUceInfo_.info[i].addr); + data["size"] = memUceInfo_.info[i].len; + result.append(data); + } + return result.release().ptr(); + END_HANDLE_TH_ERRORS +} + PyObject* THNPModule_restart_device_wrap(PyObject* self, PyObject* arg) { HANDLE_TH_ERRORS @@ -648,6 +665,24 @@ PyObject* THNPModule_getDevice_wrap(PyObject* self, PyObject* noargs) END_HANDLE_TH_ERRORS } +PyObject* THNPModule_getDeviceWithoutSet_wrap(PyObject* self, PyObject* noargs) +{ + HANDLE_TH_ERRORS + int device; + NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDeviceWithoutSet(&device)); + return PyLong_FromLong(device); + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_maybeExchangeDevice_wrap(PyObject* self, PyObject* arg) +{ + HANDLE_TH_ERRORS + int64_t device = THPUtils_unpackLong(arg); + int current_device = c10_npu::MaybeExchangeDevice(device); + return PyLong_FromLong(current_device); + END_HANDLE_TH_ERRORS +} + PyObject* THNPModule_stressDetect_wrap(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS @@ -799,6 +834,40 @@ PyObject* THNPModule_setStream_wrap( END_HANDLE_TH_ERRORS } +PyObject* THNPModule_npu_eraseStream_wrap(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + PyObject *tensor_obj = nullptr; + int64_t stream_id = 0; + int64_t device_index = 0; + int64_t device_type = 0; + + constexpr const char* kwlist[] = { + "tensor", "stream_id", "device_index", "device_type", nullptr}; + if (!PyArg_ParseTupleAndKeywords( + args, + kwargs, + "OLLL", + const_cast(kwlist), + &tensor_obj, + &stream_id, + &device_index, + &device_type)) { + } + + if (!THPVariable_Check(tensor_obj)) { + TORCH_CHECK(false, "tensor is not torch.Tensor.", PTA_ERROR(ErrCode::TYPE)); + } + + // 获取 at::Tensor + 
at::Tensor tensor = THPVariable_Unpack(tensor_obj); + auto stream = c10_npu::NPUStream::unpack3( + stream_id, device_index, static_cast(device_type)); + c10_npu::NPUCachingAllocator::eraseStream(tensor.storage().data_ptr(), stream); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + PyObject* THNPModule_isCurrentStreamCapturing_wrap( PyObject* self, PyObject* noargs) @@ -904,6 +973,7 @@ PyObject* THNPModule_memoryStats(PyObject *_unused, PyObject *arg) result["reserved_bytes"] = statArrayToDict(stats.reserved_bytes); result["active_bytes"] = statArrayToDict(stats.active_bytes); result["inactive_split_bytes"] = statArrayToDict(stats.inactive_split_bytes); + result["requested_bytes"] = statArrayToDict(stats.requested_bytes); result["oversize_allocations"] = statToDict(stats.oversize_allocations); result["oversize_segments"] = statToDict(stats.oversize_segments); @@ -1021,6 +1091,16 @@ PyObject* THNPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) segments.append(segmentInfoToDict(segmentInfo)); } + auto workspace_snapshot = c10_npu::NPUWorkspaceAllocator::snapshot(); + for (int i = 0; i < workspace_snapshot.segments.size(); i++) { + segments.append(segmentInfoToDict(workspace_snapshot.segments[i])); + } + + for (int i = 0; i < workspace_snapshot.device_traces.size(); i++) { + snapshot.device_traces[i].insert(snapshot.device_traces[i].begin(), workspace_snapshot.device_traces[i].begin(), + workspace_snapshot.device_traces[i].end()); + } + py::list traces; py::str action_s = "action"; py::str alloc_s = "alloc"; @@ -1032,6 +1112,7 @@ PyObject* THNPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) py::str segment_unmap_s = "segment_unmap"; py::str snapshot_s = "snapshot"; + py::str workspace_snapshot_s = "workspace_snapshot"; py::str oom_s = "oom"; py::str device_free_s = "device_free"; @@ -1053,6 +1134,8 @@ PyObject* THNPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) return oom_s; case TraceEntry::SNAPSHOT: return snapshot_s; + case TraceEntry::WORKSPACE_SNAPSHOT: + return workspace_snapshot_s; case TraceEntry::SEGMENT_UNMAP: return segment_unmap_s; case TraceEntry::SEGMENT_MAP: @@ -1470,27 +1553,30 @@ PyObject* THNPModule_npu_set_module_train_state(PyObject* _unused, PyObject* arg PyObject* THNPModule_npu_get_silent_check_version(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS - if (c10_npu::opapi::IsExistAclnnSilentCheckV2()) { - // silent check v3 - return PyLong_FromLong(3); - } else { - if (c10_npu::opapi::IsExistAclnnSilentCheck()) { - // silent check v2 - return PyLong_FromLong(2); - } - // silent check v1 - return PyLong_FromLong(1); + if (c10_npu::opapi::IsExistAclnnSilentCheck()) { + // silent check v2 + return PyLong_FromLong(2); } + // silent check v1 + return PyLong_FromLong(1); END_HANDLE_TH_ERRORS } -PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* noargs) +PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - int device_index; - NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index)); - c10::DeviceIndex device = static_cast(device_index); - c10_npu::SetThreadAffinity(device, c10_npu::ThreadType::mainThread); + int core_start; + int core_end; + if (!PyArg_ParseTuple(args, "ii", &core_start, &core_end)) { + throw torch::TypeError("Pybind failed to parse parameters." 
+ PTA_ERROR(ErrCode::TYPE)); + } + + if (core_start == -1) { + c10_npu::SetThreadAffinity(c10_npu::ThreadType::OTHER_THREAD); + } else { + c10_npu::SetThreadAffinity(core_start, core_end); + } + Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -1498,10 +1584,7 @@ PyObject* THNPModule_npu_set_thread_affinity(PyObject* self, PyObject* noargs) PyObject* THNPModule_npu_reset_thread_affinity(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS - int device_index; - NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::GetDevice(&device_index)); - c10::DeviceIndex device = static_cast(device_index); - c10_npu::SetThreadAffinity(device, c10_npu::ThreadType::unknownThread); + c10_npu::SetThreadAffinity(c10_npu::ThreadType::MAIN_THREAD); Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -1582,9 +1665,12 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_synchronize", (PyCFunction)THNPModule_npuSynchronize, METH_NOARGS, nullptr}, {"_npu_setDevice", (PyCFunction)THNPModule_setDevice_wrap, METH_O, nullptr}, {"_npu_getDevice", (PyCFunction)THNPModule_getDevice_wrap, METH_NOARGS, nullptr}, + {"_npu_getDeviceWithoutSet", (PyCFunction)THNPModule_getDeviceWithoutSet_wrap, METH_NOARGS, nullptr}, + {"_npu_maybeExchangeDevice", (PyCFunction)THNPModule_maybeExchangeDevice_wrap, METH_O, nullptr}, {"_npu_stopDevice", (PyCFunction)THNPModule_stopDevice_wrap, METH_O, nullptr}, {"_npu_restart_device", (PyCFunction)THNPModule_restart_device_wrap, METH_O, nullptr}, {"_npu_check_uce_in_memory", (PyCFunction)THNPModule_check_uce_in_memory_wrap, METH_O, nullptr}, + {"_npu_get_uce_addr", (PyCFunction)THNPModule_get_uce_addr_wrap, METH_NOARGS, nullptr}, {"_npu_stress_detect", (PyCFunction)THNPModule_stressDetect_wrap, METH_NOARGS, nullptr}, {"_npu_getLocalDevice", (PyCFunction)THNPModule_getLocalDevice_wrap, METH_NOARGS, nullptr}, {"_npu_getDeviceCount", (PyCFunction)THNPModule_getDeviceCount_wrap, METH_NOARGS, nullptr}, @@ -1593,6 +1679,7 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_getCurrentStream", (PyCFunction)THNPModule_getCurrentStream_wrap, METH_O, nullptr}, {"_npu_getDefaultStream", (PyCFunction)THNPModule_getDefaultStream_wrap, METH_O, nullptr}, {"_npu_setStream", (PyCFunction)THNPModule_setStream_wrap, METH_VARARGS | METH_KEYWORDS, nullptr}, + {"_npu_eraseStream", (PyCFunction)THNPModule_npu_eraseStream_wrap, METH_VARARGS | METH_KEYWORDS, nullptr}, {"_npu_isCurrentStreamCapturing", (PyCFunction)THNPModule_isCurrentStreamCapturing_wrap, METH_NOARGS, nullptr}, {"_npu_is_jit_compile_false", (PyCFunction)THNPModule_is_jit_compile_false_wrap, METH_NOARGS, nullptr}, {"_npu_setMemoryFraction", (PyCFunction) THNPModule_setMemoryFraction, METH_VARARGS, nullptr}, @@ -1622,12 +1709,11 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_get_sync_debug_mode", (PyCFunction)THNPModule_npu_get_sync_debug_mode, METH_NOARGS, nullptr}, {"_tensor_construct_from_storage", (PyCFunction)THNPModule_tensor_construct_from_storage, METH_VARARGS, nullptr}, {"_tensor_storage_resize", (PyCFunction)THNPModule_tensor_storage_resize, METH_VARARGS, nullptr}, - {"_mark", (PyCFunction)THNPModule_msTxMark, METH_VARARGS, nullptr}, {"_npu_set_call_state", (PyCFunction)THNPModule_npu_set_call_state, METH_O, nullptr}, {"_npu_set_module_train_state", (PyCFunction)THNPModule_npu_set_module_train_state, METH_O, nullptr}, {"_get_silent_check_version", (PyCFunction)THNPModule_npu_get_silent_check_version, METH_NOARGS, nullptr}, - {"_npu_set_threads_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_NOARGS, nullptr}, - 
{"_npu_reset_threads_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr}, + {"_npu_set_thread_affinity", (PyCFunction)THNPModule_npu_set_thread_affinity, METH_VARARGS, nullptr}, + {"_npu_reset_thread_affinity", (PyCFunction)THNPModule_npu_reset_thread_affinity, METH_NOARGS, nullptr}, {"_npu_set_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_set_fft_plan_cache_max_size, METH_VARARGS, nullptr}, {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr}, {"_npu_get_fft_plan_cache_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_size, METH_NOARGS, nullptr}, diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index eb35a8e8c744ba93d5429facb46e40396463603a..af1fcbcd2143aea4b470b4f2f30ffbf8237d3418 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -122,6 +122,12 @@ c10::DataPtr NPUPluggableAllocator::allocate(size_t size) const return data_ptr; } +c10::DataPtr NPUPluggableAllocator::allocate_with_aligned(size_t size, size_t base_addr_aligned_kb) const +{ + TORCH_CHECK(false, "NPUPluggableAllocator does't has allocate_with_aligned", PTA_ERROR(ErrCode::NOT_SUPPORT)); + return c10::DataPtr(); +} + c10::DeleterFnPtr NPUPluggableAllocator::raw_deleter() const { return &custom_raw_deleter; diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index 837b349d500fafd0ef803d03870f00405299852f..8e0b56f34e1532b9289d86d54c3e5527518ca607 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -51,6 +51,7 @@ struct NPUPluggableAllocator void* malloc(size_t size, int device, aclrtStream stream); c10::DataPtr allocate(size_t size) const override; + c10::DataPtr allocate_with_aligned(size_t size, size_t base_addr_aligned_kb) const override; c10::DeleterFnPtr raw_deleter() const override; void* raw_alloc(size_t nbytes) override; diff --git a/torch_npu/csrc/npu/Stream.cpp b/torch_npu/csrc/npu/Stream.cpp index 53e84e4e1048c54c53a49054b917a80e023e2174..180fede5ec3acb34785a91e2fa21bb48640938bf 100644 --- a/torch_npu/csrc/npu/Stream.cpp +++ b/torch_npu/csrc/npu/Stream.cpp @@ -174,37 +174,37 @@ PyTypeObject THNPStreamType = { 0, /* tp_itemsize */ (destructor)THNPStream_dealloc, /* tp_dealloc */ 0, /* tp_vectorcall_offset */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + nullptr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ nullptr, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ THNPStream_methods, /* tp_methods */ THNPStream_members, /* tp_members */ THNPStream_properties, /* 
tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ THNPStream_pynew, /* tp_new */ }; @@ -250,3 +250,12 @@ std::vector> THNPUtils_PySequence_to_NPUStream } return streams; } + +c10_npu::NPUStream THNPUtils_PyObject_to_NPUStream(PyObject* stream) +{ + TORCH_CHECK(PyObject_IsInstance(stream, THNPStreamClass), "Need torch_npu.npu.Stream argument type."); + return c10_npu::NPUStream::unpack3( + (reinterpret_cast(stream))->stream_id, + (reinterpret_cast(stream))->device_index, + static_cast((reinterpret_cast(stream))->device_type)); +} diff --git a/torch_npu/csrc/npu/Stream.h b/torch_npu/csrc/npu/Stream.h index f51479f2b06002a913b13eb840cc178a1cd4b32a..f6f084bca3be71c4d01bb126ebac21eb1991c29e 100644 --- a/torch_npu/csrc/npu/Stream.h +++ b/torch_npu/csrc/npu/Stream.h @@ -21,4 +21,6 @@ inline bool THNPStream_Check(PyObject* obj) TORCH_NPU_API std::vector> THNPUtils_PySequence_to_NPUStreamList(PyObject* obj); +c10_npu::NPUStream THNPUtils_PyObject_to_NPUStream(PyObject* py_stream); + #endif // THNP_STREAM_INC diff --git a/torch_npu/csrc/npu/memory_snapshot.cpp b/torch_npu/csrc/npu/memory_snapshot.cpp index 604a25038aea13d0d4fc07d2aa88df713658b81f..9f0aadbcd7628db5d6bb94df9ad8aa7c6eecbb72 100644 --- a/torch_npu/csrc/npu/memory_snapshot.cpp +++ b/torch_npu/csrc/npu/memory_snapshot.cpp @@ -1,10 +1,11 @@ #include #include +#include "torch_npu/csrc/utils/LazyInit.h" #include "torch_npu/csrc/profiler/combined_traceback.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" #include "torch_npu/csrc/npu/memory_snapshot.h" -#include "torch_npu/csrc/utils/LazyInit.h" using torch::jit::Pickler; using c10_npu::NPUCachingAllocator::BlockInfo; @@ -65,6 +66,7 @@ void _record_memory_history(c10::optional enabled, } c10_npu::NPUCachingAllocator::recordHistory(enabled.has_value(), recorder, max_entries, when); + c10_npu::NPUWorkspaceAllocator::recordHistory(enabled.has_value(), recorder, when); } std::string write_pickle(const c10::IValue& v) diff --git a/torch_npu/csrc/profiler/init.cpp b/torch_npu/csrc/profiler/init.cpp index a367c250e3af0d5c2937130f1193fa829b0eed6e..ce73aef70366436f53f5291663a6c3e0b08e339a 100644 --- a/torch_npu/csrc/profiler/init.cpp +++ b/torch_npu/csrc/profiler/init.cpp @@ -22,7 +22,8 @@ namespace torch_npu { namespace profiler { -PyObject* profiler_initExtension(PyObject* _unused, PyObject *unused) { +PyObject* profiler_initExtension(PyObject* _unused, PyObject *unused) +{ auto torch_npu_C_module = THPObjectPtr(PyImport_ImportModule("torch_npu._C")); if (!torch_npu_C_module) { return nullptr; @@ -35,29 +36,45 @@ PyObject* profiler_initExtension(PyObject* _unused, PyObject *unused) { .value("NPU", NpuActivityType::NPU); py::class_(m, "_ExperimentalConfig") - .def(py::init(), + .def(py::init, + std::vector, std::vector, bool, bool>(), py::arg("trace_level") = "Level0", py::arg("metrics") = "ACL_AICORE_NONE", py::arg("l2_cache") = false, py::arg("record_op_args") = false, py::arg("msprof_tx") = false, - py::arg("op_attr") = false + py::arg("op_attr") = false, + py::arg("host_sys") = std::vector{}, + py::arg("mstx_domain_include") = std::vector{}, + py::arg("mstx_domain_exclude") = std::vector{}, + py::arg("sys_io") = false, + py::arg("sys_interconnection") = false ) 
.def(py::pickle( [](const ExperimentalConfig& p) { - return py::make_tuple(p.trace_level, p.metrics, p.l2_cache, p.record_op_args, p.msprof_tx, p.op_attr); + return py::make_tuple(p.trace_level, p.metrics, p.l2_cache, p.record_op_args, p.msprof_tx, p.op_attr, + p.host_sys, p.mstx_domain_include, p.mstx_domain_exclude, p.sys_io, + p.sys_interconnection); }, [](py::tuple t) { - if (t.size() < 6) { // 6表示ExperimentalConfig的配置有六项 - throw std::runtime_error("Expected atleast 5 values in state" + PROF_ERROR(ErrCode::PARAM)); + if (t.size() < static_cast(ExperConfigType::CONFIG_TYPE_MAX_COUNT)) { + throw std::runtime_error( + "Expected at least " + std::to_string(static_cast(ExperConfigType::CONFIG_TYPE_MAX_COUNT)) + + " values in state" + PROF_ERROR(ErrCode::PARAM) + ); } return ExperimentalConfig( - t[0].cast(), - t[1].cast(), - t[2].cast(), - t[3].cast(), - t[4].cast(), - t[5].cast() + t[static_cast(ExperConfigType::TRACE_LEVEL)].cast(), + t[static_cast(ExperConfigType::METRICS)].cast(), + t[static_cast(ExperConfigType::L2_CACHE)].cast(), + t[static_cast(ExperConfigType::RECORD_OP_ARGS)].cast(), + t[static_cast(ExperConfigType::MSPROF_TX)].cast(), + t[static_cast(ExperConfigType::OP_ATTR)].cast(), + t[static_cast(ExperConfigType::HOST_SYS)].cast>(), + t[static_cast(ExperConfigType::MSTX_DOMAIN_INCLUDE)].cast>(), + t[static_cast(ExperConfigType::MSTX_DOMAIN_EXCLUDE)].cast>(), + t[static_cast(ExperConfigType::SYS_IO)].cast(), + t[static_cast(ExperConfigType::SYS_INTERCONNECTION)].cast() ); } )); @@ -97,20 +114,50 @@ static PyMethodDef TorchProfilerMethods[] = { // NOLINT }; -PyMethodDef* profiler_functions() { +PyMethodDef* profiler_functions() +{ return TorchProfilerMethods; } +PyObject* THNPModule_markOnHost(PyObject* _unused, PyObject* args) +{ + HANDLE_TH_ERRORS + const char* message; + const char* domain; + if (!PyArg_ParseTuple(args, "ss", &message, &domain)) { + return nullptr; + } + mstxMark(message, nullptr, domain); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THNPModule_mark(PyObject* _unused, PyObject* args) +{ + HANDLE_TH_ERRORS + const char* message; + const char* domain; + PyObject* stream_o = nullptr; + if (!PyArg_ParseTuple(args, "sOs", &message, &stream_o, &domain)) { + return nullptr; + } + aclrtStream stream = static_cast(PyLong_AsVoidPtr(stream_o)); + mstxMark(message, stream, domain); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + PyObject* THNPModule_rangeStart(PyObject* _unused, PyObject* args) { HANDLE_TH_ERRORS - char *message; + const char* message; + const char* domain; PyObject* stream_o = nullptr; - if (!PyArg_ParseTuple(args, "sO", &message, &stream_o)) { + if (!PyArg_ParseTuple(args, "sOs", &message, &stream_o, &domain)) { return nullptr; } aclrtStream stream = static_cast(PyLong_AsVoidPtr(stream_o)); - int id = mstxRangeStart(message, stream); + int id = mstxRangeStart(message, stream, domain); return PyLong_FromLong(id); END_HANDLE_TH_ERRORS } @@ -118,11 +165,12 @@ PyObject* THNPModule_rangeStart(PyObject* _unused, PyObject* args) PyObject* THNPModule_rangeStartOnHost(PyObject* _unused, PyObject* args) { HANDLE_TH_ERRORS - char *message; - if (!PyArg_ParseTuple(args, "s", &message)) { + const char* message; + const char* domain; + if (!PyArg_ParseTuple(args, "ss", &message, &domain)) { return nullptr; } - int id = mstxRangeStart(message, nullptr); + int id = mstxRangeStart(message, nullptr, domain); return PyLong_FromLong(id); END_HANDLE_TH_ERRORS } @@ -130,16 +178,19 @@ PyObject* THNPModule_rangeStartOnHost(PyObject* _unused, PyObject* args) 
PyObject* THNPModule_rangeEnd(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - mstxRangeId rangeId; - if (!PyArg_ParseTuple(args, "k", &rangeId)) { + int rangeId; + const char* domain; + if (!PyArg_ParseTuple(args, "is", &rangeId, &domain)) { return nullptr; } - mstxRangeEnd(rangeId); + mstxRangeEnd(rangeId, domain); Py_RETURN_NONE; END_HANDLE_TH_ERRORS } static std::vector mstxMethods = { + {"_mark_on_host", (PyCFunction)THNPModule_markOnHost, METH_VARARGS, nullptr}, + {"_mark", (PyCFunction)THNPModule_mark, METH_VARARGS, nullptr}, {"_range_start_on_host", (PyCFunction)THNPModule_rangeStartOnHost, METH_VARARGS, nullptr}, {"_range_start", (PyCFunction)THNPModule_rangeStart, METH_VARARGS, nullptr}, {"_range_end", (PyCFunction)THNPModule_rangeEnd, METH_VARARGS, nullptr}, diff --git a/torch_npu/csrc/profiler/init.h b/torch_npu/csrc/profiler/init.h index f29ff5d5f4fe50d88d0ddd7191fce6980b94510a..3a3a0788a99565c4511712ec623b23aaf93daa75 100644 --- a/torch_npu/csrc/profiler/init.h +++ b/torch_npu/csrc/profiler/init.h @@ -4,6 +4,21 @@ namespace torch_npu { namespace profiler { +enum class ExperConfigType { + TRACE_LEVEL = 0, + METRICS, + L2_CACHE, + RECORD_OP_ARGS, + MSPROF_TX, + OP_ATTR, + HOST_SYS, + MSTX_DOMAIN_INCLUDE, + MSTX_DOMAIN_EXCLUDE, + SYS_IO, + SYS_INTERCONNECTION, + CONFIG_TYPE_MAX_COUNT // 表示枚举的总数,固定放在枚举的最后一个 +}; + TORCH_NPU_API PyMethodDef* profiler_functions(); TORCH_NPU_API void initMstx(PyObject *module); } diff --git a/torch_npu/csrc/profiler/msprof_tx.h b/torch_npu/csrc/profiler/msprof_tx.h deleted file mode 100644 index b6dd9b366975725db51cbfb91de7c45f829492ba..0000000000000000000000000000000000000000 --- a/torch_npu/csrc/profiler/msprof_tx.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef __TORCH_NPU_MSPROF_TX__ -#define __TORCH_NPU_MSPROF_TX__ - -#include "torch_npu/csrc/core/npu/NPUStream.h" -#include "torch_npu/csrc/core/npu/NPUFunctions.h" -#include "torch_npu/csrc/framework/interface/MsProfilerInterface.h" -#include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/profiler/npu_profiler.h" - -namespace torch_npu { -namespace profiler { - -void profMark(const char *message, aclrtStream stream) -{ - if (at_npu::native::IsSupportMstxFunc()) { - at_npu::native::MstxMarkA(message, stream); - } else { - (void)at_npu::native::AclProfilingMarkEx(message, strlen(message), stream); - } -} - -void Mark(const char *message) -{ - if (!mstxEnable()) { - return; - } - RECORD_FUNCTION("mark_op", std::vector({})); - c10::DeviceIndex device_id = -1; - aclrtStream stream = c10_npu::getCurrentNPUStreamNoWait(device_id); - auto mark_call = [msg_ptr = std::make_shared(message), stream]() -> int { - (void)profMark(msg_ptr->c_str(), stream); - return 0; - }; - at_npu::native::OpCommand::RunOpApi("mstx_mark_op", mark_call); -} - -} // profiler -} // torch_npu - -#endif diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index 671908257ba0c63d7df1ebd794ac0b99e601e7c9..5dd49495cf86d0761c4da40ea7532adc60219233 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -11,54 +11,95 @@ namespace torch_npu { namespace profiler { +void markImpl(const char* message, const aclrtStream stream, mstxDomainHandle_t domain) +{ + if (domain == nullptr) { + (void)at_npu::native::MstxMarkA(message, stream); + } else { + (void)at_npu::native::MstxDomainMarkA(domain, message, stream); + } +} + +void rangeStartImpl(const char* message, const aclrtStream stream, int ptRangeId, mstxDomainHandle_t domain) +{ + if (domain == 
nullptr) { + (void)at_npu::native::MstxRangeStartA(message, stream, ptRangeId); + } else { + (void)at_npu::native::MstxDomainRangeStartA(domain, message, stream, ptRangeId); + } +} + +void rangeEndImpl(int ptRangeId, mstxDomainHandle_t domain) +{ + if (domain == nullptr) { + at_npu::native::MstxRangeEnd(ptRangeId); + } else { + at_npu::native::MstxDomainRangeEnd(domain, ptRangeId); + } +} + MstxMgr::MstxMgr() { } -void MstxMgr::mark(const char* message, const aclrtStream stream) +void MstxMgr::mark(const char* message, const aclrtStream stream, const char* domain) { if (!isMstxEnable()) { return; } - int id = ptRangeId_++; + std::string domainStr(domain); + if (!isMstxTxDomainEnable(domainStr)) { + return; + } + mstxDomainHandle_t domainHandle = createProfDomain(domainStr); if (stream == nullptr) { - (void)at_npu::native::MstxMarkA(message, nullptr); + markImpl(message, nullptr, domainHandle); return; } - auto mark_call = [msg_ptr = std::make_shared(message), stream]() -> int { - (void)at_npu::native::MstxMarkA(msg_ptr->c_str(), stream); + auto mark_call = [msg_ptr = std::make_shared(message), stream, domainHandle]() -> int { + markImpl(msg_ptr->c_str(), stream, domainHandle); return 0; }; - at_npu::native::OpCommand::RunOpApi("mstx_mark_op", mark_call); + at_npu::native::OpCommand::RunOpApiV2("mstx_mark_op", mark_call); } -int MstxMgr::rangeStart(const char* message, const aclrtStream stream) +int MstxMgr::rangeStart(const char* message, const aclrtStream stream, const char* domain) { if (!isMstxEnable()) { return 0; } + std::string domainStr(domain); + if (!isMstxTxDomainEnable(domainStr)) { + return 0; + } + mstxDomainHandle_t domainHandle = createProfDomain(domainStr); int id = ptRangeId_++; if (stream == nullptr) { - int res = at_npu::native::MstxRangeStartA(message, nullptr, id); + rangeStartImpl(message, nullptr, id, domainHandle); return id; } { std::lock_guard lock(mtx_); ptRangeIdsWithStream_.insert(id); } - auto range_start_call = [msg_ptr = std::make_shared(message), stream, id]() -> int { - int taskId = at_npu::native::MstxRangeStartA(msg_ptr->c_str(), stream, id); + auto range_start_call = [msg_ptr = std::make_shared(message), stream, id, domainHandle]() -> int { + rangeStartImpl(msg_ptr->c_str(), stream, id, domainHandle); return 0; }; - at_npu::native::OpCommand::RunOpApi("mstx_range_start_op", range_start_call); + at_npu::native::OpCommand::RunOpApiV2("mstx_range_start_op", range_start_call); return id; } -void MstxMgr::rangeEnd(int ptRangeId) +void MstxMgr::rangeEnd(int ptRangeId, const char* domain) { if (!isMstxEnable() || ptRangeId == 0) { return; } + std::string domainStr(domain); + if (!isMstxTxDomainEnable(domainStr)) { + return; + } + mstxDomainHandle_t domainHandle = createProfDomain(domainStr); bool rangeIdWithStream = false; { std::lock_guard lock(mtx_); @@ -69,14 +110,14 @@ void MstxMgr::rangeEnd(int ptRangeId) } } if (!rangeIdWithStream) { - at_npu::native::MstxRangeEnd(ptRangeId); + rangeEndImpl(ptRangeId, domainHandle); return; } - auto range_end_call = [ptRangeId]() -> int { - at_npu::native::MstxRangeEnd(ptRangeId); + auto range_end_call = [ptRangeId, domainHandle]() -> int { + rangeEndImpl(ptRangeId, domainHandle); return 0; }; - at_npu::native::OpCommand::RunOpApi("mstx_range_end_op", range_end_call); + at_npu::native::OpCommand::RunOpApiV2("mstx_range_end_op", range_end_call); } int MstxMgr::getRangeId() @@ -84,86 +125,42 @@ int MstxMgr::getRangeId() return ptRangeId_++; } -mstxDomainHandle_t MstxMgr::createDomain(const char* name) +mstxDomainHandle_t 
MstxMgr::createProfDomain(const std::string &name) { - if (!isMsleaksEnable() && !isMstxEnable()) { + if (!at_npu::native::IsSupportMstxDomainFunc()) { return nullptr; } - return at_npu::native::MstxDomainCreateA(name); -} - -void MstxMgr::destroyDomain(mstxDomainHandle_t domain) -{ - at_npu::native::MstxDomainDestroy(domain); -} - -void MstxMgr::domainMark(mstxDomainHandle_t domain, const char* message, const aclrtStream stream) -{ - if (!isMstxEnable()) { - return; + if (name == DOMAIN_DEFAULT) { // don't need to create default domain + return nullptr; } - int id = ptRangeId_++; - if (stream == nullptr) { - (void)at_npu::native::MstxDomainMarkA(domain, message, nullptr); - return; + std::lock_guard lock(mstxDomainsMtx); + auto iter = mstxDomains_.find(name); + if (iter != mstxDomains_.end()) { + return iter->second; } - auto mark_call = [domain, msg_ptr = std::make_shared(message), stream]() -> int { - (void)at_npu::native::MstxDomainMarkA(domain, msg_ptr->c_str(), stream); - return 0; - }; - at_npu::native::OpCommand::RunOpApi("mstx_domain_mark_op", mark_call); + mstxDomainHandle_t handle = at_npu::native::MstxDomainCreateA(name.c_str()); + if (handle != nullptr) { + mstxDomains_.emplace(name, handle); + } + return handle; } -int MstxMgr::domainRangeStart(mstxDomainHandle_t domain, const char* message, const aclrtStream stream) +mstxDomainHandle_t MstxMgr::createLeaksDomain(const char* name) { - if (!isMstxEnable()) { - return 0; - } - int id = ptRangeId_++; - if (stream == nullptr) { - int res = at_npu::native::MstxDomainRangeStartA(domain, message, nullptr, id); - return id; - } - { - std::lock_guard lock(mtx_); - ptRangeIdsWithStream_.insert(id); + if (!at_npu::native::IsSupportMstxFunc()) { + return nullptr; } - auto range_start_call = [domain, msg_ptr = std::make_shared(message), stream, id]() -> int { - int taskId = at_npu::native::MstxDomainRangeStartA(domain, msg_ptr->c_str(), stream, id); - return 0; - }; - at_npu::native::OpCommand::RunOpApi("mstx_domain_range_start_op", range_start_call); - return id; + return at_npu::native::MstxDomainCreateA(name); } -void MstxMgr::domainRangeEnd(mstxDomainHandle_t domain, int ptRangeId) +void MstxMgr::destroyDomain(mstxDomainHandle_t domain) { - if (!isMstxEnable() || ptRangeId == 0) { - return; - } - bool rangeIdWithStream = false; - { - std::lock_guard lock(mtx_); - auto iter = ptRangeIdsWithStream_.find(ptRangeId); - if (iter != ptRangeIdsWithStream_.end()) { - rangeIdWithStream = true; - ptRangeIdsWithStream_.erase(iter); - } - } - if (!rangeIdWithStream) { - at_npu::native::MstxDomainRangeEnd(domain, ptRangeId); - return; - } - auto range_end_call = [domain, ptRangeId]() -> int { - at_npu::native::MstxDomainRangeEnd(domain, ptRangeId); - return 0; - }; - at_npu::native::OpCommand::RunOpApi("mstx_domain_range_end_op", range_end_call); + at_npu::native::MstxDomainDestroy(domain); } mstxMemHeapHandle_t MstxMgr::memHeapRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc) { - if (!isMsleaksEnable() || desc==nullptr) { + if (!at_npu::native::IsSupportMstxFunc() || desc == nullptr) { return nullptr; } mstxMemHeapDesc_t heapDesc; @@ -173,7 +170,7 @@ mstxMemHeapHandle_t MstxMgr::memHeapRegister(mstxDomainHandle_t domain, mstxMemV void MstxMgr::memHeapUnregister(mstxDomainHandle_t domain, void* ptr) { - if (!isMsleaksEnable() || ptr == nullptr) { + if (!at_npu::native::IsSupportMstxFunc() || ptr == nullptr) { return; } at_npu::native::MstxMemHeapUnregister(domain, reinterpret_cast(ptr)); @@ -181,7 +178,7 @@ void 
MstxMgr::memHeapUnregister(mstxDomainHandle_t domain, void* ptr) void MstxMgr::memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc) { - if (!isMsleaksEnable() || desc == nullptr) { + if (!at_npu::native::IsSupportMstxFunc() || desc == nullptr) { return; } mstxMemRegionsRegisterBatch_t batch; @@ -192,7 +189,7 @@ void MstxMgr::memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeD void MstxMgr::memRegionsUnregister(mstxDomainHandle_t domain, void* ptr) { - if (!isMsleaksEnable() || ptr == nullptr) { + if (!at_npu::native::IsSupportMstxFunc() || ptr == nullptr) { return; } mstxMemRegionsUnregisterBatch_t unregisterBatch; @@ -266,5 +263,13 @@ bool MstxMgr::isMstxEnable() { return isProfTxEnable() || isMsptiTxEnable(); } + +bool MstxMgr::isMstxTxDomainEnable(const std::string &domainName) +{ + if (isProfTxEnable()) { + return ProfilerMgr::GetInstance()->IsMstxDomainEnabled(domainName); + } + return true; +} } } \ No newline at end of file diff --git a/torch_npu/csrc/profiler/mstx_mgr.h b/torch_npu/csrc/profiler/mstx_mgr.h index bea6f59bea71702e3619a55f03b74770cefb452e..d32743f586d5054faff371955b5910f5c0fbc367 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.h +++ b/torch_npu/csrc/profiler/mstx_mgr.h @@ -12,22 +12,23 @@ namespace torch_npu { namespace profiler { const std::string DOMAIN_COMMUNICATION = "communication"; +const std::string DOMAIN_DEFAULT = "default"; const std::string DOMAIN_MSLEAKS = "msleaks"; class MstxMgr : public torch_npu::toolkit::profiler::Singleton { friend class torch_npu::toolkit::profiler::Singleton; public: - void mark(const char* message, const aclrtStream stream); - int rangeStart(const char* message, const aclrtStream stream); - void rangeEnd(int ptRangeId); + void mark(const char* message, const aclrtStream stream, const char* domain); + int rangeStart(const char* message, const aclrtStream stream, const char* domain); + void rangeEnd(int ptRangeId, const char* domain); + + bool isMsleaksEnable(); bool isMstxEnable(); int getRangeId(); - - mstxDomainHandle_t createDomain(const char* name); + bool isMstxTxDomainEnable(const std::string &domainName); + mstxDomainHandle_t createProfDomain(const std::string &name); + mstxDomainHandle_t createLeaksDomain(const char* name); void destroyDomain(mstxDomainHandle_t domain); - void domainMark(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); - int domainRangeStart(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); - void domainRangeEnd(mstxDomainHandle_t domain, int ptRangeId); mstxMemHeapHandle_t memHeapRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc); void memHeapUnregister(mstxDomainHandle_t domain, void* ptr); void memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc); @@ -40,15 +41,17 @@ private: explicit MstxMgr(MstxMgr &&obj) = delete; MstxMgr& operator=(MstxMgr &&obj) = delete; - bool isMsleaksEnable(); bool isMsleaksEnableImpl(); bool isProfTxEnable(); bool isMsptiTxEnable(); bool isMsptiTxEnableImpl(); + private: std::atomic ptRangeId_{1}; std::unordered_set ptRangeIdsWithStream_; std::mutex mtx_; + std::mutex mstxDomainsMtx; + std::unordered_map mstxDomains_; }; } } // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/profiler/npu_profiler.cpp b/torch_npu/csrc/profiler/npu_profiler.cpp index 2d726162e54e3b0acea6279a69fc4fd4005cf9ea..295eda9aea1f2a425c21caaf037e4f60713a463e 100644 --- a/torch_npu/csrc/profiler/npu_profiler.cpp +++ 
b/torch_npu/csrc/profiler/npu_profiler.cpp @@ -131,6 +131,7 @@ struct NpuProfilerThreadLocalState : public ProfilerStateBase { device.index(), 0, 0, + 0, Utils::GetTid(), Utils::GetPid() )); @@ -309,7 +310,8 @@ void warmupNpuProfiler(const NpuProfilerConfig &config, ExperimentalConfig experimental_config = config.experimental_config; NpuTraceConfig npu_config = {experimental_config.trace_level, experimental_config.metrics, config.profile_memory, experimental_config.l2_cache, experimental_config.record_op_args, - experimental_config.msprof_tx, experimental_config.op_attr}; + experimental_config.msprof_tx, experimental_config.op_attr, experimental_config.host_sys, experimental_config.mstx_domain_include, + experimental_config.mstx_domain_exclude, experimental_config.sys_io, experimental_config.sys_interconnection}; ProfilerMgr::GetInstance()->Warmup(npu_config, cpu_trace); } @@ -327,7 +329,8 @@ void startNpuProfiler(const NpuProfilerConfig &config, ExperimentalConfig experimental_config = config.experimental_config; NpuTraceConfig npu_config = {experimental_config.trace_level, experimental_config.metrics, config.profile_memory, experimental_config.l2_cache, experimental_config.record_op_args, - experimental_config.msprof_tx, experimental_config.op_attr}; + experimental_config.msprof_tx, experimental_config.op_attr, experimental_config.host_sys, experimental_config.mstx_domain_include, + experimental_config.mstx_domain_exclude, experimental_config.sys_io, experimental_config.sys_interconnection}; ProfilerMgr::GetInstance()->Start(npu_config, cpu_trace); if (state->tracePython()) { python_tracer::call(python_tracer::Command::kStartAll); @@ -387,6 +390,7 @@ void reportMemoryDataToNpuProfiler(const MemoryUsage& data) data.stream_ptr, data.device_type, data.device_index, + data.component_type, data.data_type, data.allocator_type, Utils::GetTid(), diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index 2a6f44a318f46d3c67a3df2d9be4fc2001e556e3..2127825bc134e0b49178fe00890cdff58011e62c 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -10,6 +10,7 @@ #include "torch_npu/csrc/toolkit/profiler/inc/data_reporter.h" #include "torch_npu/csrc/profiler/profiler_mgr.h" #include "torch_npu/csrc/profiler/mstx_mgr.h" +#include "torch_npu/csrc/framework/interface/MsProfilerInterface.h" namespace torch_npu { namespace profiler { @@ -20,9 +21,14 @@ void registerFunctions(CallFn call); } // python_tracer enum class NpuActivityType { - NONE = 0, - CPU, - NPU, + NONE = 0, + CPU, + NPU, +}; + +enum class MemoryComponentType { + CACHING_ALLOCATOR = 0, + WORKSPACE_ALLOCATOR, }; enum class MemoryDataType { @@ -41,6 +47,7 @@ enum class MemoryAllocatorType { struct MemoryUsage { int8_t device_type{0}; int8_t device_index{0}; + uint8_t component_type{static_cast(MemoryComponentType::CACHING_ALLOCATOR)}; uint8_t data_type{static_cast(MemoryDataType::MEMORY_INVALID)}; uint8_t allocator_type{static_cast(MemoryAllocatorType::ALLOCATOR_INVALID)}; int64_t ptr{0}; @@ -54,13 +61,20 @@ struct MemoryUsage { struct ExperimentalConfig { ExperimentalConfig(std::string level = "Level0", std::string metrics = "ACL_AICORE_NONE", bool l2_cache = false, bool record_op_args = false, bool msprof_tx = false, - bool op_attr = false) + bool op_attr = false, std::vector host_sys = {}, std::vector mstx_domain_include = {}, + std::vector mstx_domain_exclude = {}, bool sys_io = false, + bool sys_interconnection = false) : trace_level(level), metrics(metrics), 
l2_cache(l2_cache), record_op_args(record_op_args), msprof_tx(msprof_tx), - op_attr(op_attr) {} + op_attr(op_attr), + host_sys(host_sys), + mstx_domain_include(mstx_domain_include), + mstx_domain_exclude(mstx_domain_exclude), + sys_io(sys_io), + sys_interconnection(sys_interconnection) {} ~ExperimentalConfig() = default; std::string trace_level; @@ -69,24 +83,29 @@ struct ExperimentalConfig { bool record_op_args; bool msprof_tx; bool op_attr; + std::vector host_sys; + std::vector mstx_domain_include; + std::vector mstx_domain_exclude; + bool sys_io; + bool sys_interconnection; }; struct NpuProfilerConfig { - explicit NpuProfilerConfig( - std::string path, - bool record_shapes = false, - bool profile_memory = false, - bool with_stack = false, - bool with_flops = false, - bool with_modules = false, - ExperimentalConfig experimental_config = ExperimentalConfig()) - : path(path), - record_shapes(record_shapes), - profile_memory(profile_memory), - with_stack(with_stack), - with_flops(with_flops), - with_modules(with_modules), - experimental_config(experimental_config) {} + explicit NpuProfilerConfig( + std::string path, + bool record_shapes = false, + bool profile_memory = false, + bool with_stack = false, + bool with_flops = false, + bool with_modules = false, + ExperimentalConfig experimental_config = ExperimentalConfig()) + : path(path), + record_shapes(record_shapes), + profile_memory(profile_memory), + with_stack(with_stack), + with_flops(with_flops), + with_modules(with_modules), + experimental_config(experimental_config) {} ~NpuProfilerConfig() = default; std::string path; @@ -114,14 +133,23 @@ void reportMarkDataToNpuProfiler(uint32_t category, const std::string &msg, uint void reportMemoryDataToNpuProfiler(const MemoryUsage& data); -inline int mstxRangeStart(const char* message, const aclrtStream stream) +inline void mstxMark(const char* message, const aclrtStream stream, const char* domain) +{ + if (at_npu::native::IsSupportMstxFunc()) { + MstxMgr::GetInstance()->mark(message, stream, domain); + } else { + (void)at_npu::native::AclProfilingMarkEx(message, strlen(message), stream); + } +} + +inline int mstxRangeStart(const char* message, const aclrtStream stream, const char* domain) { - return MstxMgr::GetInstance()->rangeStart(message, stream); + return MstxMgr::GetInstance()->rangeStart(message, stream, domain); } -inline void mstxRangeEnd(int id) +inline void mstxRangeEnd(int id, const char* domain) { - MstxMgr::GetInstance()->rangeEnd(id); + MstxMgr::GetInstance()->rangeEnd(id, domain); } inline bool mstxEnable() @@ -139,8 +167,10 @@ struct MstxRange { } rangeId = MstxMgr::GetInstance()->getRangeId(); if (at_npu::native::IsSupportMstxDomainFunc()) { - domainHandle = MstxMgr::GetInstance()->createDomain(domainName.c_str()); - at_npu::native::MstxDomainRangeStartA(domainHandle, message.c_str(), stream, rangeId); + if (MstxMgr::GetInstance()->isMstxTxDomainEnable(domainName)) { + domainHandle = MstxMgr::GetInstance()->createProfDomain(domainName); + at_npu::native::MstxDomainRangeStartA(domainHandle, message.c_str(), stream, rangeId); + } } else { at_npu::native::MstxRangeStartA(message.c_str(), stream, rangeId); } @@ -152,7 +182,9 @@ struct MstxRange { return; } if (at_npu::native::IsSupportMstxDomainFunc()) { - at_npu::native::MstxDomainRangeEnd(domainHandle, rangeId); + if (domainHandle != nullptr) { + at_npu::native::MstxDomainRangeEnd(domainHandle, rangeId); + } } else { at_npu::native::MstxRangeEnd(rangeId); } diff --git a/torch_npu/csrc/profiler/profiler_mgr.cpp 
b/torch_npu/csrc/profiler/profiler_mgr.cpp index b773e2cf4fae46e956ea2ea8464ad237e23b8d7f..eae7c9c5afd266e67adb6392db4993d3bd2d14a7 100644 --- a/torch_npu/csrc/profiler/profiler_mgr.cpp +++ b/torch_npu/csrc/profiler/profiler_mgr.cpp @@ -89,7 +89,8 @@ void ProfilerMgr::WarmupMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, a } } -void ProfilerMgr::EnableMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, aclprofAicoreMetrics aicMetrics, uint64_t dataTypeConfig) { +void ProfilerMgr::EnableMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, aclprofAicoreMetrics aicMetrics, uint64_t dataTypeConfig) +{ // Avoid duplicate config creation in scenarios where warmup is turned on if (profConfig_ == nullptr) { profConfig_ = at_npu::native::AclProfilingCreateConfig(deviceIdList, deviceNum, aicMetrics, nullptr, dataTypeConfig); @@ -105,6 +106,14 @@ void ProfilerMgr::EnableMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, a ASCEND_LOGE("Profiling start failed."); return; } + + ASCEND_LOGI("Try to register set device callback function."); + ret = at_npu::native::AclProfilingRegisterDeviceCallback(); + if (ret == ACL_ERROR_PROF_MODULES_UNSUPPORTED) { + ASCEND_LOGW("Not support set device callback function."); + } else if (ret != ACL_SUCCESS) { + ASCEND_LOGE("Failed to register set device callback function."); + } } uint64_t ProfilerMgr::PrepareProfilerConfig(const NpuTraceConfig &npu_config) @@ -129,6 +138,8 @@ uint64_t ProfilerMgr::PrepareProfilerConfig(const NpuTraceConfig &npu_config) ASCEND_LOGW("not support to set config for sys-hardware-mem."); } } + PrepareProfilerHostSysConfig(npu_config.host_sys); + PrepareProfilerDeviceSysConfig(npu_config); if (npu_config.op_attr) { datatype_config |= ACL_PROF_OP_ATTR; } @@ -136,6 +147,44 @@ uint64_t ProfilerMgr::PrepareProfilerConfig(const NpuTraceConfig &npu_config) return datatype_config; } +void ProfilerMgr::PrepareProfilerHostSysConfig(const std::vector &host_sys) +{ + if (!host_sys.empty()) { + std::string hostSysStr; + for (size_t i = 0; i < host_sys.size(); ++i) { + if (i > 0) { + hostSysStr += ","; + } + hostSysStr += host_sys[i]; + } + aclError hostSysRet = at_npu::native::AclprofSetConfig(ACL_PROF_HOST_SYS, hostSysStr.c_str(), hostSysStr.size()); + if (hostSysRet != ACL_SUCCESS) { + ASCEND_LOGE("Failed call aclprofSetConfig to ACL_PROF_HOST_SYS. error_code: %d", + static_cast(hostSysRet)); + } + } +} + +void ProfilerMgr::PrepareProfilerDeviceSysConfig(const NpuTraceConfig &npu_config) +{ + if (npu_config.sys_io) { + const std::string sysIoFreq = "100"; + aclError sysIoRet = at_npu::native::AclprofSetConfig(ACL_PROF_SYS_IO_FREQ, sysIoFreq.c_str(), sysIoFreq.size()); + if (sysIoRet != ACL_SUCCESS) { + ASCEND_LOGW("Failed call aclprofSetConfig to ACL_PROF_SYS_IO_FREQ. error_code : %d", + static_cast(sysIoRet)); + } + } + if (npu_config.sys_interconnection) { + const std::string sysInterconnectionFreq = "50"; + aclError sysInterconnectionRet = + at_npu::native::AclprofSetConfig(ACL_PROF_SYS_INTERCONNECTION_FREQ, sysInterconnectionFreq.c_str(), sysInterconnectionFreq.size()); + if (sysInterconnectionRet != ACL_SUCCESS) { + ASCEND_LOGW("Failed call aclprofSetConfig to ACL_PROF_SYS_INTERCONNECTION_FREQ. 
error_code : %d", + static_cast(sysInterconnectionRet)); + } + } +} aclprofAicoreMetrics ProfilerMgr::PrepareProfilerAicMetrics(const NpuTraceConfig &npu_config) { @@ -210,6 +259,8 @@ void ProfilerMgr::Start(const NpuTraceConfig &npu_config, bool cpu_trace) } enable_warmup_.store(false); msprof_tx_.store(npu_config.msprof_tx); + mstx_domain_include_ = npu_config.mstx_domain_include; + mstx_domain_exclude_ = npu_config.mstx_domain_exclude; if (npu_config.record_op_args) { record_op_args_.store(true); const std::string op_dump_path = std::string(path_.begin(), path_.begin() + path_.find_last_not_of("/") + 1) + @@ -235,6 +286,8 @@ void ProfilerMgr::Stop() profConfig_ = nullptr; } msprof_tx_.store(false); + mstx_domain_include_.clear(); + mstx_domain_exclude_.clear(); report_enable_.store(false); enable_warmup_.store(false); if (record_op_args_.load()) { @@ -333,5 +386,22 @@ int8_t GetTraceLevel() { return ProfilerMgr::GetInstance()->GetTraceLevel(); } + +bool ProfilerMgr::IsMstxDomainEnabled(const std::string &domainName) +{ + if (mstx_domain_include_.empty() && mstx_domain_exclude_.empty()) { + return true; + } + if (!mstx_domain_include_.empty()) { + return std::find(mstx_domain_include_.begin(), mstx_domain_include_.end(), domainName) != + mstx_domain_include_.end(); + } + if (!mstx_domain_exclude_.empty()) { + return std::find(mstx_domain_exclude_.begin(), mstx_domain_exclude_.end(), domainName) == + mstx_domain_exclude_.end(); + } + // both not empty, enable all domains + return true; +} } // profiler } // torch_npu diff --git a/torch_npu/csrc/profiler/profiler_mgr.h b/torch_npu/csrc/profiler/profiler_mgr.h index 6b6c20d858e99749d103953f8c2dbb5dcab92573..17c5911a4c190897f6d141e4b793fda024be1d4c 100644 --- a/torch_npu/csrc/profiler/profiler_mgr.h +++ b/torch_npu/csrc/profiler/profiler_mgr.h @@ -17,13 +17,18 @@ constexpr uint64_t Level1 = ACL_PROF_TASK_TIME | ACL_PROF_ACL_API | ACL_PROF_HCC constexpr uint64_t Level2 = Level1 | ACL_PROF_AICPU | ACL_PROF_RUNTIME_API; struct NpuTraceConfig { - std::string trace_level; - std::string metrics; - bool npu_memory; - bool l2_cache; - bool record_op_args; - bool msprof_tx; - bool op_attr; + std::string trace_level; + std::string metrics; + bool npu_memory; + bool l2_cache; + bool record_op_args; + bool msprof_tx; + bool op_attr; + std::vector host_sys; + std::vector mstx_domain_include; + std::vector mstx_domain_exclude; + bool sys_io; + bool sys_interconnection; }; C10_NPU_API int8_t GetTraceLevel(); @@ -62,6 +67,8 @@ public: return profile_memory_; } + bool IsMstxDomainEnabled(const std::string &domainName); + private: ProfilerMgr(); explicit ProfilerMgr(const ProfilerMgr &obj) = delete; @@ -71,6 +78,8 @@ private: void EnableMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, aclprofAicoreMetrics aicMetrics, uint64_t dataTypeConfig); void WarmupMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, aclprofAicoreMetrics aicMetrics, uint64_t dataTypeConfig); uint64_t PrepareProfilerConfig(const NpuTraceConfig &npu_config); + void PrepareProfilerDeviceSysConfig(const NpuTraceConfig &npu_config); + void PrepareProfilerHostSysConfig(const std::vector &host_sys); aclprofAicoreMetrics PrepareProfilerAicMetrics(const NpuTraceConfig &npu_config); uint64_t CheckFeatureConfig(uint64_t datatype_config); void StartDataReceiver(const std::string &fwk_path); @@ -92,6 +101,9 @@ private: torch_npu::toolkit::profiler::TraceDataDumper traceDataReceiver_; std::mutex reportDataMutex_; torch_npu::toolkit::profiler::DataDumper dataReceiverWithLock_; + + std::vector 
mstx_domain_include_; + std::vector mstx_domain_exclude_; }; } // profiler } // torch_npu diff --git a/torch_npu/csrc/profiler/profiler_python.cpp b/torch_npu/csrc/profiler/profiler_python.cpp index 704b65d923896afead8d01deb5225362e8bd8b84..571fb57bbce2c18e24ac6cd26aaedcd656968bd1 100644 --- a/torch_npu/csrc/profiler/profiler_python.cpp +++ b/torch_npu/csrc/profiler/profiler_python.cpp @@ -36,19 +36,6 @@ using TensorMetadata = torch_npu::toolkit::profiler::TensorMetadata; using ModuleParam = torch_npu::toolkit::profiler::ModuleParam; using OptimizerParam = torch_npu::toolkit::profiler::OptimizerParam; -std::string trimPrefix(std::string s) -{ - static std::vector prefixes = py::module::import("torch.profiler.python_tracer") - .attr("_prefix_regex")().cast>(); - for (const auto& p : prefixes) { - if (s.compare(0, p.size(), p) == 0) { - s.erase(0, p.size()); - return s; - } - } - return s; -} - std::vector getInterpreterThreads(PyInterpreterState* interpreter) { pybind11::gil_scoped_acquire gil; @@ -240,6 +227,7 @@ private: void reportTraceData(); void reportHashData(); void reportParamData(); + std::string trimPrefix(std::string s); private: std::atomic active_{false}; @@ -248,6 +236,7 @@ private: std::deque thread_local_results_; PyObject* module_call_code_{nullptr}; PyObject* optimizer_call_code_{nullptr}; + std::vector func_name_prefixes_; std::unordered_map py_call_cache_; std::unordered_map pyc_call_cache_; std::unordered_map module_info_cache_; @@ -277,6 +266,9 @@ PythonTracer::PythonTracer() : active_(false) .attr("_optimizer_step_code") .attr("__code__") .ptr(); + func_name_prefixes_ = py::module::import("torch.profiler.python_tracer") + .attr("_prefix_regex")() + .cast>(); } void PythonTracer::start(size_t max_threads) @@ -343,7 +335,7 @@ void PythonTracer::stop() { TORCH_INTERNAL_ASSERT(active_.load(), "PythonTracer is not running.", PROF_ERROR(ErrCode::INTERNAL)); - pybind11::gil_scoped_acquire gil; + GilAndRestoreThread gil; for (const auto thread_state : getInterpreterThreads(interpreter_)) { if (thread_state->c_profilefunc == &PythonTracer::pyProfileFn) { PyThreadState_Swap(thread_state); @@ -383,6 +375,17 @@ void PythonTracer::clear() interpreter_ = nullptr; } +std::string PythonTracer::trimPrefix(std::string s) +{ + for (const auto& p : func_name_prefixes_) { + if (s.compare(0, p.size(), p) == 0) { + s.erase(0, p.size()); + return s; + } + } + return s; +} + void PythonTracer::reportTraceData() { if (events_.size() > 0) { @@ -402,7 +405,7 @@ void PythonTracer::reportHashData() hash_data.resize(py_call_cache_.size() + pyc_call_cache_.size() + module_info_cache_.size() + 1); size_t idx = 0; for (auto& item : py_call_cache_) { - hash_data[idx++] = std::make_pair(item.first, trimPrefix(item.second.get_name())); + hash_data[idx++] = std::make_pair(item.first, trimPrefix(std::move(item.second.get_name()))); } for (auto& item : pyc_call_cache_) { hash_data[idx++] = std::make_pair(item.first, std::string(item.second.str())); @@ -573,7 +576,7 @@ void PythonTracer::recordCCall(TraceContext* ctx, PyFrameObject* frame, PyObject void PythonTracer::recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTag tag) { recordEvent(tag, EXIT_EVENT_HASH_ID); - + // record ctx to thread id map auto ctx_addr = reinterpret_cast(ctx); if (ctx_tid_map_.find(ctx_addr) == ctx_tid_map_.end()) { diff --git a/torch_npu/csrc/profiler/unwind/unwind.cpp b/torch_npu/csrc/profiler/unwind/unwind.cpp index 5c751394ceb5d5e9ad0151712217b393f724bdbc..f8fa9ded1babddfe59ba31cee4eb11d8bc672dff 100644 --- 
a/torch_npu/csrc/profiler/unwind/unwind.cpp +++ b/torch_npu/csrc/profiler/unwind/unwind.cpp @@ -1,3 +1,8 @@ +#include +#include +#include +#include + #include #include "unwind.h" @@ -6,9 +11,10 @@ namespace torch_npu::unwind { std::vector unwind() { - TORCH_CHECK( - false, - "record_context_cpp is not support on non-linux non-x86_64 platforms"); + const int size = 200; + void* buffer[size]; + int nptrs = backtrace(buffer, size); + return std::vector(buffer, buffer + nptrs); } c10::optional > libraryFor(void* addr) @@ -20,9 +26,34 @@ c10::optional > libraryFor(void* addr) std::vector symbolize(const std::vector& frames) { - TORCH_CHECK( - false, - "record_context_cpp is not support on non-linux non-x86_64 platforms"); + std::vector results; + for (const auto& addr : frames) { + Frame frame; + Dl_info info; + if (dladdr(addr, &info)) { + frame.filename = info.dli_fname ? info.dli_fname : "??"; + size_t last_pos = frame.filename.find_last_of('/'); + if (last_pos != std::string::npos) { + frame.filename = frame.filename.substr(last_pos + 1); + } + char* demangled = abi::__cxa_demangle(info.dli_sname, nullptr, nullptr, nullptr); + if (demangled) { + frame.funcname = demangled; + free(demangled); + } else { + frame.funcname = info.dli_sname ? info.dli_sname : "??"; + } + } else { + frame.filename = "??"; + frame.funcname = "??"; + } + if ((frame.filename == "python" && frame.filename.find("PyEval_EvalFrame") == std::string::npos) || + (frame.filename.find("libc.so") != std::string::npos)) { + frame.funcname = "__libc_start_main"; + } + results.push_back(frame); + } + return results; } Stats stats() diff --git a/torch_npu/csrc/profiler/unwind/unwind.h b/torch_npu/csrc/profiler/unwind/unwind.h index 45c39b8440ebe094bbc55aa0c1994e853734ece6..c3402977112898d8116a1286bcc6d591a9449a43 100644 --- a/torch_npu/csrc/profiler/unwind/unwind.h +++ b/torch_npu/csrc/profiler/unwind/unwind.h @@ -15,7 +15,7 @@ TORCH_NPU_API std::vector unwind(); struct Frame { std::string filename; std::string funcname; - uint64_t lineno; + uint64_t lineno = 0; }; // note: symbolize is really slow diff --git a/torch_npu/csrc/profiler/utils.cpp b/torch_npu/csrc/profiler/utils.cpp index 7e267193c8c44ac40b5a5508c9ead43378389795..bd2f1b1b21d93897ba0659514e73ca731b9e7839 100644 --- a/torch_npu/csrc/profiler/utils.cpp +++ b/torch_npu/csrc/profiler/utils.cpp @@ -27,22 +27,22 @@ static constexpr auto kMat2Size = "mat2_size"; bool NPURecordFunction::use_npu_simple = false; static bool validateInput( - const std::string& op_name, + const std::string &op_name, size_t min_size, c10::ArrayRef inputs, - const c10::ArrayRef& should_be_tensor) { - std::stringstream ss; + const c10::ArrayRef &should_be_tensor) +{ + std::stringstream ss; if (inputs.size() < min_size) { - ss << "Failed to save extra arguments for flops compuation of op " - << op_name << ", min size: " << min_size - << ", actual size: " << inputs.size(); + ss << "Failed to save extra arguments for flops compuation of op " << op_name << ", min size: " << min_size << + ", actual size: " << inputs.size(); TORCH_NPU_WARN(ss.str()); return false; } for (auto index : should_be_tensor) { if (!inputs[index].isTensor()) { - ss << "Failed to save extra arguments for flops compuation of op " - << op_name << ", input[" << index << "] must be a tensor."; + ss << "Failed to save extra arguments for flops compuation of op " << op_name << ", input[" << index << + "] must be a tensor."; TORCH_NPU_WARN(ss.str()); return false; } @@ -50,7 +50,8 @@ static bool validateInput( return true; } 
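The fallback `unwind()`/`symbolize()` implementation added in unwind.cpp above captures raw frame addresses with glibc's `backtrace()` and resolves them through `dladdr()` plus `abi::__cxa_demangle()`. The standalone sketch below shows the same capture-then-symbolize pattern outside the torch_npu tree; the function names and the `-rdynamic` build note are illustrative assumptions, not part of the patch. `dladdr()` only sees symbols exported from shared objects, which is why the patch falls back to `"??"` for frames it cannot resolve.

```cpp
// Minimal sketch of the capture-then-symbolize pattern used by the unwind
// fallback: backtrace() records raw return addresses, dladdr() maps each one
// to a shared object and symbol, and __cxa_demangle() makes C++ names
// readable. Build (assumption): g++ -rdynamic sketch.cpp -ldl
#include <execinfo.h>
#include <dlfcn.h>
#include <cxxabi.h>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

static std::vector<void*> capture_frames(int max_frames = 200)
{
    std::vector<void*> buffer(max_frames);
    int n = backtrace(buffer.data(), max_frames);   // raw program-counter values
    buffer.resize(n > 0 ? n : 0);
    return buffer;
}

static std::string symbolize_one(void* addr)
{
    Dl_info info;
    if (dladdr(addr, &info) == 0 || info.dli_sname == nullptr) {
        return "??";                                // static or stripped symbol
    }
    int status = 0;
    char* demangled = abi::__cxa_demangle(info.dli_sname, nullptr, nullptr, &status);
    std::string name = (status == 0 && demangled) ? demangled : info.dli_sname;
    free(demangled);                                // __cxa_demangle allocates with malloc
    return name;
}

int main()
{
    for (void* addr : capture_frames()) {
        std::printf("%p  %s\n", addr, symbolize_one(addr).c_str());
    }
    return 0;
}
```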
-std::unordered_map saveExtraArgs(const at::RecordFunction& fn) { +std::unordered_map saveExtraArgs(const at::RecordFunction &fn) +{ // for specific types of fn, return the saved extra args for computing flops std::unordered_map map; auto inputs = fn.inputs(); @@ -62,7 +63,7 @@ std::unordered_map saveExtraArgs(const at::RecordFunct } if (fname == kConv2dOp) { - std::vector tensors{0, 1}; + std::vector tensors{ 0, 1 }; bool check = validateInput(fname, kConv2dGroups + 1, inputs, tensors); if (!check) { return map; @@ -81,8 +82,8 @@ std::unordered_map saveExtraArgs(const at::RecordFunct map[kDilation] = inputs[kConv2dDilation]; map[kGroups] = inputs[kConv2dGroups]; } else if (fname == kGemmOp) { - std::vector tensors{0, 1}; - bool check = validateInput(fname, 2, inputs, tensors); + std::vector tensors{ 0, 1 }; + bool check = validateInput(fname, 2, inputs, tensors); if (!check) { return map; } @@ -92,7 +93,7 @@ std::unordered_map saveExtraArgs(const at::RecordFunct map[kMat1Size] = at::IValue(left.sizes()); map[kMat2Size] = at::IValue(right.sizes()); } else if (fname == kMulOp) { - std::vector tensors{0}; + std::vector tensors{ 0 }; bool check = validateInput(fname, 1, inputs, tensors); if (!check) { return map; @@ -101,7 +102,7 @@ std::unordered_map saveExtraArgs(const at::RecordFunct at::Tensor mat = inputs[0].toTensor(); map[kMatSize] = at::IValue(mat.sizes()); } else if (fname == kAddOp) { - std::vector tensors{0}; + std::vector tensors{ 0 }; bool check = validateInput(fname, 1, inputs, tensors); if (!check) { return map; @@ -114,145 +115,147 @@ std::unordered_map saveExtraArgs(const at::RecordFunct return map; } -uint64_t computeFlops(const std::string &op_name, const std::unordered_map &extra_args) { - if (op_name == kConv2dOp) { - if (extra_args.find(kInputSize) == extra_args.end() - || extra_args.find(kWeightSize) == extra_args.end() - || extra_args.find(kGroups) == extra_args.end() - || extra_args.find(kPadding) == extra_args.end() - || extra_args.find(kStride) == extra_args.end() - || extra_args.find(kDilation) == extra_args.end()) { - TORCH_NPU_WARN("Calculating flops for aten::conv2d requires groups, padding, stride, dilation, input_size, and weight_size in saved arguments."); - return 0; - } - auto input_sizes_ref = extra_args.at(kInputSize); - auto kernel_sizes_ref = extra_args.at(kWeightSize); - auto groups_ref = extra_args.at(kGroups); - auto padding_ref = extra_args.at(kPadding); - auto stride_ref = extra_args.at(kStride); - auto dilation_ref = extra_args.at(kDilation); - if (!input_sizes_ref.isIntList() || !kernel_sizes_ref.isIntList()) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because it requires input and weight tensor sizes."); - return 0; - } - if (!padding_ref.isIntList() || !stride_ref.isIntList() || !dilation_ref.isIntList()) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because it requires padding, stride, and dilation values."); - return 0; - } +uint64_t computeFlops(const std::string &op_name, const std::unordered_map &extra_args) +{ + if (op_name == kConv2dOp) { + if (extra_args.find(kInputSize) == extra_args.end() || extra_args.find(kWeightSize) == extra_args.end() || + extra_args.find(kGroups) == extra_args.end() || extra_args.find(kPadding) == extra_args.end() || + extra_args.find(kStride) == extra_args.end() || extra_args.find(kDilation) == extra_args.end()) { + TORCH_NPU_WARN("Calculating flops for aten::conv2d requires groups, padding, stride, dilation, input_size, " + "and weight_size in saved arguments."); + return 
0; + } + auto input_sizes_ref = extra_args.at(kInputSize); + auto kernel_sizes_ref = extra_args.at(kWeightSize); + auto groups_ref = extra_args.at(kGroups); + auto padding_ref = extra_args.at(kPadding); + auto stride_ref = extra_args.at(kStride); + auto dilation_ref = extra_args.at(kDilation); + if (!input_sizes_ref.isIntList() || !kernel_sizes_ref.isIntList()) { + TORCH_NPU_WARN( + "Failed to compute flops for op aten::conv2d because it requires input and weight tensor sizes."); + return 0; + } + if (!padding_ref.isIntList() || !stride_ref.isIntList() || !dilation_ref.isIntList()) { + TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because it requires padding, stride, and " + "dilation values."); + return 0; + } - const std::vector input_sizes = input_sizes_ref.toIntVector(); - const std::vector kernel_sizes = kernel_sizes_ref.toIntVector(); - const uint64_t groups = (uint64_t)groups_ref.toInt(); - const std::vector padding = padding_ref.toIntVector(); - const std::vector stride = stride_ref.toIntVector(); - const std::vector dilation = dilation_ref.toIntVector(); - if (input_sizes.size() != 4 || kernel_sizes.size() != 4) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because both input and weight must be size 4."); - return 0; - } - if (!groups) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because group size must not be 0."); - return 0; - } - if (padding.size() != 2 || dilation.size() != 2) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because both padding and dilation must be size 2."); - return 0; - } - if (stride.size() != 2 || (stride[0] * stride[1] == 0)) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because stride must be size 2 and cannot be 0."); - return 0; - } - // format of the input is defined in torch.nn.quantized.functional.conv2d() - uint64_t minibatch = 0; - uint64_t in_channels = 0; - uint64_t input_h = 0; - uint64_t input_w = 0; - uint64_t out_channels = 0; - uint64_t kernel_h = 0; - uint64_t kernel_w = 0; - const uint64_t conv2d_multiply_factor = 2; - std::tie(minibatch, in_channels, input_h, input_w) = std::make_tuple(input_sizes[0], input_sizes[1], - input_sizes[2], input_sizes[3]); - std::tie(out_channels, std::ignore, kernel_h, kernel_w) = std::make_tuple(kernel_sizes[0], kernel_sizes[1], - kernel_sizes[2], kernel_sizes[3]); - uint64_t output_h = (input_h + 2 * padding[0] - dilation[0] * (kernel_h - 1) - 1) / stride[0] + 1; - uint64_t output_w = (input_w + 2 * padding[1] - dilation[1] * (kernel_w - 1) - 1) / stride[1] + 1; - if (groups == 0) { - TORCH_CHECK(false, "groups can not be 0.", PTA_ERROR(ErrCode::VALUE)); - } - return conv2d_multiply_factor * minibatch * output_h * output_w * - kernel_h * kernel_w * in_channels * out_channels / groups; - } else if (op_name == kGemmOp) { - if (extra_args.find(kMat1Size) == extra_args.end() - || extra_args.find(kMat2Size) == extra_args.end()) { - TORCH_NPU_WARN("Calculating flops for aten::mm requires mat1_size and mat2_size in saved arguments."); - return 0; - } - auto mat1_sizes_ref = extra_args.at(kMat1Size); - auto mat2_sizes_ref = extra_args.at(kMat2Size); - if (!mat1_sizes_ref.isIntList() || !mat2_sizes_ref.isIntList()) { - TORCH_NPU_WARN("Failed to compute flops for op aten::mm because it requires mat1_size and mat2_size to be IntList."); - return 0; - } + const std::vector input_sizes = input_sizes_ref.toIntVector(); + const std::vector kernel_sizes = kernel_sizes_ref.toIntVector(); + const uint64_t groups = 
(uint64_t)groups_ref.toInt(); + const std::vector padding = padding_ref.toIntVector(); + const std::vector stride = stride_ref.toIntVector(); + const std::vector dilation = dilation_ref.toIntVector(); + if (input_sizes.size() != 4 || kernel_sizes.size() != 4) { + TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because both input and weight must be size 4."); + return 0; + } + if (!groups) { + TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because group size must not be 0."); + return 0; + } + if (padding.size() != 2 || dilation.size() != 2) { + TORCH_NPU_WARN( + "Failed to compute flops for op aten::conv2d because both padding and dilation must be size 2."); + return 0; + } + if (stride.size() != 2 || (stride[0] * stride[1] == 0)) { + TORCH_NPU_WARN( + "Failed to compute flops for op aten::conv2d because stride must be size 2 and cannot be 0."); + return 0; + } + // format of the input is defined in torch.nn.quantized.functional.conv2d() + uint64_t minibatch = 0; + uint64_t in_channels = 0; + uint64_t input_h = 0; + uint64_t input_w = 0; + uint64_t out_channels = 0; + uint64_t kernel_h = 0; + uint64_t kernel_w = 0; + const uint64_t conv2d_multiply_factor = 2; + std::tie(minibatch, in_channels, input_h, input_w) = + std::make_tuple(input_sizes[0], input_sizes[1], input_sizes[2], input_sizes[3]); + std::tie(out_channels, std::ignore, kernel_h, kernel_w) = + std::make_tuple(kernel_sizes[0], kernel_sizes[1], kernel_sizes[2], kernel_sizes[3]); + uint64_t output_h = (input_h + 2 * padding[0] - dilation[0] * (kernel_h - 1) - 1) / stride[0] + 1; + uint64_t output_w = (input_w + 2 * padding[1] - dilation[1] * (kernel_w - 1) - 1) / stride[1] + 1; + if (groups == 0) { + TORCH_CHECK(false, "groups can not be 0.", PTA_ERROR(ErrCode::VALUE)); + } + return conv2d_multiply_factor * minibatch * output_h * output_w * kernel_h * kernel_w * in_channels * + out_channels / groups; + } else if (op_name == kGemmOp) { + if (extra_args.find(kMat1Size) == extra_args.end() || extra_args.find(kMat2Size) == extra_args.end()) { + TORCH_NPU_WARN("Calculating flops for aten::mm requires mat1_size and mat2_size in saved arguments."); + return 0; + } + auto mat1_sizes_ref = extra_args.at(kMat1Size); + auto mat2_sizes_ref = extra_args.at(kMat2Size); + if (!mat1_sizes_ref.isIntList() || !mat2_sizes_ref.isIntList()) { + TORCH_NPU_WARN( + "Failed to compute flops for op aten::mm because it requires mat1_size and mat2_size to be IntList."); + return 0; + } - std::vector mat1_size = mat1_sizes_ref.toIntVector(); - std::vector mat2_size = mat2_sizes_ref.toIntVector(); - if (mat1_size.size() == 0) { - return 0; - } else { - int64_t overlap_dim = mat1_size.back(); - const uint64_t gemm_multiply_factor = 2; - uint64_t flops = 1; - for (int64_t dim : mat1_size) { - flops *= (uint64_t)dim; - } - if (overlap_dim == 0) { - TORCH_CHECK(false, "overlap_dim can not be 0.", PTA_ERROR(ErrCode::VALUE)); - } - flops /= (uint64_t)overlap_dim; - for (int64_t dim : mat2_size) { - flops *= (uint64_t)dim; - } - flops *= gemm_multiply_factor; - return flops; - } - } else if (op_name == kMulOp) { - if (extra_args.find(kMatSize) == extra_args.end()) { - TORCH_NPU_WARN("Calculating flops for aten::mul.Tensor requires mat_size in saved arguments."); - return 0; - } - auto mat_sizes = extra_args.at(kMatSize); - if (!mat_sizes.isIntList()) { - TORCH_NPU_WARN("Failed to compute flops for op aten::mul because it requires mat_size to be IntList."); - return 0; - } + std::vector mat1_size = mat1_sizes_ref.toIntVector(); + std::vector 
mat2_size = mat2_sizes_ref.toIntVector(); + if (mat1_size.size() == 0) { + return 0; + } else { + int64_t overlap_dim = mat1_size.back(); + const uint64_t gemm_multiply_factor = 2; + uint64_t flops = 1; + for (int64_t dim : mat1_size) { + flops *= (uint64_t)dim; + } + if (overlap_dim == 0) { + TORCH_CHECK(false, "overlap_dim can not be 0.", PTA_ERROR(ErrCode::VALUE)); + } + flops /= (uint64_t)overlap_dim; + for (int64_t dim : mat2_size) { + flops *= (uint64_t)dim; + } + flops *= gemm_multiply_factor; + return flops; + } + } else if (op_name == kMulOp) { + if (extra_args.find(kMatSize) == extra_args.end()) { + TORCH_NPU_WARN("Calculating flops for aten::mul.Tensor requires mat_size in saved arguments."); + return 0; + } + auto mat_sizes = extra_args.at(kMatSize); + if (!mat_sizes.isIntList()) { + TORCH_NPU_WARN("Failed to compute flops for op aten::mul because it requires mat_size to be IntList."); + return 0; + } - std::vector mat_size = mat_sizes.toIntVector(); - uint64_t flops = 1; - for (int64_t dim : mat_size) { - flops *= (uint64_t)dim; - } - return flops; - } else if (op_name == kAddOp) { - if (extra_args.find(kMatSize) == extra_args.end()) { - TORCH_NPU_WARN("Calculating flops for aten::add.Tensor requires mat_size in saved arguments."); - return 0; - } - auto mat_sizes = extra_args.at(kMatSize); - if (!mat_sizes.isIntList()) { - TORCH_NPU_WARN("Failed to compute flops for op aten::add because it requires mat_size to be IntList."); - return 0; - } + std::vector mat_size = mat_sizes.toIntVector(); + uint64_t flops = 1; + for (int64_t dim : mat_size) { + flops *= (uint64_t)dim; + } + return flops; + } else if (op_name == kAddOp) { + if (extra_args.find(kMatSize) == extra_args.end()) { + TORCH_NPU_WARN("Calculating flops for aten::add.Tensor requires mat_size in saved arguments."); + return 0; + } + auto mat_sizes = extra_args.at(kMatSize); + if (!mat_sizes.isIntList()) { + TORCH_NPU_WARN("Failed to compute flops for op aten::add because it requires mat_size to be IntList."); + return 0; + } - std::vector mat_size = mat_sizes.toIntVector(); - uint64_t flops = 1; - for (int64_t dim : mat_size) { - flops *= (uint64_t)dim; + std::vector mat_size = mat_sizes.toIntVector(); + uint64_t flops = 1; + for (int64_t dim : mat_size) { + flops *= (uint64_t)dim; + } + return flops; } - return flops; - } - return 0; + return 0; } - } } \ No newline at end of file diff --git a/torch_npu/csrc/profiler/utils.h b/torch_npu/csrc/profiler/utils.h index 4aa80d9efbf946d4f31c53400f8de3073d8e5c52..b423e4e1ceac8e619fea5c863fc60e1495648ab0 100644 --- a/torch_npu/csrc/profiler/utils.h +++ b/torch_npu/csrc/profiler/utils.h @@ -15,24 +15,25 @@ namespace profiler { std::unordered_map saveExtraArgs(const at::RecordFunction& fn); -uint64_t computeFlops(const std::string &op_name, - const std::unordered_map &extra_args); +uint64_t computeFlops(const std::string &op_name, const std::unordered_map &extra_args); class NPURecordFunction { public: - NPURecordFunction(bool enable_ = false) : enable(enable_) { - if (NPURecordFunction::use_npu_simple) { - at::enableRecordFunction(enable); + NPURecordFunction(bool enable_ = false) : enable(enable_) + { + if (NPURecordFunction::use_npu_simple) { + at::enableRecordFunction(enable); + } } - } - ~NPURecordFunction() { - if (NPURecordFunction::use_npu_simple) { - at::enableRecordFunction(!enable); + ~NPURecordFunction() + { + if (NPURecordFunction::use_npu_simple) { + at::enableRecordFunction(!enable); + } } - } - bool enable = false; - static bool use_npu_simple; + bool enable = 
false; + static bool use_npu_simple; }; inline THPCodeObjectPtr PyFrame_GetCode_NPU(PyFrameObject* frame) diff --git a/torch_npu/csrc/sanitizer/NPUTrace.h b/torch_npu/csrc/sanitizer/NPUTrace.h index ff62bbc0fb422cc218ee9144696f0de752c53154..0e8d0dc58794eb793e5b5690ad4986d3cc64552c 100644 --- a/torch_npu/csrc/sanitizer/NPUTrace.h +++ b/torch_npu/csrc/sanitizer/NPUTrace.h @@ -22,7 +22,7 @@ struct NPUTrace { } }; -TORCH_NPU_API void activateNPUTrace(const int); +C10_NPU_API void activateNPUTrace(const int); } // namespace impl } // namespace c10 diff --git a/torch_npu/csrc/toolkit/profiler/common/ring_buffer.h b/torch_npu/csrc/toolkit/profiler/common/ring_buffer.h index 1bbc2f33e388dca749dcf3d6b5e814a2823c7cf6..9ca4f2ae70568f1a8680a3e96ea6662919951893 100644 --- a/torch_npu/csrc/toolkit/profiler/common/ring_buffer.h +++ b/torch_npu/csrc/toolkit/profiler/common/ring_buffer.h @@ -12,28 +12,31 @@ namespace profiler { template class RingBuffer { public: - RingBuffer() - : is_inited_(false), - is_quit_(false), - read_index_(0), - write_index_(0), - idle_write_index_(0), - capacity_(0), - mask_(0), - cycles_exceed_cnt_(0), - full_cnt_(0) {} + RingBuffer() + : is_inited_(false), + is_quit_(false), + read_index_(0), + write_index_(0), + idle_write_index_(0), + capacity_(0), + mask_(0), + cycles_exceed_cnt_(0), + full_cnt_(0) + {} - ~RingBuffer() { - UnInit(); - } + ~RingBuffer() + { + UnInit(); + } - void Init(size_t capacity) { - capacity_ = capacity; - mask_ = capacity_ - 1; - data_queue_.resize(capacity); - is_inited_ = true; - is_quit_ = false; - } + void Init(size_t capacity) + { + capacity_ = capacity; + mask_ = capacity_ - 1; + data_queue_.resize(capacity); + is_inited_ = true; + is_quit_ = false; + } void UnInit() { @@ -59,49 +62,51 @@ public: } } - bool Push(T data) { - size_t curr_read_index = 0; - size_t curr_write_index = 0; - size_t next_write_index = 0; - size_t cycles = 0; - static const size_t cycle_limit = 1024; - do { - if (!is_inited_ || is_quit_) { - return false; - } - cycles++; - if (cycles >= cycle_limit) { - cycles_exceed_cnt_.fetch_add(1, std::memory_order_relaxed); - return false; - } - curr_read_index = read_index_.load(std::memory_order_relaxed); - curr_write_index = idle_write_index_.load(std::memory_order_relaxed); - next_write_index = curr_write_index + 1; - if ((next_write_index & mask_) == (curr_read_index & mask_)) { - full_cnt_.fetch_add(1, std::memory_order_relaxed); - return false; - } - } while (!idle_write_index_.compare_exchange_weak(curr_write_index, next_write_index)); - size_t index = curr_write_index & mask_; - data_queue_[index] = std::move(data); - write_index_++; - return true; - } - - bool Pop(T &data) { - if (!is_inited_) { - return false; + bool Push(T data) + { + size_t curr_read_index = 0; + size_t curr_write_index = 0; + size_t next_write_index = 0; + size_t cycles = 0; + static const size_t cycle_limit = 1024; + do { + if (!is_inited_ || is_quit_) { + return false; + } + cycles++; + if (cycles >= cycle_limit) { + cycles_exceed_cnt_.fetch_add(1, std::memory_order_relaxed); + return false; + } + curr_read_index = read_index_.load(std::memory_order_relaxed); + curr_write_index = idle_write_index_.load(std::memory_order_relaxed); + next_write_index = curr_write_index + 1; + if ((next_write_index & mask_) == (curr_read_index & mask_)) { + full_cnt_.fetch_add(1, std::memory_order_relaxed); + return false; + } + } while (!idle_write_index_.compare_exchange_weak(curr_write_index, next_write_index)); + size_t index = curr_write_index & mask_; + 
data_queue_[index] = std::move(data); + write_index_++; + return true; } - size_t curr_read_index = read_index_.load(std::memory_order_relaxed); - size_t curr_write_index = write_index_.load(std::memory_order_relaxed); - if ((curr_read_index & mask_) == (curr_write_index & mask_) && !is_quit_) { - return false; + + bool Pop(T &data) + { + if (!is_inited_) { + return false; + } + size_t curr_read_index = read_index_.load(std::memory_order_relaxed); + size_t curr_write_index = write_index_.load(std::memory_order_relaxed); + if ((curr_read_index & mask_) == (curr_write_index & mask_) && !is_quit_) { + return false; + } + size_t index = curr_read_index & mask_; + data = std::move(data_queue_[index]); + read_index_++; + return true; } - size_t index = curr_read_index & mask_; - data = std::move(data_queue_[index]); - read_index_++; - return true; - } size_t Size() { @@ -114,18 +119,18 @@ public: } private: - bool is_inited_; - volatile bool is_quit_; - std::atomic read_index_; - std::atomic write_index_; - std::atomic idle_write_index_; - size_t capacity_; - size_t mask_; - std::vector data_queue_; + bool is_inited_; + volatile bool is_quit_; + std::atomic read_index_; + std::atomic write_index_; + std::atomic idle_write_index_; + size_t capacity_; + size_t mask_; + std::vector data_queue_; - // Ringbuffer push failed info - std::atomic cycles_exceed_cnt_; - std::atomic full_cnt_; + // Ringbuffer push failed info + std::atomic cycles_exceed_cnt_; + std::atomic full_cnt_; }; } // profiler } // toolkit diff --git a/torch_npu/csrc/toolkit/profiler/common/singleton.h b/torch_npu/csrc/toolkit/profiler/common/singleton.h index 4997e534f0e20aeaee4d602395b8998e0bb9b9a3..982248b8023c5488bcffa270313f4a26bc2f1b08 100644 --- a/torch_npu/csrc/toolkit/profiler/common/singleton.h +++ b/torch_npu/csrc/toolkit/profiler/common/singleton.h @@ -8,21 +8,22 @@ namespace profiler { template class Singleton { public: - static T *GetInstance() noexcept(std::is_nothrow_constructible::value) { - static T instance; - return &instance; - } + static T *GetInstance() noexcept(std::is_nothrow_constructible::value) + { + static T instance; + return &instance; + } - virtual ~Singleton() = default; + virtual ~Singleton() = default; protected: - explicit Singleton() = default; + explicit Singleton() = default; private: - explicit Singleton(const Singleton &obj) = delete; - Singleton& operator=(const Singleton &obj) = delete; - explicit Singleton(Singleton &&obj) = delete; - Singleton& operator=(Singleton &&obj) = delete; + explicit Singleton(const Singleton &obj) = delete; + Singleton& operator=(const Singleton &obj) = delete; + explicit Singleton(Singleton &&obj) = delete; + Singleton& operator=(Singleton &&obj) = delete; }; } // profiler } // toolkit diff --git a/torch_npu/csrc/toolkit/profiler/common/thread.h b/torch_npu/csrc/toolkit/profiler/common/thread.h index 6db1b09d0a2a02d626691a1e642789ef76f67ded..b53e9d8fe80824691455b57e14afceeca4fdc802 100644 --- a/torch_npu/csrc/toolkit/profiler/common/thread.h +++ b/torch_npu/csrc/toolkit/profiler/common/thread.h @@ -9,57 +9,61 @@ namespace toolkit { namespace profiler { class Thread { public: - Thread() - : is_alive_(false), - pid_(0), - thread_name_("NPUProfiler") {}; + Thread() : is_alive_(false), pid_(0), thread_name_("NPUProfiler"){}; - ~Thread() { - if (is_alive_) { - (void)pthread_cancel(pid_); - (void)pthread_join(pid_, nullptr); + ~Thread() + { + if (is_alive_) { + (void)pthread_cancel(pid_); + (void)pthread_join(pid_, nullptr); + } } - } - void SetThreadName(const 
std::string &name) { - if (!name.empty()) { - thread_name_ = name; + void SetThreadName(const std::string &name) + { + if (!name.empty()) { + thread_name_ = name; + } } - } - std::string GetThreadName() { - return thread_name_; - } + std::string GetThreadName() + { + return thread_name_; + } - int Start() { - int ret = pthread_create(&pid_, nullptr, Execute, (void*)this); - is_alive_ = (ret == 0) ? true : false; - return ret; - } + int Start() + { + int ret = pthread_create(&pid_, nullptr, Execute, (void *)this); + is_alive_ = (ret == 0) ? true : false; + return ret; + } - int Stop() { - return Join(); - } + int Stop() + { + return Join(); + } - int Join() { - int ret = pthread_join(pid_, nullptr); - is_alive_ = (ret == 0) ? false : true; - return ret; - } + int Join() + { + int ret = pthread_join(pid_, nullptr); + is_alive_ = (ret == 0) ? false : true; + return ret; + } private: - static void* Execute(void *args) { - Thread *thr = (Thread *)args; - prctl(PR_SET_NAME, (unsigned long)thr->GetThreadName().data()); - thr->Run(); - return nullptr; - } - virtual void Run() = 0; + static void *Execute(void *args) + { + Thread *thr = (Thread *)args; + prctl(PR_SET_NAME, (unsigned long)thr->GetThreadName().data()); + thr->Run(); + return nullptr; + } + virtual void Run() = 0; private: - bool is_alive_; - pthread_t pid_; - std::string thread_name_; + bool is_alive_; + pthread_t pid_; + std::string thread_name_; }; } // profiler } // toolkit diff --git a/torch_npu/csrc/toolkit/profiler/common/utils.h b/torch_npu/csrc/toolkit/profiler/common/utils.h index f13d1c51c01459d0b00514935289c92520c417a2..aca4e3ee8c304b2f50dc29ed5fe87cc3b052e021 100644 --- a/torch_npu/csrc/toolkit/profiler/common/utils.h +++ b/torch_npu/csrc/toolkit/profiler/common/utils.h @@ -17,145 +17,157 @@ namespace toolkit { namespace profiler { class Utils { public: - static bool IsFileExist(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return false; + static bool IsFileExist(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return false; + } + return (access(path.c_str(), F_OK) == 0) ? true : false; } - return (access(path.c_str(), F_OK) == 0) ? true : false; - } - static bool IsFileWritable(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return false; + static bool IsFileWritable(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return false; + } + return (access(path.c_str(), W_OK) == 0) ? true : false; } - return (access(path.c_str(), W_OK) == 0) ? true : false; - } - static bool IsDir(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return false; - } - struct stat st = {0}; - int ret = lstat(path.c_str(), &st); - if (ret != 0) { - return false; + static bool IsDir(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return false; + } + struct stat st = {0}; + int ret = lstat(path.c_str(), &st); + if (ret != 0) { + return false; + } + return S_ISDIR(st.st_mode) ? true : false; } - return S_ISDIR(st.st_mode) ? true : false; - } - static bool CreateDir(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return false; - } - if (IsFileExist(path)) { - return IsDir(path) ? 
true : false; - } - size_t pos = 0; - while ((pos = path.find_first_of('/', pos)) != std::string::npos) { - std::string base_dir = path.substr(0, ++pos); - if (IsFileExist(base_dir)) { - if (IsDir(base_dir)) { - continue; - } else { - return false; + static bool CreateDir(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return false; + } + if (IsFileExist(path)) { + return IsDir(path) ? true : false; + } + size_t pos = 0; + while ((pos = path.find_first_of('/', pos)) != std::string::npos) { + std::string base_dir = path.substr(0, ++pos); + if (IsFileExist(base_dir)) { + if (IsDir(base_dir)) { + continue; + } else { + return false; + } + } + if (mkdir(base_dir.c_str(), 0750) != 0) { + return false; + } } - } - if (mkdir(base_dir.c_str(), 0750) != 0) { - return false; - } + return (mkdir(path.c_str(), 0750) == 0) ? true : false; } - return (mkdir(path.c_str(), 0750) == 0) ? true : false; - } - static std::string RealPath(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return ""; - } - char realPath[PATH_MAX] = {0}; - if (realpath(path.c_str(), realPath) == nullptr) { - return ""; + static std::string RealPath(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return ""; + } + char realPath[PATH_MAX] = {0}; + if (realpath(path.c_str(), realPath) == nullptr) { + return ""; + } + return std::string(realPath); } - return std::string(realPath); - } - static std::string RelativeToAbsPath(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return ""; + static std::string RelativeToAbsPath(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return ""; + } + if (path[0] != '/') { + char pwd_path[PATH_MAX] = {0}; + if (getcwd(pwd_path, PATH_MAX) != nullptr) { + return std::string(pwd_path) + "/" + path; + } + return ""; + } + return std::string(path); } - if (path[0] != '/') { - char pwd_path[PATH_MAX] = {0}; - if (getcwd(pwd_path, PATH_MAX) != nullptr) { - return std::string(pwd_path) + "/" + path; - } - return ""; + + static std::string DirName(const std::string &path) + { + if (path.empty()) { + return ""; + } + std::string temp_path = std::string(path.begin(), path.end()); + char *path_c = dirname(const_cast(temp_path.data())); + return path_c ? std::string(path_c) : ""; } - return std::string(path); - } - static std::string DirName(const std::string &path) { - if (path.empty()) { - return ""; + static uint64_t GetClockMonotonicRawNs() + { + struct timespec ts = {0}; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return static_cast(ts.tv_sec) * 1000000000 + static_cast(ts.tv_nsec); // 1000000000为秒转换为纳秒的倍数 } - std::string temp_path = std::string(path.begin(), path.end()); - char *path_c = dirname(const_cast(temp_path.data())); - return path_c ? 
std::string(path_c) : ""; - } - - static uint64_t GetClockMonotonicRawNs() { - struct timespec ts = {0}; - clock_gettime(CLOCK_MONOTONIC_RAW, &ts); - return static_cast(ts.tv_sec) * 1000000000 + static_cast(ts.tv_nsec); // 1000000000为秒转换为纳秒的倍数 - } - - static uint64_t getClockSyscnt() { - uint64_t cycles; + + static uint64_t getClockSyscnt() + { + uint64_t cycles; #if defined(__aarch64__) - asm volatile("mrs %0, cntvct_el0" : "=r"(cycles)); + asm volatile("mrs %0, cntvct_el0" : "=r"(cycles)); #elif defined(__x86_64__) - constexpr uint32_t uint32Bits = 32U; - uint32_t hi = 0; - uint32_t lo = 0; - __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); - cycles = (static_cast(lo)) | ((static_cast(hi)) << uint32Bits); + constexpr uint32_t uint32Bits = 32U; + uint32_t hi = 0; + uint32_t lo = 0; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + cycles = (static_cast(lo)) | ((static_cast(hi)) << uint32Bits); #elif defined(__arm__) - const uint32_t uint32Bits = 32U; - uint32_t hi = 0; - uint32_t lo = 0; - asm volatile("mrrc p15, 1, %0, %1, c14" : "=r"(lo), "=r"(hi)); - cycles = (static_cast(lo)) | ((static_cast(hi)) << uint32Bits); + const uint32_t uint32Bits = 32U; + uint32_t hi = 0; + uint32_t lo = 0; + asm volatile("mrrc p15, 1, %0, %1, c14" : "=r"(lo), "=r"(hi)); + cycles = (static_cast(lo)) | ((static_cast(hi)) << uint32Bits); #else - cycles = 0; + cycles = 0; #endif - return cycles; - } - - static uint64_t GetClockTime() { - static const bool isSupportSysCnt = at_npu::native::isSyscntEnable(); - if (isSupportSysCnt) { - return getClockSyscnt(); - } else { - return GetClockMonotonicRawNs(); + return cycles; } - } - static bool CreateFile(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX || !CreateDir(DirName(path))) { - return false; + static uint64_t GetClockTime() + { + static const bool isSupportSysCnt = at_npu::native::isSyscntEnable(); + if (isSupportSysCnt) { + return getClockSyscnt(); + } else { + return GetClockMonotonicRawNs(); + } } - int fd = creat(path.c_str(), S_IRUSR | S_IWUSR | S_IRGRP); - return (fd < 0 || close(fd) != 0) ? false : true; - } - static bool IsSoftLink(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX || !IsFileExist(path)) { - return false; + static bool CreateFile(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX || !CreateDir(DirName(path))) { + return false; + } + int fd = creat(path.c_str(), S_IRUSR | S_IWUSR | S_IRGRP); + return (fd < 0 || close(fd) != 0) ? 
false : true; } - struct stat st{}; - if (lstat(path.c_str(), &st) != 0) { - return false; + + static bool IsSoftLink(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX || !IsFileExist(path)) { + return false; + } + struct stat st{}; + if (lstat(path.c_str(), &st) != 0) { + return false; + } + return S_ISLNK(st.st_mode); } - return S_ISLNK(st.st_mode); - } static uint64_t GetTid() { diff --git a/torch_npu/csrc/toolkit/profiler/inc/data_dumper.h b/torch_npu/csrc/toolkit/profiler/inc/data_dumper.h index 46b0141aed0a566df290537a42fe884a2ade175f..6880980426052f8640d4e5f44ad22b93f059dcfc 100644 --- a/torch_npu/csrc/toolkit/profiler/inc/data_dumper.h +++ b/torch_npu/csrc/toolkit/profiler/inc/data_dumper.h @@ -19,26 +19,26 @@ constexpr uint32_t kNotifyInterval = 256; class DataDumper : public Thread { public: - explicit DataDumper(); - virtual ~DataDumper(); - void Init(const std::string &path, size_t capacity); - void UnInit(); - void Report(std::unique_ptr data); - void Start(); - void Stop(); + explicit DataDumper(); + virtual ~DataDumper(); + void Init(const std::string &path, size_t capacity); + void UnInit(); + void Report(std::unique_ptr data); + void Start(); + void Stop(); private: - void Flush(); - void Dump(const std::map> &dataMap); - void Run(); - void GatherAndDumpData(); + void Flush(); + void Dump(const std::map> &dataMap); + void Run(); + void GatherAndDumpData(); private: - std::string path_; - std::atomic start_; - std::atomic init_; - RingBuffer> data_chunk_buf_; - std::map fd_map_; + std::string path_; + std::atomic start_; + std::atomic init_; + RingBuffer> data_chunk_buf_; + std::map fd_map_; }; class TraceDataDumper : public Thread { @@ -58,17 +58,17 @@ private: void FlushTraceData(); void FlushHashData(); void FlushParamData(); - void Dump(const std::string& file_name, const std::vector& encode_data); + void Dump(const std::string &file_name, const std::vector &encode_data); void Run(); private: std::string path_; std::atomic start_; std::atomic init_; - std::unique_ptr trace_hash_data_{nullptr}; - std::unique_ptr param_data_{nullptr}; + std::unique_ptr trace_hash_data_{ nullptr }; + std::unique_ptr param_data_{ nullptr }; RingBuffer> trace_data_buf_; - std::map fd_map_; + std::map fd_map_; }; } // profiler } // toolkit diff --git a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h index 28eeea8ba9f8344bf9e6f145f39de26989d99a43..0eb9da1f357449d1bb66c6f346d33f82d0f2ceae 100644 --- a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h +++ b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h @@ -18,72 +18,78 @@ namespace toolkit { namespace profiler { template -std::string to_string(T value) { - std::ostringstream oss; - oss << value; - return oss.str(); +std::string to_string(T value) +{ + std::ostringstream oss; + oss << value; + return oss.str(); } template -void encodeFixedData(const std::vector &datas, std::vector &result) { - for (auto data : datas) { - for (size_t i = 0; i < sizeof(T); ++i) { - result.push_back((static_cast(data) >> (i * 8)) & 0xff); +void encodeFixedData(const std::vector &datas, std::vector &result) +{ + for (auto data : datas) { + for (size_t i = 0; i < sizeof(T); ++i) { + result.push_back((static_cast(data) >> (i * 8)) & 0xff); + } } - } } -inline void encodeStrData(uint16_t type, const std::string &data, std::vector &result) { - for (size_t i = 0; i < sizeof(uint16_t); ++i) { - result.push_back((type >> (i * 8)) & 0xff); - } - uint32_t length = data.size(); - for 
(size_t i = 0; i < sizeof(uint32_t); ++i) { - result.push_back((length >> (i * 8)) & 0xff); - } - for (const auto &c : data) { - result.push_back(c); - } +inline void encodeStrData(uint16_t type, const std::string &data, std::vector &result) +{ + for (size_t i = 0; i < sizeof(uint16_t); ++i) { + result.push_back((type >> (i * 8)) & 0xff); + } + uint32_t length = data.size(); + for (size_t i = 0; i < sizeof(uint32_t); ++i) { + result.push_back((length >> (i * 8)) & 0xff); + } + for (const auto &c : data) { + result.push_back(c); + } } -inline void encodeStrArrayData(uint16_t type, const std::vector &datas, std::vector &result) { - std::string rst; - for (auto str : datas) { - rst += (str + ";"); - } - if (!rst.empty()) { - rst.pop_back(); - } - encodeStrData(type, rst, result); +inline void encodeStrArrayData(uint16_t type, const std::vector &datas, std::vector &result) +{ + std::string rst; + for (auto str : datas) { + rst += (str + ";"); + } + if (!rst.empty()) { + rst.pop_back(); + } + encodeStrData(type, rst, result); } -inline void encodeMapData(uint16_t type, const std::unordered_map &datas, std::vector &result) { - std::string rst; - for (auto &entry : datas) { - rst += entry.first + ":" + to_string(entry.second) + ";"; - } - if (!rst.empty()) { - rst.pop_back(); - } - encodeStrData(type, rst, result); +inline void encodeMapData(uint16_t type, const std::unordered_map &datas, std::vector &result) +{ + std::string rst; + for (auto &entry : datas) { + rst += entry.first + ":" + to_string(entry.second) + ";"; + } + if (!rst.empty()) { + rst.pop_back(); + } + encodeStrData(type, rst, result); } template -void encode2DIntegerMatrixDatas(uint16_t type, std::vector> &datas, std::vector &result) { - std::string rst; - for (auto tensor : datas) { - std::stringstream ss; - copy(tensor.begin(), tensor.end(), std::ostream_iterator(ss, ",")); - std::string str = ss.str(); - if (!str.empty()) { - str.pop_back(); +void encode2DIntegerMatrixDatas(uint16_t type, std::vector> &datas, std::vector &result) +{ + std::string rst; + for (auto tensor : datas) { + std::stringstream ss; + copy(tensor.begin(), tensor.end(), std::ostream_iterator(ss, ",")); + std::string str = ss.str(); + if (!str.empty()) { + str.pop_back(); + } + rst += (str + ";"); + } + if (!rst.empty()) { + rst.pop_back(); } - rst += (str + ";"); - } - if (!rst.empty()) { - rst.pop_back(); - } - encodeStrData(type, rst, result); + encodeStrData(type, rst, result); } class WeakTensor { @@ -262,7 +268,7 @@ enum class OpRangeDataType { RESERVED = 30, }; -struct OpRangeData : BaseReportData{ +struct OpRangeData : BaseReportData { int64_t start_ns{0}; int64_t end_ns{0}; int64_t sequence_number{0}; @@ -341,6 +347,7 @@ struct MemoryData : BaseReportData { int64_t stream_ptr{0}; int8_t device_type{0}; int8_t device_index{0}; + uint8_t component_type{0}; uint8_t data_type{0}; uint8_t allocator_type{0}; uint64_t thread_id{0}; @@ -355,6 +362,7 @@ struct MemoryData : BaseReportData { int64_t stream_ptr, int8_t device_type, int8_t device_index, + uint8_t component_type, uint8_t data_type, uint8_t allocator_type, uint64_t thread_id, @@ -369,6 +377,7 @@ struct MemoryData : BaseReportData { stream_ptr(stream_ptr), device_type(device_type), device_index(device_index), + component_type(component_type), data_type(data_type), allocator_type(allocator_type), thread_id(thread_id), diff --git a/torch_npu/csrc/toolkit/profiler/src/data_dumper.cpp b/torch_npu/csrc/toolkit/profiler/src/data_dumper.cpp index 
69f0ad87b0e61ede6e3f87dddbbc3275de2b25df..44dd2c82ff01398e05d5019f069e4cc70bf8194e 100644 --- a/torch_npu/csrc/toolkit/profiler/src/data_dumper.cpp +++ b/torch_npu/csrc/toolkit/profiler/src/data_dumper.cpp @@ -18,17 +18,20 @@ DataDumper::DataDumper() start_(false), init_(false) {} -DataDumper::~DataDumper() { - UnInit(); +DataDumper::~DataDumper() +{ + UnInit(); } -void DataDumper::Init(const std::string &path, size_t capacity = kDefaultRingBuffer) { - path_ = path; - data_chunk_buf_.Init(capacity); - init_.store(true); +void DataDumper::Init(const std::string &path, size_t capacity = kDefaultRingBuffer) +{ + path_ = path; + data_chunk_buf_.Init(capacity); + init_.store(true); } -void DataDumper::UnInit() { +void DataDumper::UnInit() +{ if (init_.load()) { data_chunk_buf_.UnInit(); init_.store(false); @@ -43,22 +46,25 @@ void DataDumper::UnInit() { } } -void DataDumper::Start() { +void DataDumper::Start() +{ if (!init_.load() || Thread::Start() != 0) { return; } start_.store(true); } -void DataDumper::Stop() { - if (start_.load() == true) { - start_.store(false); - Thread::Stop(); - } - Flush(); +void DataDumper::Stop() +{ + if (start_.load() == true) { + start_.store(false); + Thread::Stop(); + } + Flush(); } -void DataDumper::GatherAndDumpData() { +void DataDumper::GatherAndDumpData() +{ std::map> dataMap; uint64_t batchSize = 0; while (batchSize < kBatchMaxLen) { @@ -85,7 +91,8 @@ void DataDumper::GatherAndDumpData() { } } -void DataDumper::Run() { +void DataDumper::Run() +{ for (;;) { if (!start_.load()) { break; @@ -98,10 +105,11 @@ void DataDumper::Run() { } } -void DataDumper::Flush() { - while (data_chunk_buf_.Size() != 0) { - GatherAndDumpData(); - } +void DataDumper::Flush() +{ + while (data_chunk_buf_.Size() != 0) { + GatherAndDumpData(); + } } void DataDumper::Report(std::unique_ptr data) diff --git a/torch_npu/csrc/toolkit/profiler/src/data_reporter.cpp b/torch_npu/csrc/toolkit/profiler/src/data_reporter.cpp index 669ee8a4d1947543d203b2f78139d844568c02e9..2cbce73a06f49a13d4ba10d76e999d66c5278e81 100644 --- a/torch_npu/csrc/toolkit/profiler/src/data_reporter.cpp +++ b/torch_npu/csrc/toolkit/profiler/src/data_reporter.cpp @@ -93,7 +93,7 @@ std::vector MemoryData::encode() total_reserved, total_active, stream_ptr}, result); encodeFixedData({device_type, device_index}, result); - encodeFixedData({data_type, allocator_type}, result); + encodeFixedData({component_type, data_type, allocator_type}, result); encodeFixedData({thread_id, process_id}, result); std::vector resultTLV; diff --git a/torch_npu/distributed/distributed_c10d.py b/torch_npu/distributed/distributed_c10d.py index 3d10a02457dddb3fba2e851de339b4ddc69bc648..73c77155c15e3ea7f6e75aaa6cb90304d083498a 100644 --- a/torch_npu/distributed/distributed_c10d.py +++ b/torch_npu/distributed/distributed_c10d.py @@ -1,11 +1,15 @@ __all__ = ["is_hccl_available", "reinit_process_group"] +import os from datetime import timedelta from typing import Optional +from functools import wraps import warnings +import logging import torch import torch.distributed as dist import torch.distributed.distributed_c10d as dist_c10d +from torch.distributed.elastic.rendezvous import RendezvousParameters from torch.distributed.distributed_c10d import _get_default_group, get_group_rank, _check_single_tensor, \ _check_tensor_list, _coalescing_manager, _ensure_all_tensors_same_dtype, get_rank, _rank_not_in_group, \ _warn_not_in_group, GatherOptions, _validate_output_list_for_rank, GroupMember, _get_group_size, \ @@ -15,6 +19,9 @@ from 
torch.distributed.distributed_c10d import _get_default_group, get_group_ran from torch_npu.utils._error_code import ErrCode, dist_error +logger = logging.getLogger("torch.distributed") + + def _batch_isend_irecv(p2p_op_list): group = p2p_op_list[0].group device = p2p_op_list[0].tensor.device @@ -156,7 +163,8 @@ def _gather_object(obj, object_gather_list=None, dst=0, group=None): group_size, dtype=torch.long, device=current_device ) object_size_list = [ - object_sizes_tensor[i].unsqueeze(dim=0) for i in range(group_size) + object_sizes_tensor[i].unsqueeze(dim=0) + for i in range(group_size) ] # Allgather tensor sizes. An all-gather is needed here despite this being a # gather, since each rank needs to broadcast a tensor of the same (maximal) @@ -231,6 +239,18 @@ def reinit_process_group(group=None, rebuild_link=True): return group +def _comm_switch_nic(ranks, useBackup): + nRanks = len(ranks) + npu_device = torch.device('npu') + rankid = int(os.environ['RANK']) + result = True + for pg in _pg_map: + if (npu_device in pg._device_types): + presult = pg._get_backend(npu_device)._set_switch_nic_comm(rankid, nRanks, ranks, useBackup) + if not presult: + result = False + return result + def _reduce_scatter_tensor_uneven(output, input, input_split_sizes=None, op=dist.ReduceOp.SUM, group=None, async_op=False): if _rank_not_in_group(group): @@ -282,6 +302,34 @@ def _all_gather_into_tensor_uneven(output, input, output_split_sizes=None, group return None +def _trigger__get_addr_and_port_decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + # Only supports obtaining the master_addr and master_port through the endpoint when the backend is static. + if len(args) > 0 and isinstance(args[0], RendezvousParameters) and args[0].backend == "parallel": + args[0].backend = "static" + master_addr, master_port = func(*args, **kwargs) + args[0].backend = "parallel" + return master_addr, master_port + else: + return func(*args, **kwargs) + return wrapper + + +def _trigger_rendezvous_decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + use_parallel = os.getenv("TORCH_NPU_USE_PARALLEL_TCPSTORE", "False") + if use_parallel == "True": + if len(args) > 0 and args[0] == "env://": + master_addr = os.getenv("MASTER_ADDR", "127.0.0.1") + master_port = os.getenv("MASTER_PORT", "29500") + args = (f"parallel://{master_addr}:{master_port}",) + args[1:] + logger.info(f"torch_npu_run change the rendezvous url from env:// to {args[0]}") + return func(*args, **kwargs) + return wrapper + + def _destructor_process_group(): _update_default_pg(None) _world.pg_map.clear() diff --git a/torch_npu/distributed/rendezvous.py b/torch_npu/distributed/rendezvous.py index 4aa6c7fb57b729e504c10f2cd0a48366eacefc6a..fda9427a1738f0494cc923904361bbf8cbbe02b8 100644 --- a/torch_npu/distributed/rendezvous.py +++ b/torch_npu/distributed/rendezvous.py @@ -196,6 +196,7 @@ def _create_parallel_handler(params: RendezvousParameters) -> RendezvousHandler: timeout = _default_timeout_seconds os.environ.setdefault("ENABLE_TIERED_PARALLEL_TCPSTORE", str(origin_args.enable_tiered_parallel_tcpstore)) os.environ.setdefault("TORCH_NPU_ELASTIC_USE_AGENT_STORE", str(True)) + os.environ.setdefault("TORCH_NPU_USE_PARALLEL_TCPSTORE", str(True)) enable_tiered = str(origin_args.enable_tiered_parallel_tcpstore).lower() == "true" agent_run = True agent_pid = os.getpid() diff --git a/torch_npu/distributed/run.py b/torch_npu/distributed/run.py index 4bb0a0c1775de1596212e8ff1e1cd98680620121..ee97ca10887758accf605e3658ac98867b82d726 100644 --- 
a/torch_npu/distributed/run.py +++ b/torch_npu/distributed/run.py @@ -228,6 +228,8 @@ def _main(args=None): args = parse_args(args) _apply_torch_npu_run_patch() args.rdzv_backend = 'parallel' + if not args.rdzv_endpoint: + args.rdzv_endpoint = f"{args.master_addr}:{args.master_port}" torch_run.run(args) diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index 0bd6788dd555109c48837f0c2bafa1b96e38d4a2..bd94890b27e32637643e927d41bc6f2a85eafe30 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -14,6 +14,7 @@ __all__ = [ "is_available", "device", "device_of", + "StreamContext", "stream", "set_stream", "current_stream", @@ -97,6 +98,7 @@ __all__ = [ "stop_device", "restart_device", "check_uce_in_memory", + "get_uce_addr", "config", "matmul", "conv", @@ -108,7 +110,12 @@ __all__ = [ "graph", "graph_pool_handle", "is_current_stream_capturing", - "make_graphed_callables" + "make_graphed_callables", + "ExternalEvent", + "graph_task_group_begin", + "graph_task_group_end", + "graph_task_update_begin", + "graph_task_update_end" ] from typing import Tuple, Union @@ -125,12 +132,12 @@ import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error, prof_error from .utils import (synchronize, device_count, can_device_access_peer, set_device, current_device, get_device_name, get_device_properties, get_device_capability, _get_device_index, - device, device_of, stream, set_stream, current_stream, default_stream, set_sync_debug_mode, + device, device_of, StreamContext, stream, set_stream, current_stream, default_stream, set_sync_debug_mode, get_sync_debug_mode, init_dump, current_blas_handle, is_bf16_supported, utilization, finalize_dump, set_dump, get_npu_overflow_flag, clear_npu_overflow_flag, mem_get_info, - check_uce_in_memory, stress_detect) + check_uce_in_memory, stress_detect, get_uce_addr) from ._recovery import restart_device, stop_device -from .streams import Stream, Event, SyncLaunchStream +from .streams import Stream, Event, SyncLaunchStream, ExternalEvent from .mstx import mstx from .npu_config import * # noqa: F403 from .autocast_utils import * # noqa: F403 @@ -144,6 +151,10 @@ from .graphs import ( graph_pool_handle, is_current_stream_capturing, make_graphed_callables, + graph_task_group_begin, + graph_task_group_end, + graph_task_update_begin, + graph_task_update_end, ) # init profiler @@ -330,6 +341,11 @@ def _lazy_new(cls, *args, **kwargs): return super(_NPUBase, cls).__new__(cls, *args, **kwargs) +def _comm_switch_nic(ranks, useBackup): + torch_npu.npu.synchronize() + return torch_npu.distributed.distributed_c10d._comm_switch_nic(ranks, useBackup) + + class _NPUBase: is_npu = True is_sparse = False diff --git a/torch_npu/npu/_graph_tree.py b/torch_npu/npu/_graph_tree.py index 59fa42a112a5b4610c49d9227a189188fd5b6d00..7ecabd355e122a2485b1e794c002d15192095f27 100644 --- a/torch_npu/npu/_graph_tree.py +++ b/torch_npu/npu/_graph_tree.py @@ -554,7 +554,9 @@ class NPUWarmupNode: # See: output_is_alias_of_persistent_static_inputs below. We should only be returning freshly created # storages in path_live_weakrefs. 
existing_path_data_ptrs = { - t.data_ptr() for t in self.path_live_weakrefs() if t() + t.data_ptr() + for t in self.path_live_weakrefs() + if t() } def get_non_npugraph_inps(): @@ -603,7 +605,9 @@ class NPUWarmupNode: if config.triton.slow_path_cudagraph_asserts and not self.already_warm: out_refs = self.path_live_weakrefs() new_storages = [ - t for t in out_refs if t.data_ptr() not in non_npugraph_inps + t + for t in out_refs + if t.data_ptr() not in non_npugraph_inps ] check_memory_pool(self.device_index, self.npu_graphs_pool, new_storages) @@ -734,10 +738,12 @@ class NPUGraphNode: # Path is a series of nodes from root to the current node self.outputs_weakrefs: OutputList[Optional[StorageWeakRefWrapper]] = [] self.path_weakrefs: LevelList[OutputList[Optional[StorageWeakRefWrapper]]] = [ - node.outputs_weakrefs for node in self._path_from_root + node.outputs_weakrefs + for node in self._path_from_root ] self.path_stacktraces: LevelList[StackTraces] = [ - node.stack_traces for node in self._path_from_root + node.stack_traces + for node in self._path_from_root ] self.tensor_weakrefs: OutputList[Optional[TensorWeakRef]] = [] @@ -1305,7 +1311,8 @@ class NPUGraphNode: ) -> Optional[PathOutputIndex]: for depth, output_refs in enumerate(self.path_weakrefs): for output_index, storage_ref in enumerate(output_refs): - if (storage_and_ptr := maybe_deref(storage_ref)) is not None: + storage_and_ptr = maybe_deref(storage_ref) + if storage_and_ptr is not None: storage, ptr = storage_and_ptr if ptr == t.untyped_storage().data_ptr(): return (depth, output_index) @@ -1378,7 +1385,8 @@ class NPUGraphNode: for output_idx, output_liveness in enumerate(outputs_liveness): # tensor can die early, but it can't be alive when it should be dead w = self.path_weakrefs[depth][output_idx] - if (stor_weak_ptr_and_data_ptr := maybe_deref(w)) is not None: + stor_weak_ptr_and_data_ptr = maybe_deref(w) + if stor_weak_ptr_and_data_ptr is not None: if output_liveness is None: raise RuntimeError("check output_liveness is not None fail") stor_weak_ptr, stor_data_ptr = stor_weak_ptr_and_data_ptr diff --git a/torch_npu/npu/_recovery.py b/torch_npu/npu/_recovery.py index e9238caa03b9e659313648b350375594442fb561..00d983d2cd74007c13fdb833263be4dcc2701701 100644 --- a/torch_npu/npu/_recovery.py +++ b/torch_npu/npu/_recovery.py @@ -21,7 +21,7 @@ def check_npu_tensor_is_safe(tensor_obj): if isinstance(tensor_obj, torch.Tensor): return check_npu_storage_is_safe(tensor_obj.untyped_storage()) else: - raise RuntimeError(f"param type should be Tensor, could not be {type(storage_obj)}" + pta_error(ErrCode.TYPE)) + raise RuntimeError(f"param type should be Tensor, could not be {type(tensor_obj)}" + pta_error(ErrCode.TYPE)) def mark_all_npu_tensor_unsafe(device: int): @@ -39,7 +39,7 @@ def update_npu_tensor_to_safe(tensor_obj): if isinstance(tensor_obj, torch.Tensor): return update_npu_storage_to_safe(tensor_obj.untyped_storage()) else: - raise RuntimeError(f"param type should be Tensor, could not be {type(storage_obj)}" + pta_error(ErrCode.TYPE)) + raise RuntimeError(f"param type should be Tensor, could not be {type(tensor_obj)}" + pta_error(ErrCode.TYPE)) def set_npu_tensor_unsafe_check_flag(flag: bool) -> None: @@ -72,9 +72,10 @@ def restart_device(device_id: int, rebuild_all_resources: int = False): def stop_device(device_id): torch_npu.npu._lazy_init() - torch_npu._C._npu_stopDevice(device_id) + result = torch_npu._C._npu_stopDevice(device_id) _except_handler.set_force_stop_exception(True) npu_device = torch.device('npu') for pg in 
_pg_map: if (npu_device in pg._device_types): pg._get_backend(npu_device).set_watchdog_status(WATCHDOG_STATUS_STOP) + return result diff --git a/torch_npu/npu/_sanitizer.py b/torch_npu/npu/_sanitizer.py index 04c3356add195d4a7d5111c6a44006169474a16b..b4a50a90e6f0b5ef596241d43a7520404e2d8aaf 100644 --- a/torch_npu/npu/_sanitizer.py +++ b/torch_npu/npu/_sanitizer.py @@ -1,13 +1,17 @@ import os import atexit +import torch import torch.cuda._sanitizer as csan +from packaging import version import torch_npu import torch_npu.utils._npu_trace as npu_trace import torch_npu.npu._stream_check as stream_check import torch_npu.npu._kernel_check as kernel_check from torch_npu.utils.utils import _print_warn_log +PTA_SUPPORT_DISPATCHMODE_VERSION = "2.3" + class SanitizerMode: STREAM = 0 @@ -110,7 +114,11 @@ class NPUSanitizer: def enable_npu_sanitizer(): - npu_sanitizer.enable() + current_pytorch_version = torch.__version__ + if version.parse(current_pytorch_version) < version.parse(PTA_SUPPORT_DISPATCHMODE_VERSION): + _print_warn_log(f"Current Pytorch's version {current_pytorch_version} doesn't support npu_sanitizer.") + else: + npu_sanitizer.enable() npu_sanitizer = NPUSanitizer() diff --git a/torch_npu/npu/amp/autocast_mode.py b/torch_npu/npu/amp/autocast_mode.py index 5de835f085b95d3eae9c229ef5300b65ae5fa55d..aece8cdee67979308c50a6ebd34483657883731b 100644 --- a/torch_npu/npu/amp/autocast_mode.py +++ b/torch_npu/npu/amp/autocast_mode.py @@ -108,6 +108,7 @@ def custom_fwd(fwd=None, **kwargs): @functools.wraps(fwd) def decorate_fwd(*args, **kwargs): + args[0]._dtype = torch.npu.get_autocast_dtype() if cast_inputs is None: args[0]._fwd_used_autocast = torch_npu._C.is_autocast_enabled() return fwd(*args, **kwargs) @@ -136,7 +137,7 @@ def custom_bwd(bwd): @functools.wraps(bwd) def decorate_bwd(*args, **kwargs): - with autocast(args[0]._fwd_used_autocast): + with autocast(args[0]._fwd_used_autocast, dtype=args[0]._dtype): return bwd(*args, **kwargs) return decorate_bwd diff --git a/torch_npu/npu/graphs.py b/torch_npu/npu/graphs.py index 1733da07c2c7e23dba06c603603d7b3a548fe1ec..80945bafaace754c6660eadaa6a70c2a134dc27c 100644 --- a/torch_npu/npu/graphs.py +++ b/torch_npu/npu/graphs.py @@ -1,8 +1,18 @@ +__all__ = ["is_current_stream_capturing", "graph_pool_handle", "graph_task_group_begin", + "graph_task_group_end", "graph_task_update_begin", "graph_task_update_end", + "NPUGraph", "graph", "make_graphed_callables"] + import gc +import re import typing +from copy import deepcopy +from dataclasses import dataclass, field +from typing import List, Dict, Any, Optional, Tuple import torch import torch_npu._C +from torch_npu._C import _weak_ref_tensor as TensorWeakRef +from torch_npu.utils._error_code import ErrCode, pta_error from .utils import _dummy_type from torch.utils._pytree import ( tree_flatten as _tree_flatten, @@ -16,11 +26,19 @@ if not hasattr(torch_npu._C, "_NPUStreamBase"): torch_npu._C.__dict__["_npu_isCurrentStreamCapturing"] = _dummy_type( "_npu_isCurrentStreamCapturing" ) + torch_npu._C.__dict__["_graph_task_group_begin"] = _dummy_type("_graph_task_group_begin") + torch_npu._C.__dict__["_graph_task_group_end"] = _dummy_type("_graph_task_group_end") + torch_npu._C.__dict__["_graph_task_update_begin"] = _dummy_type("_graph_task_update_begin") + torch_npu._C.__dict__["_graph_task_update_end"] = _dummy_type("_graph_task_update_end") from torch_npu._C import ( # noqa: F401 _npu_isCurrentStreamCapturing, _NPUGraph, _graph_pool_handle, + _graph_task_group_begin, + _graph_task_group_end, + 
_graph_task_update_begin, + _graph_task_update_end, ) @@ -44,6 +62,121 @@ def graph_pool_handle(): return _graph_pool_handle() +def graph_task_group_begin(stream): + _graph_task_group_begin(stream) + + +def graph_task_group_end(stream): + return _graph_task_group_end(stream) + + +def graph_task_update_begin(stream, handle): + _graph_task_update_begin(stream, handle) + + +def graph_task_update_end(stream): + _graph_task_update_end(stream) + + +@dataclass +class _GraphDispatchRecord: + """Stores the complete record of a single captured operation.""" + event: Any = None + handle: Any = None + kwargs: Dict[str, Any] = field(default_factory=dict) + args: Tuple[Any, ...] = field(default_factory=tuple) + op_cache_entry: Any = None + + +class _GraphDispatchMode(torch.utils._python_dispatch.TorchDispatchMode): + tensor_schema_name = {} + update_stream = None + + def __init__(self): + self.graph_dispatch_records = [] + if not self.update_stream: + self.update_stream = torch_npu.npu.Stream() + + @classmethod + def update_schema(cls, name, schema): + if name in cls.tensor_schema_name: + return + pattern = r'(?:Tensor\??\s*)(\w+)' + cls.tensor_schema_name[name] = re.findall(pattern, schema) + + def update_capture_record(self, cpu_update_input): + if len(cpu_update_input) == 1: + new_list = [cpu_update_input[0].copy() for _ in range(len(self.graph_dispatch_records))] + cpu_update_input = new_list + if len(self.graph_dispatch_records) != len(cpu_update_input): + raise RuntimeError(f"Currently, there are {len(self.graph_dispatch_records)} operators that need to be updated by capture, " + f"but there are only {len(cpu_update_input)} elements in the incoming cpu_update_input list", pta_error(ErrCode.PARAM)) + with torch.npu.stream(self.update_stream): + for graph_dispatch_record, update_input in zip(self.graph_dispatch_records, cpu_update_input): + graph_task_update_begin(self.update_stream, graph_dispatch_record.handle) + for key in update_input: + graph_dispatch_record.kwargs[key] = update_input[key] + graph_dispatch_record.op_cache_entry(*graph_dispatch_record.args, **graph_dispatch_record.kwargs) + graph_task_update_end(self.update_stream) + graph_dispatch_record.event.record(self.update_stream) + + def _append_dispatch_record(self, event, handle, args, kwargs, func): + args_ref = [] + for element in args: + if torch.is_tensor(element): + args_ref.append(TensorWeakRef(element)) + else: + args_ref.append(deepcopy(element)) + kwargs_ref = {} + for key, value in kwargs.items(): + if key == "out": + kwargs_ref[key] = [TensorWeakRef(value[0]), TensorWeakRef(value[1])] + elif key in self.tensor_schema_name[str(func.__name__)]: + kwargs_ref[key] = TensorWeakRef(value) + else: + kwargs_ref[key] = deepcopy(value) + return _GraphDispatchRecord(event=event, handle=handle, kwargs=kwargs_ref, args=tuple(args_ref), op_cache_entry=func) + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if func.__name__ == "npu_fused_infer_attention_score": + func_out = torch_npu.npu_fused_infer_attention_score.out + self.update_schema(str(func_out.__name__), str(func_out._schema)) + stream = torch_npu.npu.current_stream() + event = torch.npu.ExternalEvent() + event.wait(stream) + event.reset(stream) + # allocate the workspace and output tensors + workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(*args, **kwargs) + output = torch.empty_like(args[0]) + softmax_lse = torch.empty(1, dtype=args[0].dtype, device=args[0].device) + kwargs["workspace"] = workspace + kwargs["out"] = [output, softmax_lse] + # begin graph task +
graph_task_group_begin(stream) + func_out(*args, **kwargs) + handle = graph_task_group_end(stream) + # save state for update + self.graph_dispatch_records.append( + self._append_dispatch_record(event, handle, args, kwargs, func_out)) + return kwargs["out"] + elif func.__name__ == "npu_fused_infer_attention_score.out": + self.update_schema(str(func.__name__), str(func._schema)) + stream = torch_npu.npu.current_stream() + event = torch.npu.ExternalEvent() + event.wait(stream) + event.reset(stream) + # begin graph task + graph_task_group_begin(stream) + func(*args, **kwargs) + handle = graph_task_group_end(stream) + # save state for update + self.graph_dispatch_records.append( + self._append_dispatch_record(event, handle, args, kwargs, func)) + return kwargs["out"] + else: + return func(*args, **kwargs) + + # Python shim helps Sphinx process docstrings more reliably. class NPUGraph(torch_npu._C._NPUGraph): r"""Wrapper around a NPU graph. @@ -54,6 +187,11 @@ class NPUGraph(torch_npu._C._NPUGraph): def __new__(cls): return super().__new__(cls) + + def __init__(self): + self.graph_dispatch_mode = _GraphDispatchMode() + self.auto_dispatch_capture = False + return super().__init__() def capture_begin(self, pool=None, capture_error_mode="global"): r"""Begin capturing NPU work on the current stream. @@ -66,11 +204,11 @@ class NPUGraph(torch_npu._C._NPUGraph): pool (optional): Token (returned by :func:`~torch.npu.graph_pool_handle` or :meth:`other_Graph_instance.pool()`) that hints this graph may share memory with the indicated pool. See :ref:`Graph memory management`. - capture_error_mode (str, optional): specifies the aclmdlCaptureMode for the graph capture stream. + capture_error_mode (str, optional): specifies the aclmdlRICaptureMode for the graph capture stream. Can be "global", "thread_local" or "relaxed". During npu graph capture, some actions, such as npuMalloc, may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting - unless you're familiar with `aclmdlCaptureMode`_ + unless you're familiar with `aclmdlRICaptureMode`_ """ # noqa: B950 super().capture_begin(pool=pool, capture_error_mode=capture_error_mode) @@ -101,6 +239,12 @@ class NPUGraph(torch_npu._C._NPUGraph): """ return super().pool() + def update(self, cpu_update_input): + if not self.auto_dispatch_capture: + raise RuntimeError("The current graph configuration does not support update," + "Try to capture by setting auto_dispatch_capture=True during capture", pta_error(ErrCode.PARAM)) + self.graph_dispatch_mode.update_capture_record(cpu_update_input) + class graph: r"""Context-manager that captures NPU work into a :class:`torch.npu.NPUGraph` object for later replay. @@ -115,11 +259,11 @@ class graph: may share memory from the specified pool. See :ref:`Graph memory management`. stream (torch.npu.Stream, optional): If supplied, will be set as the current stream in the context. If not supplied, ``graph`` sets its own internal side stream as the current stream in the context. - capture_error_mode (str, optional): specifies the aclmdlCaptureMode for the graph capture stream. + capture_error_mode (str, optional): specifies the aclmdlRICaptureMode for the graph capture stream. Can be "global", "thread_local" or "relaxed". During npu graph capture, some actions, such as npuMalloc, may be unsafe. 
"global" will error on actions in other threads, "thread_local" will only error for actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting - unless you're familiar with `aclmdlCaptureMode`_ + unless you're familiar with `aclmdlRICaptureMode`_ .. note:: For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture @@ -136,6 +280,7 @@ class graph: npu_graph, pool=None, stream=None, + auto_dispatch_capture=False, capture_error_mode: str = "global", ): # Lazy-init of default_capture_stream helps avoid circular-import errors. @@ -153,6 +298,7 @@ class graph: self.stream_ctx = torch.npu.stream(self.capture_stream) self.npu_graph = npu_graph self.capture_error_mode = capture_error_mode + self.npu_graph.auto_dispatch_capture = auto_dispatch_capture def __enter__(self): # Free as much memory as we can for the graph @@ -162,13 +308,16 @@ class graph: # Stackoverflow seems comfortable with this pattern self.stream_ctx.__enter__() - + if self.npu_graph.auto_dispatch_capture: + self.npu_graph.graph_dispatch_mode.__enter__() self.npu_graph.capture_begin( *self.pool, capture_error_mode=self.capture_error_mode ) def __exit__(self, exc_type, exc_value, traceback): self.npu_graph.capture_end() + if self.npu_graph.auto_dispatch_capture: + self.npu_graph.graph_dispatch_mode.__exit__(exc_type, exc_value, traceback) self.stream_ctx.__exit__(exc_type, exc_value, traceback) # returning None should propagate exceptions from either capture_end or stream_ctx.__exit__() @@ -404,9 +553,9 @@ def make_graphed_callables( @staticmethod @torch.autograd.function.once_differentiable def backward(ctx, *grads): - if (len(grads) != len(static_grad_inputs)): + if (len(grads) != len(static_grad_outputs)): raise RuntimeError("The length of grads" - + " is not equal with the length of static_grad_inputs.") + + " is not equal with the length of static_grad_outputs.") for g, grad in zip(static_grad_outputs, grads): if g is not None: # don't copy if autograd gods have been kind and the diff --git a/torch_npu/npu/memory.py b/torch_npu/npu/memory.py index 20c1f5be88ac2875e5deb0da5acdb05b6c069643..4941aa9144d711cd79f2c5deff34f34476599b7e 100644 --- a/torch_npu/npu/memory.py +++ b/torch_npu/npu/memory.py @@ -762,10 +762,6 @@ def _record_memory_history_impl( max_entries: int = sys.maxsize, device=None, ): - if platform.machine() == "aarch64" and stacks == "all": - warnings.warn("Currently 'aarch64' does not support the display of c++ stacks, " \ - "changed to display only python.") - stacks = "python" torch_npu.npu._lazy_init() torch_npu._C._npu_record_memory_history(enabled, context, stacks, max_entries) diff --git a/torch_npu/npu/mstx.py b/torch_npu/npu/mstx.py index 8ae78319ae245271ae85b7c92163bed5219ac46f..49f3c2603e86b9478812fdc455a82a93ee025df6 100644 --- a/torch_npu/npu/mstx.py +++ b/torch_npu/npu/mstx.py @@ -37,44 +37,61 @@ def _no_exception_func(default_ret=None): class mstx: @staticmethod @_no_exception_func() - def mark(message=""): + def mark(message: str, stream=None, domain: str = 'default'): if not message or not isinstance(message, str): warnings.warn("Invalid message for mstx.mark func. Please input valid message string.") return - torch_npu._C._mark(message) + if not isinstance(domain, str): + warnings.warn("Invalid domain for mstx.mark func. 
Please input valid domain string.") + return + if stream: + if isinstance(stream, torch_npu.npu.streams.Stream): + stream = stream.npu_stream + torch_npu._C._mstx._mark(message, stream, domain) + else: + warnings.warn("Invalid stream for mstx.mark func. Please input valid stream.") + return + else: + torch_npu._C._mstx._mark_on_host(message, domain) @staticmethod @_no_exception_func() - def range_start(message: str, stream=None) -> int: + def range_start(message: str, stream=None, domain: str = 'default') -> int: if not message or not isinstance(message, str): warnings.warn("Invalid message for mstx.range_start func. Please input valid message string.") return 0 + if not domain or not isinstance(domain, str): + warnings.warn("Invalid domain for mstx.range_start func. Please input valid domain string.") + return 0 if stream: if isinstance(stream, torch_npu.npu.streams.Stream): stream = stream.npu_stream - return torch_npu._C._mstx._range_start(message, stream) + return torch_npu._C._mstx._range_start(message, stream, domain) else: warnings.warn("Invalid stream for mstx.range_start func. Please input valid stream.") return 0 else: - return torch_npu._C._mstx._range_start_on_host(message) + return torch_npu._C._mstx._range_start_on_host(message, domain) @staticmethod @_no_exception_func() - def range_end(range_id: int): + def range_end(range_id: int, domain: str = 'default'): if not isinstance(range_id, int): - warnings.warn("Invalid message for mstx.range_start func. Please input return value from mstx.range_start.") + warnings.warn("Invalid message for mstx.range_end func. Please input return value from mstx.range_start.") + return + if not domain or not isinstance(domain, str): + warnings.warn("Invalid domain for mstx.range_end func. Please input valid domain string.") return - torch_npu._C._mstx._range_end(range_id) + torch_npu._C._mstx._range_end(range_id, domain) @staticmethod @_no_exception_func() - def mstx_range(message: str, stream=None): + def mstx_range(message: str, stream=None, domain: str = 'default'): def wrapper(func): def inner(*args, **kargs): - range_id = mstx.range_start(message, stream) + range_id = mstx.range_start(message, stream, domain) ret = func(*args, **kargs) - mstx.range_end(range_id) + mstx.range_end(range_id, domain) return ret return inner return wrapper diff --git a/torch_npu/npu/streams.py b/torch_npu/npu/streams.py index b790ae2517be67b0f110a9ea5bcd8b384bb42529..c4ba551f670d36b6b09ce10fb06c7f94bf8efca4 100644 --- a/torch_npu/npu/streams.py +++ b/torch_npu/npu/streams.py @@ -3,7 +3,7 @@ import ctypes import torch_npu import torch_npu._C -__all__ = ["Stream", "Event", "SyncLaunchStream"] +__all__ = ["Stream", "Event", "SyncLaunchStream", "ExternalEvent"] class Stream(torch_npu._C._NPUStreamBase): @@ -132,7 +132,8 @@ class Event(torch_npu._C._NPUEventBase): """ def __new__(cls, enable_timing=False, blocking=False, interprocess=False): - return super(Event, cls).__new__(cls, enable_timing=enable_timing, blocking=blocking, interprocess=interprocess) + return super(Event, cls).__new__(cls, enable_timing=enable_timing, blocking=blocking, + interprocess=interprocess, graph_external=False) def record(self, stream=None): r"""Records the event in a given stream. @@ -196,6 +197,66 @@ class Event(torch_npu._C._NPUEventBase): return '' +class ExternalEvent(torch_npu._C._NPUEventBase): + r"""Wrapper around a NPU event with graph_external=True. + + The difference from torch.npu.Event is that you can call wait() before + record(). 
Before reusing ExternalEvent, you need to call reset() to clear + the flag. + + Event is captured in the graph as an external event node when performing + stream capture. + + The underlying NPU events are lazily initialized when the event is first + recorded or waited. + + """ + + def __new__(cls): + return super(ExternalEvent, cls).__new__(cls, enable_timing=False, blocking=False, + interprocess=False, graph_external=True) + + def record(self, stream=None): + r"""Records the event in a given stream. + + Uses ``torch_npu.npu.current_stream()`` if no stream is specified. The + stream's device must match the event's device. + """ + if stream is None: + stream = torch_npu.npu.current_stream() + super(ExternalEvent, self).record(stream) + + def wait(self, stream=None): + r"""Makes all future work submitted to the given stream wait for this + event. + + Uses ``torch_npu.npu.current_stream()`` if no stream is specified. + """ + if stream is None: + stream = torch_npu.npu.current_stream() + super(ExternalEvent, self).wait(stream) + + def reset(self, stream=None): + r"""Reset an event. + + Users need to make sure to wait for the tasks in the Stream + to complete before resetting the Event. + """ + if stream is None: + stream = torch_npu.npu.current_stream() + super(ExternalEvent, self).reset(stream) + + @property + def _as_parameter_(self): + return ctypes.c_void_p(self.npu_event) + + def __repr__(self): + if self.npu_event: + return '<torch_npu.npu.ExternalEvent {0:#x}>'.format(self._as_parameter_.value) + else: + return '<torch_npu.npu.ExternalEvent uninitialized>' + + class SyncLaunchStream(torch_npu._C._NPUStreamBase): r"""Wrapper around a SyncLaunch NPU stream. diff --git a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py index 40c2f65fbf98f652c7dde76efc4844468653468d..75f5b7b4a995523351ce194da420dcd796311497 100644 --- a/torch_npu/npu/utils.py +++ b/torch_npu/npu/utils.py @@ -1,5 +1,5 @@ import os -from typing import Any +from typing import Any, Optional, Union, List, cast from functools import lru_cache import warnings import contextlib @@ -15,11 +15,11 @@ from torch_npu.npu._backends import get_soc_version __all__ = ["synchronize", "device_count", "can_device_access_peer", "set_device", "current_device", "get_device_name", - "get_device_properties", "mem_get_info", "get_device_capability", "utilization", "device", "device_of", + "get_device_properties", "mem_get_info", "get_device_capability", "utilization", "device", "device_of", "StreamContext", "stream", "set_stream", "current_stream", "default_stream", "set_sync_debug_mode", "get_sync_debug_mode", "init_dump", "set_dump", "finalize_dump", "is_support_inf_nan", "is_bf16_supported", "get_npu_overflow_flag", "npu_check_overflow", "clear_npu_overflow_flag", "current_blas_handle", -
@@ -60,9 +60,82 @@ def synchronize(device=None): return torch_npu._C._npu_synchronize() +def _parse_visible_devices() -> Union[List[int], List[str]]: + r"""Parse ASCEND_RT_VISIBLE_DEVICES environment variable.""" + var = os.getenv("ASCEND_RT_VISIBLE_DEVICES") + if var is None: + return list(range(64)) + + rc: List[int] = [] + + if not var: + return rc + + # Multiple Device IDs are separated by ',' and cannot contain any other characters. + # If any other characters are included, only the Device IDs before them will be read + for idx, c in enumerate(var): + if not (c.isdigit() or c == ","): + break + if idx + 1 == len(var): + idx += 1 + + for elem in var[:idx].split(","): + if not elem: + return rc + x = int(elem) + rc.append(x) + + return rc + + +def _raw_device_count_ascend_hal() -> int: + r"""Return number of devices as reported by ascend_hal or negative value if ascend_hal discovery/initialization failed.""" + from ctypes import byref, c_int, CDLL + + ascend_hal_h = CDLL("libascend_hal.so") + + dev_count = c_int(-1) + rc = ascend_hal_h.drvGetDevNum(byref(dev_count)) + + + if rc != 0: + warnings.warn("Can't get ascend_hal device count") + return -1 + del ascend_hal_h + return dev_count.value + + +def _device_count_ascend_hal() -> int: + r"""Return number of devices as reported by ascend_hal taking ASCEND_RT_VISIBLE_DEVICES into account. + + Negative value is returned if ascend_hal discovery or initialization has failed. + """ + visible_devices = _parse_visible_devices() + if not visible_devices: + return 0 + try: + raw_cnt = _raw_device_count_ascend_hal() + if raw_cnt <= 0: + return raw_cnt + # Trim the list up to a maximum available device + for idx, val in enumerate(visible_devices): + # `rts` need ascending order + if idx > 0 and val <= visible_devices[idx - 1]: + return 0 + if cast(int, val) >= raw_cnt: + return idx + except OSError: + return -1 + except AttributeError: + return -1 + return len(visible_devices) + + @lru_cache(maxsize=1) -def device_count(): - return torch_npu._C._npu_getDeviceCount() +def device_count() -> int: + r"""Return the number of NPUs available.""" + ascend_hal_count = _device_count_ascend_hal() + return torch_npu._C._npu_getDeviceCount() if ascend_hal_count < 0 else ascend_hal_count def can_device_access_peer(device_id, peer_device_id): @@ -147,14 +220,15 @@ class device(object): def __enter__(self): if self.idx == -1: return - self.prev_idx = torch_npu._C._npu_getDevice() + self.prev_idx = torch_npu._C._npu_getDeviceWithoutSet() if self.prev_idx != self.idx: torch_npu._C._npu_setDevice(self.idx) torch_npu.npu._lazy_init() def __exit__(self, *args): - if self.prev_idx != self.idx: - torch_npu._C._npu_setDevice(self.prev_idx) + if self.prev_idx == -1: + self.prev_idx = 0 + self.idx = torch_npu._C._npu_maybeExchangeDevice(self.prev_idx) return False @@ -205,39 +279,73 @@ class device_of(device): super(device_of, self).__init__(idx) -@contextlib.contextmanager -def stream(stream): +class StreamContext: r"""Context-manager that selects a given stream. All NPU kernels queued within its context will be enqueued on a selected stream. + Args: + Stream (Stream): selected stream. This manager is a no-op if it's + ``None``. + .. note:: Streams are per-device. 
+ """ + cur_stream: Optional["torch_npu.npu.Stream"] + + def __init__(self, stream_ctx: Optional["torch_npu.npu.Stream"]): + self.stream = stream_ctx + self.idx = _get_device_index(None, True) + if not torch.jit.is_scripting(): + if self.idx is None: + self.idx = -1 + + self.src_prev_stream = ( + None if not torch.jit.is_scripting() else torch.npu.default_stream() + ) + self.dst_prev_stream = ( + None if not torch.jit.is_scripting() else torch.npu.default_stream() + ) + + def __enter__(self): + # Local cur_stream variable for type refinement + cur_stream = self.stream + # Return if stream is None or NPU device not available + if cur_stream is None or self.idx == -1: + return + self.src_prev_stream = torch.npu.current_stream() + + # If the stream is not on the current device, then + # set the current stream on the device + if self.src_prev_stream.device != cur_stream.device: + with device(cur_stream.device): + self.dst_prev_stream = torch.npu.current_stream(cur_stream.device) + torch.npu.set_stream(cur_stream) + + def __exit__(self, exec_type: Any, exec_value: Any, traceback: Any): + # Local cur_stream variable for type refinement + cur_stream = self.stream + # If stream is None or no NPU device available, return + if cur_stream is None or self.idx == -1: + return + + # Reset the stream on the original device + # and destination device + if self.src_prev_stream.device != cur_stream.device: # type: ignore[union-attr] + torch.npu.set_stream(self.dst_prev_stream) # type: ignore[arg-type] + torch.npu.set_stream(self.src_prev_stream) # type: ignore[arg-type] + + +def stream(stream): + r"""Wrap around the Context-manager StreamContext that selects a given stream. + Arguments: stream (Stream): selected stream. This manager is a no-op if it's ``None``. - .. note:: Streams are per-device. If the selected stream is not on the - current device, this function will also change the current device to - match the stream. + ..Note:: In eager mode stream is of type Stream class while in JIT it is + an object of the custom class ``torch.classes.npu.Stream``. """ - if stream is None: - yield - return - src_prev_stream = current_stream() - - if src_prev_stream.device != stream.device: - # The given stream is on a different device; have to restore the - # current_stream on that device on exit as well - with device(stream.device): - dst_prev_stream = current_stream() - - torch.npu.set_stream(stream) - try: - yield - finally: - if src_prev_stream.device != stream.device: - torch.npu.set_stream(dst_prev_stream) - torch.npu.set_stream(src_prev_stream) + return StreamContext(stream) def set_stream(stream): @@ -412,3 +520,41 @@ def current_blas_handle(): def check_uce_in_memory(device_id): torch_npu.npu._lazy_init() return torch_npu._C._npu_check_uce_in_memory(device_id) + + +def get_uce_addr(): + torch_npu.npu._lazy_init() + return torch_npu._C._npu_get_uce_addr() + + +def _erase_stream(tensor, stream): + r"""Remove the tags of the tensor that are used by this stream through the record_stream function. + + The memory can be reused between multiple streams. By default, the record_stream is used to mark the memory pool + to prevent the reused memory from being returned to the memory pool in advance. Each time the memory pool + applies for memory, it queries the event on the device to determine whether the operator has been executed and + can be safely released. However, the combination of host and device has a side effect. 
When the host is dispatched + much faster than the device, the peak memory usage may be increased because the device is not completely executed + when the host is querying. + + This api provides the erase_stream capability with memory pool. The memory can be returned in advance by actively + erasing and freeing the memory after the event wait. The subsequent operators must be executed after the event wait. + Therefore, the memory that is released back to the memory pool in advance will not be trampled by the subsequent operators. + + Args: + tensor(Tensor): The tensor whose tag needs to be removed. + stream(Stream): The tensor is marked in the stream and the tag needs to be removed in the current operation. + + Warning: + When the current api is in use, it must be used in conjunction with the event wait method. + Otherwise, memory trampling behavior may occur. + """ + + if not isinstance(tensor, torch.Tensor): + raise TypeError(f"tensor should be torch.Tensor, could not be {type(tensor)}" + pta_error(ErrCode.TYPE)) + if not isinstance(stream, torch_npu.npu.Stream): + raise TypeError(f"stream should be torch_npu.npu.Stream, could not be {type(stream)}" + pta_error(ErrCode.TYPE)) + torch_npu._C._npu_eraseStream(tensor=tensor, + stream_id=stream.stream_id, + device_index=stream.device_index, + device_type=stream.device_type) diff --git a/torch_npu/onnx/wrapper_onnx_ops.py b/torch_npu/onnx/wrapper_onnx_ops.py index 4b2348d87139d44a65f1c2aef751a53e65013ece..a261d1785911b154eae6ee0a18e82b236de76921 100644 --- a/torch_npu/onnx/wrapper_onnx_ops.py +++ b/torch_npu/onnx/wrapper_onnx_ops.py @@ -1006,6 +1006,8 @@ class _NPUMoeGatingTopKSoftmaxOP(torch.autograd.Function): x: torch.Tensor, finished: Optional[Tensor], k: int = 1): + if finished is None: + finished = g.op("npu::NPUMoeGatingTopKSoftmax", value_t=torch.tensor([]).to(torch.bool)) return g.op("npu::NPUMoeGatingTopKSoftmax", x, finished, k_i=k, outputs=3) diff --git a/torch_npu/profiler/__init__.py b/torch_npu/profiler/__init__.py index d31bd85ebb2cb8bd5c84008cd1c9159aca9aaa51..58ca46e2739eae10a9b790e20e4c06eae7a20555 100644 --- a/torch_npu/profiler/__init__.py +++ b/torch_npu/profiler/__init__.py @@ -8,12 +8,12 @@ from .profiler_interface import supported_activities from .scheduler import Schedule as schedule from .scheduler import ProfilerAction from .experimental_config import _ExperimentalConfig, supported_profiler_level, supported_ai_core_metrics, \ - supported_export_type, ProfilerLevel, AiCMetrics, ExportType + supported_export_type, ProfilerLevel, AiCMetrics, ExportType, HostSystem from ._non_intrusive_profile import _NonIntrusiveProfile __all__ = ["profile", "ProfilerActivity", "supported_activities", "tensorboard_trace_handler", "schedule", "ProfilerAction", "_ExperimentalConfig", "supported_profiler_level", "supported_ai_core_metrics", - "supported_export_type", "ProfilerLevel", "AiCMetrics", "ExportType"] + "supported_export_type", "ProfilerLevel", "AiCMetrics", "ExportType", "HostSystem"] _NonIntrusiveProfile.init() diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index 627f220393a4068fc86149e624f24adbebe71af4..5da94ae76334e5978f4da11c2313733369b52949 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -179,8 +179,8 @@ class ConfigContext: self.activity_set.add(activity) else: 
DynamicProfilerUtils.out_log("Set activity failed, activity must be CPU OR NPU!", - DynamicProfilerUtils.LoggerLevelEnum.WARNING) - + DynamicProfilerUtils.LoggerLevelEnum.WARNING) + def _parse_analysis(self, json_data: dict): if not self._is_dyno: self._analyse = json_data.get("analyse", False) @@ -188,7 +188,7 @@ class ConfigContext: self._analyse = json_data.get("PROFILE_ANALYSE", 'false') self._analyse = self.BOOL_MAP.get(self._analyse.lower(), False) - def _parse_dyno_exp_cfg(self, json_data: dict): + def _parse_dyno_exp_cfg(self, json_data: dict): profiler_level = json_data.get('PROFILE_PROFILER_LEVEL', 'Level0') profiler_level = getattr(ProfilerLevel, profiler_level, profiler_level) aic_metrics = json_data.get('PROFILE_AIC_METRICS', 'AiCoreNone') @@ -198,15 +198,22 @@ class ConfigContext: op_attr = json_data.get('PROFILE_OP_ATTR', 'false') op_attr = self.BOOL_MAP.get(op_attr.lower(), False) gc_detect_threshold = json_data.get('PROFILE_GC_DETECT_THRESHOLD', None) - if isinstance(gc_detect_threshold, str) and gc_detect_threshold != "None": - gc_detect_threshold = float(gc_detect_threshold) + if isinstance(gc_detect_threshold, str): + gc_detect_threshold = None if gc_detect_threshold == "None" else float(gc_detect_threshold) data_simplification = json_data.get('PROFILE_DATA_SIMPLIFICATION', 'true') data_simplification = self.BOOL_MAP.get(data_simplification.lower(), True) record_op_args = False export_type = json_data.get('PROFILE_EXPORT_TYPE', 'text').lower() msprof_tx = json_data.get('PROFILE_MSPROF_TX', 'false') msprof_tx = self.BOOL_MAP.get(msprof_tx.lower(), False) - + host_sys = DynamicProfilerUtils.parse_str_params_to_list(json_data.get('PROFILE_HOST_SYS', None)) + mstx_domain_include = DynamicProfilerUtils.parse_str_params_to_list(json_data.get('PROFILE_MSTX_DOMAIN_INCLUDE', None)) + mstx_domain_exclude = DynamicProfilerUtils.parse_str_params_to_list(json_data.get('PROFILE_MSTX_DOMAIN_EXCLUDE', None)) + sys_io = json_data.get('PROFILE_SYS_IO', 'false') + sys_io = self.BOOL_MAP.get(sys_io.lower(), False) + sys_interconnection = json_data.get('PROFILE_SYS_INTERCONNECTION', 'false') + sys_interconnection = self.BOOL_MAP.get(sys_interconnection.lower(), False) + self.experimental_config = _ExperimentalConfig( profiler_level=profiler_level, aic_metrics=aic_metrics, @@ -216,9 +223,14 @@ class ConfigContext: data_simplification=data_simplification, record_op_args=record_op_args, export_type=export_type, - msprof_tx=msprof_tx + msprof_tx=msprof_tx, + host_sys=host_sys, + mstx_domain_include=mstx_domain_include, + mstx_domain_exclude=mstx_domain_exclude, + sys_io=sys_io, + sys_interconnection=sys_interconnection, ) - + def _parse_cfg_json_exp_cfg(self, json_data: dict): exp_config = json_data.get('experimental_config') if not exp_config: @@ -235,6 +247,11 @@ class ConfigContext: record_op_args = exp_config.get('record_op_args', False) export_type = exp_config.get('export_type', 'text') msprof_tx = exp_config.get('msprof_tx', False) + mstx_domain_include = exp_config.get('mstx_domain_include', None) + mstx_domain_exclude = exp_config.get('mstx_domain_exclude', None) + host_sys = exp_config.get('host_sys', None) + sys_io = exp_config.get('sys_io', None) + sys_interconnection = exp_config.get('sys_interconnection', None) self.experimental_config = _ExperimentalConfig( profiler_level=profiler_level, @@ -245,7 +262,12 @@ class ConfigContext: data_simplification=data_simplification, record_op_args=record_op_args, export_type=export_type, - msprof_tx=msprof_tx + msprof_tx=msprof_tx, + 
mstx_domain_include=mstx_domain_include, + mstx_domain_exclude=mstx_domain_exclude, + host_sys=host_sys, + sys_io=sys_io, + sys_interconnection=sys_interconnection ) def _parse_exp_cfg(self, json_data: dict): @@ -312,7 +334,7 @@ class ConfigContext: return self._active def warmup(self) -> int: - if not isinstance(self._warmup, int) or self._warmup <= 0: + if not isinstance(self._warmup, int) or self._warmup < 0: DynamicProfilerUtils.out_log("Invalid parameter warmup, reset it to 0.", DynamicProfilerUtils.LoggerLevelEnum.WARNING) return self.DEFAULT_WARMUP diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py index a53c4568202a6b7e1d91b8f2e1be22682e3bffed..9ffbce78f79214940ec8f0a964f5abc04670cd7f 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py @@ -38,7 +38,12 @@ class DynamicProfilerShareMemory: "data_simplification": True, "record_op_args": False, "export_type": ["text"], - "msprof_tx": False + "msprof_tx": False, + "host_sys": [], + "mstx_domain_include": [], + "mstx_domain_exclude": [], + "sys_io": False, + "sys_interconnection": False } } @@ -180,8 +185,9 @@ class DynamicProfilerShareMemory: DynamicProfilerUtils.out_log("Rank {} unlink shm".format( self._rank_id), DynamicProfilerUtils.LoggerLevelEnum.INFO) except Exception as ex: - DynamicProfilerUtils.out_log("Rank {} unlink shm failed, may be removed, {} hs occur".format( - self._rank_id, str(ex)), DynamicProfilerUtils.LoggerLevelEnum.ERROR) + if self._rank_id != -1: + DynamicProfilerUtils.out_log("Rank {} unlink shm failed, may be removed, {} hs occur".format( + self._rank_id, str(ex)), DynamicProfilerUtils.LoggerLevelEnum.ERROR) self.shm = None def _clean_shm_py37(self): @@ -196,8 +202,9 @@ class DynamicProfilerShareMemory: DynamicProfilerUtils.out_log("Rank {} unlink shm".format( self._rank_id), DynamicProfilerUtils.LoggerLevelEnum.INFO) except Exception as ex: - DynamicProfilerUtils.out_log("Rank {} unlink shm failed, may be removed, {} has occur ".format( - self._rank_id, str(ex)), DynamicProfilerUtils.LoggerLevelEnum.ERROR) + if self._rank_id != -1: + DynamicProfilerUtils.out_log("Rank {} unlink shm failed, may be removed, {} has occur ".format( + self._rank_id, str(ex)), DynamicProfilerUtils.LoggerLevelEnum.ERROR) PathManager.remove_path_safety(os.path.dirname(self.shm_path)) self.shm = None diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_utils.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_utils.py index 984a5419184a7bde3d5bfe1cf7c3f8b72c732867..e15b5ddb5f00a415204156aad5f980e2b17acb75 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_utils.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_utils.py @@ -114,3 +114,9 @@ class DynamicProfilerUtils: res_dict = {} return res_dict + + @staticmethod + def parse_str_params_to_list(params): + if params is None or params == 'None': + return [] + return [item.strip() for item in params.split(',')] diff --git a/torch_npu/profiler/analysis/_profiler_config.py b/torch_npu/profiler/analysis/_profiler_config.py index 246e1ed31ffb68ecd48aa39db48feef012133a58..1e871f2fea0aba7203d63ee88fa89239467da89e 100644 --- a/torch_npu/profiler/analysis/_profiler_config.py +++ b/torch_npu/profiler/analysis/_profiler_config.py @@ -14,6 +14,10 @@ from .prof_bean._l2_cache_bean import L2CacheBean from .prof_bean._api_statistic_bean import 
ApiStatisticBean from .prof_bean._op_statistic_bean import OpStatisticBean from .prof_bean._npu_module_mem_bean import NpuModuleMemoryBean +from .prof_bean._nic_bean import NicBean +from .prof_bean._roce_bean import RoCEBean +from .prof_bean._pcie_bean import PcieBean +from .prof_bean._hccs_bean import HccsBean __all__ = [] @@ -48,6 +52,8 @@ class ProfilerConfig: self._is_cluster = False self._localtime_diff = 0 self._syscnt_enable = False + self._sys_io = False + self._sys_interconnection = False self._freq = 100.0 self._time_offset = 0 self._start_cnt = 0 @@ -162,12 +168,19 @@ class ProfilerConfig: self._op_attr = experimental_config.get(Constant.OP_ATTR, self._op_attr) self._data_simplification = experimental_config.get(Constant.DATA_SIMPLIFICATION, self._data_simplification) self._export_type = self._get_export_type_from_profiler_info(experimental_config) + self._sys_io = experimental_config.get(Constant.SYS_IO, self._sys_io) + self._sys_interconnection = experimental_config.get(Constant.SYS_INTERCONNECTION, self._sys_interconnection) def load_rank_info(self, info_json: dict): self._rank_id = info_json.get(Constant.RANK_ID, -1) def get_parser_bean(self): - return self.LEVEL_PARSER_CONFIG.get(self._profiler_level) + self._get_l2_cache_bean() + return ( + self.LEVEL_PARSER_CONFIG.get(self._profiler_level) + + self._get_l2_cache_bean() + + self._get_sys_io_bean() + + self._get_sys_interconnection_bean() + ) def get_prune_config(self): return self.LEVEL_TRACE_PRUNE_CONFIG.get(self._profiler_level) @@ -186,6 +199,12 @@ class ProfilerConfig: def _get_l2_cache_bean(self): return [(CANNDataEnum.L2_CACHE, L2CacheBean)] if self._l2_cache else [] + + def _get_sys_io_bean(self): + return [(CANNDataEnum.NIC, NicBean), (CANNDataEnum.ROCE, RoCEBean)] if self._sys_io else [] + + def _get_sys_interconnection_bean(self): + return [(CANNDataEnum.PCIE, PcieBean), (CANNDataEnum.HCCS, HccsBean)] if self._sys_interconnection else [] def _get_export_type_from_profiler_info(self, experimental_config: dict) -> list: export_type = experimental_config.get(Constant.EXPORT_TYPE, self._export_type) diff --git a/torch_npu/profiler/analysis/_profiling_parser.py b/torch_npu/profiler/analysis/_profiling_parser.py index 451b37fc82a583aa1c93d3589b622ba7708b2c25..4458d7559c445f63dc30b204acc74c2b925045bd 100644 --- a/torch_npu/profiler/analysis/_profiling_parser.py +++ b/torch_npu/profiler/analysis/_profiling_parser.py @@ -26,8 +26,7 @@ class ProfilingParser: self._output_path = os.path.join(profiler_path, Constant.OUTPUT_DIR) PathManager.remove_path_safety(self._output_path) PathManager.make_dir_safety(self._output_path) - ProfilerLogger.init(self._profiler_path, "ProfilingParser") - self.logger = ProfilerLogger.get_instance() + self.logger = None @staticmethod def simplify_data(profiler_path: str, simplify_flag: bool): @@ -58,6 +57,9 @@ class ProfilingParser: PathManager.remove_file_safety(file_path) def update_export_type(self): + export_type = self._kwargs.get('export_type', None) + if export_type is not None: + ProfilerConfig().export_type = export_type if Constant.Db not in ProfilerConfig().export_type: return if self._analysis_type == Constant.EXPORT_CHROME_TRACE or self._analysis_type == Constant.EXPORT_STACK: @@ -81,6 +83,8 @@ class ProfilingParser: PathManager.remove_file_safety(os.path.join(cann_path, filename)) def analyse_profiling_data(self): + ProfilerLogger.init(self._profiler_path, "ProfilingParser") + self.logger = ProfilerLogger.get_instance() print_info_msg(f"Start parsing profiling data: 
{self._profiler_path}") ProfilerConfig().load_info(self._profiler_path) self.update_export_type() @@ -99,6 +103,7 @@ class ProfilingParser: param_dict = {"profiler_path": self._profiler_path, "output_path": self._output_path} if self._kwargs: param_dict.update(self._kwargs) + param_dict[Constant.EXPORT_TYPE] = ProfilerConfig().export_type parser_config = ParserConfig.ONLY_FWK_CONFIG if ProfilerPathManager.get_cann_path(self._profiler_path): diff --git a/torch_npu/profiler/analysis/prof_bean/_common_bean.py b/torch_npu/profiler/analysis/prof_bean/_common_bean.py index 3ac6513a93fd4a5f654caed7f24c29196f07d793..de46b5d0a4aa5f0809dd631146c0a07767f0b32a 100644 --- a/torch_npu/profiler/analysis/prof_bean/_common_bean.py +++ b/torch_npu/profiler/analysis/prof_bean/_common_bean.py @@ -13,12 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -FILTER_COL_LIST = ["Device_id"] - - class CommonBean: def __init__(self, data: dict): - for col in FILTER_COL_LIST: - if col in data: - data.pop(col) self._data = data diff --git a/torch_npu/profiler/analysis/prof_bean/_event_bean.py b/torch_npu/profiler/analysis/prof_bean/_event_bean.py index b03e7a58c4c91beb1fad19624e5fa8cf486e0630..d97cae4acb5c916d9287f18366d77936ec88569b 100644 --- a/torch_npu/profiler/analysis/prof_bean/_event_bean.py +++ b/torch_npu/profiler/analysis/prof_bean/_event_bean.py @@ -10,6 +10,7 @@ class EventBean: def __init__(self, data: dict): self._origin_data = data + self.device_id = -1 @property def ts(self) -> int: diff --git a/torch_npu/profiler/analysis/prof_bean/_hccs_bean.py b/torch_npu/profiler/analysis/prof_bean/_hccs_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..7b3a99eb9c0403237a0c0e7bc4d5100c2f0b9082 --- /dev/null +++ b/torch_npu/profiler/analysis/prof_bean/_hccs_bean.py @@ -0,0 +1,17 @@ +__all__ = [] + +from ._common_bean import CommonBean + + +class HccsBean(CommonBean): + + def __init__(self, data: dict): + super().__init__(data) + + @property + def row(self) -> list: + return list(self._data.values()) + + @property + def headers(self) -> list: + return list(self._data.keys()) diff --git a/torch_npu/profiler/analysis/prof_bean/_memory_use_bean.py b/torch_npu/profiler/analysis/prof_bean/_memory_use_bean.py index 4e29f204e3c19f0ae6b829148d0bd20dc85e1709..0385af8d79ff38e5715a48582890ac8d8cc5cb0f 100644 --- a/torch_npu/profiler/analysis/prof_bean/_memory_use_bean.py +++ b/torch_npu/profiler/analysis/prof_bean/_memory_use_bean.py @@ -19,14 +19,15 @@ class MemoryEnum(Enum): STREAM_PTR = 6 DEVICE_TYPE = 7 DEVICE_INDEX = 8 - DATA_TYPE = 9 - ALLOCATOR_TYPE = 10 - THREAD_ID = 11 - PROCESS_ID = 12 + COMPONENT_TYPE = 9 + DATA_TYPE = 10 + ALLOCATOR_TYPE = 11 + THREAD_ID = 12 + PROCESS_ID = 13 class MemoryUseBean(CommonBean): - CONSTANT_STRUCT = "<7q2b2B2Q" + CONSTANT_STRUCT = "<7q2b3B2Q" NPU_ID = 20 CPU_ID = 0 INNER_ALLOCATOR = 0 @@ -88,6 +89,10 @@ class MemoryUseBean(CommonBean): def device_index(self) -> int: return int(self._constant_data[MemoryEnum.DEVICE_INDEX.value]) + @property + def component_type(self) -> int: + return int(self._constant_data[MemoryEnum.COMPONENT_TYPE.value]) + @property def data_type(self) -> int: return int(self._constant_data[MemoryEnum.DATA_TYPE.value]) diff --git a/torch_npu/profiler/analysis/prof_bean/_nic_bean.py b/torch_npu/profiler/analysis/prof_bean/_nic_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..1b9be706b0355ae8dbbfc030b54bf815933ceb69 --- /dev/null +++ 
b/torch_npu/profiler/analysis/prof_bean/_nic_bean.py @@ -0,0 +1,17 @@ +__all__ = [] + +from ._common_bean import CommonBean + + +class NicBean(CommonBean): + + def __init__(self, data: dict): + super().__init__(data) + + @property + def row(self) -> list: + return list(self._data.values()) + + @property + def headers(self) -> list: + return list(self._data.keys()) diff --git a/torch_npu/profiler/analysis/prof_bean/_npu_module_mem_bean.py b/torch_npu/profiler/analysis/prof_bean/_npu_module_mem_bean.py index 872b23e26ffd0c181124717f01ee9e20cca29f81..3183cd737c4cec538aad2820ce50be1dcfe1e7ae 100644 --- a/torch_npu/profiler/analysis/prof_bean/_npu_module_mem_bean.py +++ b/torch_npu/profiler/analysis/prof_bean/_npu_module_mem_bean.py @@ -5,7 +5,7 @@ __all__ = [] class NpuModuleMemoryBean(CommonBean): - SHOW_HEADERS = ["Component", "Timestamp(us)", "Total Reserved(MB)", "Device"] + SHOW_HEADERS = ["Device_id", "Component", "Timestamp(us)", "Total Reserved(MB)", "Device"] def __init__(self, data: dict): super().__init__(data) diff --git a/torch_npu/profiler/analysis/prof_bean/_pcie_bean.py b/torch_npu/profiler/analysis/prof_bean/_pcie_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..9a5a335302e144ca7721ad7d266d3bc485d1a6cd --- /dev/null +++ b/torch_npu/profiler/analysis/prof_bean/_pcie_bean.py @@ -0,0 +1,17 @@ +__all__ = [] + +from ._common_bean import CommonBean + + +class PcieBean(CommonBean): + + def __init__(self, data: dict): + super().__init__(data) + + @property + def row(self) -> list: + return list(self._data.values()) + + @property + def headers(self) -> list: + return list(self._data.keys()) diff --git a/torch_npu/profiler/analysis/prof_bean/_roce_bean.py b/torch_npu/profiler/analysis/prof_bean/_roce_bean.py new file mode 100644 index 0000000000000000000000000000000000000000..065202a359e1b967445b0ab8b799a8743605342a --- /dev/null +++ b/torch_npu/profiler/analysis/prof_bean/_roce_bean.py @@ -0,0 +1,17 @@ +__all__ = [] + +from ._common_bean import CommonBean + + +class RoCEBean(CommonBean): + + def __init__(self, data: dict): + super().__init__(data) + + @property + def row(self) -> list: + return list(self._data.values()) + + @property + def headers(self) -> list: + return list(self._data.keys()) diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index 1ce2c1fbbde2a3f31b0860520e36548e0bc39e67..56809c9b7f65be2479f7dd1e9d63e068940c1eab 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -81,6 +81,7 @@ class Constant(object): GE = "GE" APP = "APP" PTA_GE = "PTA+GE" + WORKSPACE = "WORKSPACE" B_TO_KB = 1024.0 KB_TO_MB = 1024.0 B_TO_MB = 1024.0 ** 2 @@ -93,6 +94,8 @@ class Constant(object): MEMORY_MALLOC = 0 MEMORY_FREE = 1 MEMORY_BLOCK_FREE = 2 + CACHING_TYPE = 0 + WORKSPACE_TYPE = 1 # profiler config CONFIG = "config" @@ -106,6 +109,8 @@ class Constant(object): MSPROF_TX = '_msprof_tx' OP_ATTR = "_op_attr" DATA_SIMPLIFICATION = '_data_simplification' + SYS_IO = '_sys_io' + SYS_INTERCONNECTION = '_sys_interconnection' EXPORT_TYPE = '_export_type' LEVEL0 = "Level0" LEVEL1 = "Level1" @@ -122,6 +127,11 @@ class Constant(object): AicMetricsNone = "ACL_AICORE_NONE" Db = "db" Text = "text" + CPU = "cpu" + MEM = "mem" + DISK = "disk" + NETWORK = "network" + OSRT = "osrt" # profiler end info END_INFO = "end_info" @@ -433,6 +443,7 @@ class TableColumnsManager(): ("type", Constant.SQL_TEXT_TYPE) ], 
DbConstant.TABLE_STEP_TRACE_TIME : [ + ("deviceId", Constant.SQL_INTEGER_TYPE), ("step", Constant.SQL_TEXT_TYPE), ("computing", Constant.SQL_NUMERIC_TYPE), ("communication_not_overlapped", Constant.SQL_NUMERIC_TYPE), diff --git a/torch_npu/profiler/analysis/prof_common_func/_csv_headers.py b/torch_npu/profiler/analysis/prof_common_func/_csv_headers.py index 09214e6cbe8dfaeff8e5d8beb676b5142b7ebda2..a98312d62ce39ce8a6784adb9322bf845dd32d07 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_csv_headers.py +++ b/torch_npu/profiler/analysis/prof_common_func/_csv_headers.py @@ -1,7 +1,7 @@ class CsvHeaders(object): # op_summary TASK_START_TIME = "Task Start Time(us)" - OP_SUMMARY_SHOW_HEADERS = ["Op Name", "OP Type", "Task Type", TASK_START_TIME, "Task Duration(us)", + OP_SUMMARY_SHOW_HEADERS = ["Device_id", "Op Name", "OP Type", "Task Type", TASK_START_TIME, "Task Duration(us)", "Task Wait Time(us)", "Block Dim"] - OP_SUMMARY_KERNEL_BASE_HEADERS = ["Name", "Type", "Accelerator Core", "Start Time(us)", "Duration(us)", + OP_SUMMARY_KERNEL_BASE_HEADERS = ["Device_id", "Name", "Type", "Accelerator Core", "Start Time(us)", "Duration(us)", "Wait Time(us)", "Block Dim"] diff --git a/torch_npu/profiler/analysis/prof_common_func/_log.py b/torch_npu/profiler/analysis/prof_common_func/_log.py index 15ba7a80f9d10ed74e1e26a4a5be4ab9190b7ef0..eba5db1af7f74910d1afd3a1fcf47bfb2a928098 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_log.py +++ b/torch_npu/profiler/analysis/prof_common_func/_log.py @@ -34,6 +34,7 @@ class ProfilerLogger: BACKUP_COUNT = 3 # logger instance _instance = None + _pid = None @classmethod def get_instance(cls) -> logging.Logger: @@ -54,7 +55,9 @@ class ProfilerLogger: RuntimeError: If logger initialization fails """ if cls._instance is not None: - return + if cls._pid == os.getpid(): + return + cls.destroy() # Create logs directory log_dir = os.path.join(output_dir, cls.DEFAULT_LOG_DIR) @@ -89,6 +92,7 @@ class ProfilerLogger: logger.addHandler(file_handler) cls._instance = logger + cls._pid = os.getpid() logger.info("Profiler logger initialized at: %s", log_file) @classmethod @@ -106,9 +110,21 @@ class ProfilerLogger: @classmethod def destroy(cls) -> None: - """Close and cleanup the logger.""" + """ + Close and cleanup the logger. + To avoid the deadlock problem caused by directly calling close on handler in multi-process scenarios, close the + file descriptor manually. 
+ """ if cls._instance: for handler in cls._instance.handlers[:]: - handler.close() cls._instance.removeHandler(handler) + if cls._pid == os.getpid(): + handler.close() + else: + try: + if hasattr(handler.stream, 'fileno'): + fileno = handler.stream.fileno() + os.close(fileno) + except (OSError, AttributeError, ValueError): + logging.warning("Close profiler logger handler stream failed.") cls._instance = None diff --git a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py index b52763aa4ec7014fa602c5ab6c386aaf38f6fb5f..1753752ae07fe1fc7bec26fa02557d4c6940b2bf 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py @@ -7,6 +7,7 @@ import multiprocessing import fcntl import pickle import signal +import stat from enum import Enum from abc import ABC, abstractmethod from torch_npu.utils._error_code import ErrCode, prof_error @@ -287,9 +288,19 @@ class ConcurrentTasksManager: if self.epoll is None: self.epoll = select.epoll() pr, pipe_write = os.pipe() - # 读管道设为非阻塞 - flags = fcntl.fcntl(pr, fcntl.F_GETFL) - fcntl.fcntl(pr, fcntl.F_SETFL, flags | os.O_NONBLOCK) + + try: + # 设置读管道为非阻塞并限制权限 + flags = fcntl.fcntl(pr, fcntl.F_GETFL) + fcntl.fcntl(pr, fcntl.F_SETFL, flags | os.O_NONBLOCK) + + # 设置管道文件描述符权限(只允许当前用户访问) + os.fchmod(pr, stat.S_IRUSR | stat.S_IWUSR) + os.fchmod(pipe_write, stat.S_IRUSR | stat.S_IWUSR) + except (OSError, AttributeError): + flags = fcntl.fcntl(pr, fcntl.F_GETFL) + fcntl.fcntl(pr, fcntl.F_SETFL, flags | os.O_NONBLOCK) + task_info.pipe = (pr, pipe_write) self.epoll.register(pr, select.EPOLLIN | select.EPOLLET | select.EPOLLERR | select.EPOLLHUP) self.listening_infos[pr] = task_info diff --git a/torch_npu/profiler/analysis/prof_config/_fwk_file_parser_config.py b/torch_npu/profiler/analysis/prof_config/_fwk_file_parser_config.py index a4edff67c640dbb77f3634dbf11d9335d8cfc251..714a6204012189fdecf9f27d58200db35279635a 100644 --- a/torch_npu/profiler/analysis/prof_config/_fwk_file_parser_config.py +++ b/torch_npu/profiler/analysis/prof_config/_fwk_file_parser_config.py @@ -25,7 +25,7 @@ class FwkFileParserConfig: FILE_BEAN_MAP = { FileTag.TORCH_OP: {"bean": TorchOpBean, "is_tlv": True, "struct_size": 58}, FileTag.OP_MARK: {"bean": OpMarkBean, "is_tlv": True, "struct_size": 40}, - FileTag.MEMORY: {"bean": MemoryUseBean, "is_tlv": True, "struct_size": 76}, + FileTag.MEMORY: {"bean": MemoryUseBean, "is_tlv": True, "struct_size": 77}, FileTag.GC_RECORD: {"bean": GCRecordBean, "is_tlv": False, "struct_size": 24}, FileTag.PYTHON_TRACER_FUNC: {"bean": PythonTracerFuncBean, "is_tlv": False, "struct_size": 33}, FileTag.PYTHON_TRACER_HASH: {"bean": PythonTracerHashBean, "is_tlv": True, "struct_size": 8}, diff --git a/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py b/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py index 51cd44ba02f6290092641bd52ee2b637d81d5999..e61ecc36900de08a5d2477166b54a66f6e48360e 100644 --- a/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py +++ b/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py @@ -55,7 +55,7 @@ class ParserDepsConfig: Constant.DEPS: [Constant.TREE_BUILD_PARSER]}, Constant.DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, Constant.DEPS: [Constant.CANN_EXPORT_PARSER, Constant.MEMORY_PREPARE, - Constant.TREE_BUILD_PARSER]}, + Constant.TREE_BUILD_PARSER, Constant.CANN_ANALYZE_PARSER]}, Constant.MEMORY_TIMELINE_PARSER: {} } diff --git 
a/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py index b70b3e049d8021728709599e9d1c6cde3a2b0622..8acbd1490a281678cf569ae68af77a3106c39e49 100644 --- a/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py @@ -33,6 +33,10 @@ class CANNDataEnum(Enum): REPORT_DB = 12 ANALYSIS_DB = 13 API_STATISTIC = 14 + NIC = 15 + ROCE = 16 + PCIE = 17 + HCCS = 18 class CANNFileParser: @@ -52,9 +56,9 @@ class CANNFileParser: r"^memory_record_\d{1,20}.*\.csv", r"^memory_record_slice_\d{1,20}.*\.csv"], CANNDataEnum.GE_OPERATOR_MEMORY: [r"^ge_operator_memory_\d{1,20}.*\.csv", - r"^ge_operator_memory_slice_\d{1,20}.*\.csv", - r"^operator_memory_\d{1,20}.*\.csv", - r"^operator_memory_slice_\d{1,20}.*\.csv"], + r"^ge_operator_memory_slice_\d{1,20}.*\.csv", + r"^operator_memory_\d{1,20}.*\.csv", + r"^operator_memory_slice_\d{1,20}.*\.csv"], CANNDataEnum.L2_CACHE: [r"^l2_cache_\d{1,20}.*\.csv", r"^l2_cache_slice_\d{1,20}.*\.csv"], CANNDataEnum.AI_CPU: [r"^aicpu_\d{1,20}.*\.csv", r"^aicpu_slice_\d{1,20}.*\.csv"], CANNDataEnum.COMMUNICATION: [r"^communication\.json"], @@ -63,7 +67,11 @@ class CANNFileParser: CANNDataEnum.OP_STATISTIC: [r"^op_statistic_\d{1,20}.*\.csv", r"^op_statistic_slice_\d{1,20}.*\.csv"], CANNDataEnum.NPU_MODULE_MEM: [r"^npu_module_mem_\d{1,20}.*\.csv", r"^npu_module_mem_slice_\d{1,20}.*\.csv"], CANNDataEnum.REPORT_DB: [r"^msprof_\d{1,20}\.db"], - CANNDataEnum.ANALYSIS_DB: [r"communication_analyzer\.db"] + CANNDataEnum.ANALYSIS_DB: [r"communication_analyzer\.db"], + CANNDataEnum.NIC: [r"^nic_\d{1,20}.*\.csv"], + CANNDataEnum.ROCE: [r"^roce_\d{1,20}.*\.csv"], + CANNDataEnum.PCIE: [r"^pcie_\d{1,20}.*\.csv"], + CANNDataEnum.HCCS: [r"^hccs_\d{1,20}.*\.csv"] } def __init__(self, profiler_path: str): @@ -119,12 +127,13 @@ class CANNFileParser: event_dict[unique_id] = data if not flow_dict: - logger.error("There is no HostToDevice flow events in msprof timeline.") + logger.warning("There is no HostToDevice flow events in msprof timeline.") if not event_dict: logger.error("There is no kernel events in msprof timeline.") acl_to_npu_dict = {} + warning_kernel_num = 0 for flow in flow_dict.values(): start_event = flow.get("start") end_event = flow.get("end") @@ -135,11 +144,11 @@ class CANNFileParser: unique_id = f"{pid}-{tid}-{ts}" kernel_event = event_dict.get(unique_id) if not kernel_event: - logger.warning("The kernel event of unique_id(pid: %d, tid: %d, ts: %d) is not exist in msprof timeline.", - pid, tid, ts) + warning_kernel_num += 1 continue acl_to_npu_dict.setdefault(convert_us2ns(start_event.get("ts", 0)), []).append(EventBean(kernel_event)) - + if warning_kernel_num: + logger.warning(f"{warning_kernel_num} kernels do not exist in the msprof timeline.") return acl_to_npu_dict def get_timeline_all_data(self) -> list: @@ -200,7 +209,8 @@ class CANNFileParser: PathManager.remove_path_safety(output_path) def _file_dispatch(self): - all_file_list = ProfilerPathManager.get_output_all_file_list_by_type(self._cann_path, self.MINDSTUDIO_PROFILER_OUTPUT) + all_file_list = ProfilerPathManager.get_output_all_file_list_by_type(self._cann_path, + self.MINDSTUDIO_PROFILER_OUTPUT) all_file_list += ProfilerPathManager.get_analyze_all_file(self._cann_path, self.ANALYZE) all_file_list += ProfilerPathManager.get_database_all_file(self._cann_path) for file_path in all_file_list: diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py 
b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py index de8b46606733a35c255922bfdb927f6e3ddcf3f3..ba29da446eb5c43b8b93ce4d8bea4b9f245da487 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py @@ -1,4 +1,5 @@ from ._fwk_file_parser import FwkFileParser +from .._profiler_config import ProfilerConfig from ..prof_bean._torch_op_node import TorchOpNode from ..prof_common_func._constant import Constant, print_error_msg from ..prof_common_func._log import ProfilerLogger @@ -47,7 +48,7 @@ class FwkCANNRelationParser: def get_kernel_dict(self) -> dict: acl_to_npu_dict = CANNFileParser(self._profiler_path).get_acl_to_npu_data() - if not acl_to_npu_dict: + if not acl_to_npu_dict and ProfilerConfig().get_level() != Constant.LEVEL_NONE: print_error_msg("Failed to get acl to npu flow events.") return acl_to_npu_dict dequeue_data_list = FwkFileParser(self._profiler_path).get_dequeue_data() @@ -60,7 +61,7 @@ class FwkCANNRelationParser: # Get ProfilerStep#x node step_node_list = [node for node in root_node.child_node_list if node.is_profiler_step()] if not step_node_list: - self.logger.error("Get step range failed, the step node list is empty.") + self.logger.warning("Get step range failed, the step node list is empty.") return [] # Gather flow events start time in each step node @@ -73,7 +74,7 @@ class FwkCANNRelationParser: step_id = step_node.event.name.split("#")[-1] if not step_node.corr_id_total: self.logger.error("There is no flow events in %s range.", step_node.event.name) - return [] + continue corr_id_list = sorted(step_node.corr_id_total) min_index, max_index = 0, len(corr_id_list) - 1 min_kernel_list, max_kernel_list = [], [] diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py index 9916220ad181712cd7624be208a8190afecf640b..f78e3821d6e353e12c75ba04909506c1bc6db27e 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py @@ -49,6 +49,7 @@ class FwkFileParser: return enqueue_data_list op_mark_data.sort(key=lambda x: x.time_ns) tid_op_dict = defaultdict(lambda: defaultdict(list)) + match_failed_num = 0 for op_mark in op_mark_data: if not op_mark.is_enqueue: continue @@ -57,14 +58,15 @@ class FwkFileParser: continue start_op_list = tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - self.logger.warning("Enquque data match failed, the tid: %d, origin_name: %s is not exist.", - op_mark.tid, op_mark.origin_name) + match_failed_num += 1 continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns op_mark.dur = op_mark.time_ns - start_op.time_ns enqueue_data_list.append(op_mark) start_op_list.clear() + if match_failed_num: + self.logger.warning(f"{match_failed_num} enqueue data match failed.") return enqueue_data_list def get_dequeue_data(self) -> list: @@ -75,6 +77,7 @@ class FwkFileParser: return dequeue_data_list op_mark_data.sort(key=lambda x: x.time_ns) tid_op_dict = defaultdict(lambda: defaultdict(list)) + match_failed_num = 0 for op_mark in op_mark_data: if not op_mark.is_dequeue: continue @@ -83,14 +86,15 @@ class FwkFileParser: continue start_op_list = tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - self.logger.warning("Dequque data match failed, the tid: %d, origin_name: %s is not exist.", - op_mark.tid, op_mark.origin_name) + 
match_failed_num += 1 continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns op_mark.dur = op_mark.time_ns - start_op.time_ns dequeue_data_list.append(op_mark) start_op_list.clear() + if match_failed_num: + self.logger.warning(f"{match_failed_num} dequeue data match failed.") return dequeue_data_list def get_task_queue_data(self) -> any: @@ -101,6 +105,7 @@ op_mark_data.sort(key=lambda x: x.time_ns) enqueue_tid_op_dict = defaultdict(lambda: defaultdict(list)) dequeue_tid_op_dict = defaultdict(lambda: defaultdict(list)) + enqueue_match_failed_num, dequeue_match_failed_num = 0, 0 for op_mark in op_mark_data: if op_mark.is_enqueue_start: enqueue_tid_op_dict[op_mark.tid][op_mark.origin_name].append(op_mark) @@ -111,8 +116,7 @@ if op_mark.is_enqueue_end: start_op_list = enqueue_tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - self.logger.warning("Enquque data match failed, the tid: %d, origin_name: %s is not exist.", - op_mark.tid, op_mark.origin_name) + enqueue_match_failed_num += 1 continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns @@ -123,14 +127,17 @@ if op_mark.is_dequeue_end: start_op_list = dequeue_tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - self.logger.warning("Dequque data match failed, the tid: %d, origin_name: %s is not exist.", - op_mark.tid, op_mark.origin_name) + dequeue_match_failed_num += 1 continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns op_mark.dur = op_mark.time_ns - start_op.time_ns dequeue_data_list.append(op_mark) start_op_list.clear() + if enqueue_match_failed_num: + self.logger.warning(f"{enqueue_match_failed_num} enqueue data match failed.") + if dequeue_match_failed_num: + self.logger.warning(f"{dequeue_match_failed_num} dequeue data match failed.") return enqueue_data_list, dequeue_data_list def get_torch_op_tree_node(self, only_fwk: bool = False) -> list: @@ -146,33 +153,41 @@ def get_fwk_trace_data(self): torch_op_data = self.get_file_data_by_tag(FileTag.TORCH_OP) - if not torch_op_data: - self.logger.error("Get fwk trace data failed, the torch op data is empty.") - return [] enqueue_data_list, dequeue_data_list = self.get_task_queue_data() - pid = torch_op_data[0].pid + if torch_op_data: + pid = torch_op_data[0].pid + elif enqueue_data_list or dequeue_data_list: + pid = enqueue_data_list[0].pid if enqueue_data_list else dequeue_data_list[0].pid + else: + self.logger.error("Get fwk trace data failed, framework data is empty.") + return [] tid_dict = {} fwk_x_event_list = [None] * ( len(torch_op_data) + len(enqueue_data_list) * 2 + len(dequeue_data_list) * 2) index = 0 fwd_dict = {} + correlation_id_name_dict = {} for torch_op in torch_op_data: self.filter_fwd_bwd_event(fwd_dict, torch_op) tid_dict[torch_op.tid] = False fwk_x_event_list[index] = TraceEventManager.create_x_event(torch_op, "cpu_op") index += 1 - for enqueue_data in enqueue_data_list: - tid_dict[enqueue_data.tid] = False - fwk_x_event_list[index] = TraceEventManager.create_x_event(enqueue_data, "enqueue") - index += 1 - fwk_x_event_list[index] = TraceEventManager.create_task_queue_flow(Constant.FLOW_START_PH, enqueue_data) - index += 1 for dequeue_data in dequeue_data_list: tid_dict[dequeue_data.tid] = True fwk_x_event_list[index] = TraceEventManager.create_x_event(dequeue_data, "dequeue") index += 1 fwk_x_event_list[index] = TraceEventManager.create_task_queue_flow(Constant.FLOW_END_PH, 
dequeue_data) index += 1 + correlation_id_name_dict[dequeue_data.corr_id] = dequeue_data.origin_name + for enqueue_data in enqueue_data_list: + tid_dict[enqueue_data.tid] = False + fwk_x_event_list[index] = TraceEventManager.create_x_event(enqueue_data, "enqueue") + if enqueue_data.corr_id in correlation_id_name_dict: + # append correlation name with '@' prefix for consistency with Dequeue + fwk_x_event_list[index]['name'] += f"@{correlation_id_name_dict[enqueue_data.corr_id]}" + index += 1 + fwk_x_event_list[index] = TraceEventManager.create_task_queue_flow(Constant.FLOW_START_PH, enqueue_data) + index += 1 other_event_list = TraceEventManager.create_m_event(pid, tid_dict) other_event_list.extend(TraceEventManager.create_fwd_flow(fwd_dict)) fwk_x_event_list.extend(other_event_list) @@ -231,14 +246,20 @@ bwd_op_id = node['end']['idx'] torch_op_apis[fwb_op_id][3].append(start_connection_id) torch_op_apis[bwd_op_id][3].append(start_connection_id) - + start_connection_id += 1 def get_fwk_api(self) -> dict: torch_op_data = self.get_file_data_by_tag(FileTag.TORCH_OP) - if not torch_op_data: + enqueue_data_list, dequeue_data_list = self.get_task_queue_data() + if torch_op_data: + pid = torch_op_data[0].pid + elif enqueue_data_list or dequeue_data_list: + pid = enqueue_data_list[0].pid if enqueue_data_list else dequeue_data_list[0].pid + else: + self.logger.error("Get fwk api data failed, framework data is empty.") return {} - pid = torch_op_data[0].pid + torch_op_apis = [] fwd_bwd_dict = {} torch_op_idx = 0 @@ -248,7 +269,8 @@ for torch_op in torch_op_data: api = [torch_op.ts, torch_op.end_ns, contact_2num(pid, torch_op.tid), [], torch_op.name, torch_op.args.get(Constant.SEQUENCE_NUMBER, -1), torch_op.args.get(Constant.FORWARD_THREAD_ID), - torch_op.args.get(Constant.INPUT_DTYPES), torch_op.args.get(Constant.INPUT_SHAPES), torch_op.call_stack] + torch_op.args.get(Constant.INPUT_DTYPES), torch_op.args.get(Constant.INPUT_SHAPES), + torch_op.call_stack] if torch_op.name == "mstx_mark_op": mstx_mark_apis.append(api) else: @@ -260,17 +282,24 @@ connection_ids = [] task_enqueues = [] task_dequeues = [] - enqueue_data_list, dequeue_data_list = self.get_task_queue_data() - for enqueue_data in enqueue_data_list: - task_enqueues.append( - [enqueue_data.ts, enqueue_data.ts + enqueue_data.dur, contact_2num(pid, enqueue_data.tid), - enqueue_data.corr_id, enqueue_data.name]) - connection_ids.append(enqueue_data.corr_id) + correlation_id_name_dict = {} for dequeue_data in dequeue_data_list: task_dequeues.append( [dequeue_data.ts, dequeue_data.ts + dequeue_data.dur, contact_2num(pid, dequeue_data.tid), dequeue_data.corr_id, dequeue_data.name]) - + correlation_id_name_dict[dequeue_data.corr_id] = dequeue_data.origin_name + torch_tids.add(dequeue_data.tid) + for enqueue_data in enqueue_data_list: + name = enqueue_data.name + if enqueue_data.corr_id in correlation_id_name_dict: + # append correlation name with '@' prefix for consistency with Dequeue + name += f"@{correlation_id_name_dict[enqueue_data.corr_id]}" + task_enqueues.append( + [enqueue_data.ts, enqueue_data.ts + enqueue_data.dur, contact_2num(pid, enqueue_data.tid), + enqueue_data.corr_id, name]) + connection_ids.append(enqueue_data.corr_id) + torch_tids.add(enqueue_data.tid) + start_connection_id = max(connection_ids) + 1 if connection_ids else 0 self.update_fwd_bwd_connection_id(fwd_bwd_dict, torch_op_apis, start_connection_id) diff --git 
a/torch_npu/profiler/analysis/prof_view/_base_parser.py b/torch_npu/profiler/analysis/prof_view/_base_parser.py index 26dc595cd7ec27540b7f9e6693bfddc32a86a2d2..08f51fd04c5dfc884284fd75c66ef2c7f66b43a6 100644 --- a/torch_npu/profiler/analysis/prof_view/_base_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_base_parser.py @@ -29,8 +29,9 @@ class BaseParser(ConcurrentTask, ABC): self._profiler_path = None self._output_path = None deps, mode = self._init_param(name) + self._export_type = param_dict.get(Constant.EXPORT_TYPE, []) super(BaseParser, self).__init__(name, deps, mode) - + def _init_param(self, name: str) -> any: self._profiler_path = self._param_dict.get("profiler_path") self._output_path = self._param_dict.get("output_path") diff --git a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py index b1344bc0a240d528d3006442dc58f5a22f8582fe..b6c545420c3bb961640c7ef25dc54e8050fad6ae 100644 --- a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py @@ -13,6 +13,10 @@ class IntegrateParser(BaseParser): copy and integrate files from cann """ CSV_FILENAME_MAP = { + CANNDataEnum.NIC: "nic.csv", + CANNDataEnum.ROCE: "roce.csv", + CANNDataEnum.PCIE: "pcie.csv", + CANNDataEnum.HCCS: "hccs.csv", CANNDataEnum.AI_CPU: "data_preprocess.csv", CANNDataEnum.L2_CACHE: "l2_cache.csv", CANNDataEnum.API_STATISTIC: "api_statistic.csv", diff --git a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py index b06d7d3d724916286fcbf65de2366b6f13ddbd6b..30ffd8be8ba46e0b8cc5ac1300c4eba389211eaa 100644 --- a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py @@ -77,7 +77,7 @@ class KernelViewParser(BaseParser): return step_range = FwkCANNRelationParser(self._profiler_path).get_step_range(torch_op_node[0], kernel_dict) if not step_range: - self.logger.error("Kernel view get step range failed, the step range is empty.") + self.logger.warning("Kernel view get step range failed, the step range is empty.") for step_data in step_range: step_id = step_data.get(Constant.STEP_ID) step_start = convert_ns2us_str(step_data.get(Constant.START_TS, 0)) diff --git a/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py index 80c5755819b0d8fd6cfc63dd25cd9ca463f0986d..4cb4ed35db5af171fbc98b0d92ab3eb884956f65 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py @@ -126,9 +126,9 @@ class MemoryPrepareParser(BaseParser): valid_record_list = self._get_valid_record_entry(ptr_records) pid_mem_buf.extend(valid_record_list) pid_mem_buf.sort(key=lambda x: x[0].time_ns) - if Constant.Text in ProfilerConfig().export_type: + if Constant.Text in self._export_type: self.memory_data.setdefault(Constant.Text, self._complete_record_entry(pid_mem_buf, torch_ops)) - if Constant.Db in ProfilerConfig().export_type: + if Constant.Db in self._export_type: self.memory_data.setdefault(Constant.Db, self._complete_record_entry_for_db(pid_mem_buf, torch_ops)) @staticmethod @@ -210,13 +210,15 @@ class MemoryPrepareParser(BaseParser): else: op_name = self._find_real_op_name_of_record(dequeue_record, torch_ops) if records_len == 1: - self._incomplete_num += 2 + if hasattr(records[0], 'component_type') and 
records[0].component_type == Constant.CACHING_TYPE: + self._incomplete_num += 2 combine_data = [op_name, records[0].alloc_size_for_db, records[0].time_ns, None, None, None, None, records[0].total_allocated_for_db, records[0].total_reserved_for_db, records[0].total_active_for_db, None, None, None, records[0].stream_ptr, records[0].device_index] elif records_len == 2: - self._incomplete_num += 1 + if hasattr(records[0], 'component_type') and records[0].component_type == Constant.CACHING_TYPE: + self._incomplete_num += 1 active_release_time = records[1].time_ns if records[1].data_type == Constant.MEMORY_BLOCK_FREE else None release_time = records[1].time_ns if records[1].data_type == Constant.MEMORY_FREE else None duration_time = records[1].time_ns - records[0].time_ns if records[1].data_type == Constant.MEMORY_FREE else None @@ -251,13 +253,15 @@ class MemoryPrepareParser(BaseParser): else: op_name = self._find_real_op_name_of_record(dequeue_record, torch_ops) if records_len == 1: - self._incomplete_num += 2 + if hasattr(records[0], 'component_type') and records[0].component_type == Constant.CACHING_TYPE: + self._incomplete_num += 2 combine_data = [op_name, records[0].alloc_size, convert_ns2us_str(records[0].time_ns, "\t"), None, None, None, None, records[0].total_allocated, records[0].total_reserved, records[0].total_active, None, None, None, records[0].stream_ptr, records[0].device_tag] elif records_len == 2: - self._incomplete_num += 1 + if hasattr(records[0], 'component_type') and records[0].component_type == Constant.CACHING_TYPE: + self._incomplete_num += 1 active_release_time = convert_ns2us_str(records[1].time_ns, "\t") if records[1].data_type == Constant.MEMORY_BLOCK_FREE else None release_time = convert_ns2us_str(records[1].time_ns, "\t") if records[1].data_type == Constant.MEMORY_FREE else None duration_time = convert_ns2us_str(records[1].time_ns - records[0].time_ns, "\t") if records[1].data_type == Constant.MEMORY_FREE else None diff --git a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py index fa834e543be39c1f861015e540b26a7fb81f1336..a82c3dc3c8f08ebe6875f0b7a5e59730c6cf4e6e 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py @@ -53,6 +53,11 @@ class MemoryViewParser(BaseParser): @staticmethod def _combine_record(last_record, cur_record): + if hasattr(cur_record, 'component_type') and cur_record.component_type == Constant.WORKSPACE_TYPE: + cur_record_list = [Constant.WORKSPACE, convert_ns2us_str(cur_record.time_ns, tail="\t"), + cur_record.total_allocated, cur_record.total_reserved, cur_record.total_active, + cur_record.stream_ptr, cur_record.device_tag] + return [cur_record_list] cur_record_list = cur_record.row if last_record: pta_ge_record_list = [Constant.PTA_GE, convert_ns2us_str(cur_record.time_ns, tail="\t"), @@ -104,7 +109,8 @@ class MemoryViewParser(BaseParser): if ge_record.time_ns >= pta_record.time_ns: self.size_record_list.extend(self._combine_record(last_ge_record, pta_record)) pta_ptr += 1 - last_pta_record = pta_record + if hasattr(pta_record, 'component_type') and pta_record.component_type != Constant.WORKSPACE_TYPE: + last_pta_record = pta_record else: self.size_record_list.extend(self._combine_record(last_pta_record, ge_record)) ge_ptr += 1 diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index 
b9b7af71ada29276eb334f1623b031dcc581d51c..b5e0502ee410027dea1bc9d0f2b324c969bf26c3 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -1,3 +1,5 @@ +import copy +from collections import defaultdict from enum import Enum from ._base_parser import BaseParser from ..prof_common_func._constant import Constant @@ -11,6 +13,24 @@ from ..prof_parse._fwk_file_parser import FwkFileParser __all__ = [] +def default_time(): + return { + 'compute': 0, + 'comunNotOverlp': 0, + 'Overlp': 0, + 'comun': 0, + 'free': 0, + 'stage': 0, + 'bubble': 0, + 'comunNotOverlpRec': 0, + 'prepare': 0 + } + + +def step_time_dict(): + return defaultdict(default_time) + + class StepInfoIndex(Enum): ID = 0 START_TS = 1 @@ -25,8 +45,8 @@ class TraceStepTimeParser(BaseParser): STEP_TRACE = "step_trace_time.csv" timeflag = {'Communication': 'comun', 'Computing': 'compute', 'Free': 'free', 'Communication(Not Overlapped)': 'comunNotOverlp', 'hcom_receive': 'bubble'} - title = ['Step', 'Computing', 'Communication(Not Overlapped)', 'Overlapped', 'Communication', 'Free', 'Stage', - 'Bubble', 'Communication(Not Overlapped and Exclude Receive)', 'Preparing'] + title = ['Device_id', 'Step', 'Computing', 'Communication(Not Overlapped)', 'Overlapped', 'Communication', + 'Free', 'Stage', 'Bubble', 'Communication(Not Overlapped and Exclude Receive)', 'Preparing'] def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) @@ -43,18 +63,21 @@ class TraceStepTimeParser(BaseParser): return False @classmethod - def count_time(cls, add_type, start_time, duration, step_list, save_time): + def count_time(cls, add_type, data, step_list, save_time, pid_device_map): + start_time = data.get('ts', 0) + duration = data.get('dur', 0) + device_id = pid_device_map[data['pid']] cur_step = None if not cls.is_float_num(start_time) or not cls.is_float_num(duration): print('Ts or dur format error!') return start_time = float(start_time) duration = float(duration) - for step in step_list: + for step in step_list.get(device_id, []): if step[StepInfoIndex.START_TS.value] <= start_time < step[StepInfoIndex.END_TS.value]: cur_step = step[StepInfoIndex.ID.value] break - for step in step_list: + for step in step_list.get(device_id, []): if cur_step == step[StepInfoIndex.ID.value]: if start_time < step[StepInfoIndex.E2E_START_TS.value] or \ step[StepInfoIndex.E2E_START_TS.value] == -1: @@ -67,10 +90,7 @@ class TraceStepTimeParser(BaseParser): step[StepInfoIndex.FIRST_TASK_TS.value] == -1: step[StepInfoIndex.FIRST_TASK_TS.value] = start_time break - for cur_save in save_time: - if cur_save.get('step') == cur_step: - cur_save[cls.timeflag.get(add_type)] += duration - break + save_time[device_id][cur_step][cls.timeflag.get(add_type)] += duration @classmethod def get_e2e_time(cls, step, step_list): @@ -91,43 +111,57 @@ class TraceStepTimeParser(BaseParser): def create_step_file(self, output_path: str, json_str: list, file_name: str) -> None: step_list = [] - save_time = [] + save_time = defaultdict(step_time_dict) if not json_str: return - # get step time + # obtain the mapping between pid and device_id(rank_id) + pid_device_map = {} + for data in json_str: + if data.get('name') == 'process_labels' and data.get('args', {}).get('labels', '').startswith('NPU'): + label = data['args']['labels'] + pid_device_map[data.get('pid')] = -1 if label == 'NPU' else int(label.split(' ')[1]) # "labels": "NPU 0" + # get initial step time for cur_step in 
self.step_range: step_list.append( [cur_step.get(Constant.STEP_ID), convert_ns2us_float(cur_step.get(Constant.START_TS)), convert_ns2us_float(cur_step.get(Constant.END_TS)), -1, -1, convert_ns2us_float(cur_step.get(Constant.FWK_START_TS)), -1]) - save_time.append( - {'step': cur_step.get(Constant.STEP_ID), 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, 'comun': 0, - 'free': 0, 'stage': 0, 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0}) if not self.step_range: - save_time.append( - {'step': None, 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, 'comun': 0, 'free': 0, 'stage': 0, - 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0}) step_list.append([None, -1, -1, -1, -1, -1, -1]) - + # every device should have its own step_list + step_dict = {} + for device in set(pid_device_map.values()): + step_dict[device] = copy.deepcopy(step_list) has_analysis_data_flag = False + bubble_data = [] + # traverse json and calculate time for data in json_str: if data.get('name') in {'Communication', 'Computing', 'Free', 'Communication(Not Overlapped)'}: - self.count_time(data.get('name'), data.get('ts', 0), data.get('dur', 0), step_list, save_time) + self.count_time(data.get('name'), data, step_dict, save_time, pid_device_map) has_analysis_data_flag = True elif str(data.get('name')).startswith('hcom_receive'): - self.count_time('hcom_receive', data.get('ts', 0), data.get('dur', 0), step_list, save_time) + bubble_data.append(data) + self.count_time('hcom_receive', data, step_dict, save_time, pid_device_map) if not has_analysis_data_flag: return - for calc_time in save_time: - calc_time['comunNotOverlpRec'] = calc_time['comunNotOverlp'] - calc_time['bubble'] - calc_time['Overlp'] = calc_time['comun'] - calc_time['comunNotOverlp'] - calc_time['stage'] = self.get_e2e_time(calc_time['step'], step_list) - calc_time['bubble'] - calc_time['prepare'] = self.get_prepare_time(calc_time['step'], step_list) print_time = [] - for step in save_time: - print_time.append( - [step['step'], step['compute'], step['comunNotOverlp'], step['Overlp'], step['comun'], step['free'], - step['stage'], step['bubble'], step['comunNotOverlpRec'], step['prepare']]) + for device, device_time in save_time.items(): + for step, step_time in device_time.items(): + if self.step_range and step is None: + continue + step_time['comunNotOverlpRec'] = step_time['comunNotOverlp'] - step_time['bubble'] + step_time['Overlp'] = step_time['comun'] - step_time['comunNotOverlp'] + step_time['stage'] = self.get_e2e_time(step, step_dict.get(device, [])) - step_time['bubble'] + step_time['prepare'] = self.get_prepare_time(step, step_dict.get(device, [])) + print_time.append( + [device, step, step_time['compute'], step_time['comunNotOverlp'], step_time['Overlp'], + step_time['comun'], step_time['free'], step_time['stage'], step_time['bubble'], + step_time['comunNotOverlpRec'], step_time['prepare']]) + if print_time: + if self.step_range: + print_time.sort(key=lambda x: (x[0], int(x[1]))) # step is a string + else: + print_time.sort(key=lambda x: x[0]) # step is None FileManager.create_csv_file(output_path, print_time, file_name, self.title) def run(self, deps_data: dict): diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py index fb5cc724944498b0dfa131e6b0c21fdd1cd6d059..d5b577eaee1a53c1a3cf8b4f42293d712fce6083 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py @@ 
-47,13 +47,13 @@ class CANNAnalyzeParser(BaseParser): print_error_msg(err_msg) raise RuntimeError(err_msg) - if Constant.Db in ProfilerConfig().export_type: + if Constant.Db in self._export_type: analyze_cmd_list = [self.msprof_path, "--analyze=on", "--type=db", f"--output={self._cann_path}"] completed_analysis = subprocess.run(analyze_cmd_list, capture_output=True, shell=False) if completed_analysis.returncode != self.COMMAND_SUCCESS: print_warn_msg("Failed to analyze CANN DB Profiling data.") - if Constant.Text in ProfilerConfig().export_type: + if Constant.Text in self._export_type: analyze_cmd_list = [self.msprof_path, "--analyze=on", f"--output={self._cann_path}"] completed_analysis = subprocess.run(analyze_cmd_list, capture_output=True, shell=False) if completed_analysis.returncode != self.COMMAND_SUCCESS: diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py index 5723d2ea6419bdf1ef046a319e1fc0e3762cd491..49d4e7eb8f6ac9b5d08f6de0177274ce148bd9b7 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py @@ -56,13 +56,13 @@ class CANNExportParser(BaseParser): self._check_prof_data_size() start_time = datetime.utcnow() - if Constant.Db in ProfilerConfig().export_type: + if Constant.Db in self._export_type: analyze_cmd_list = [self.msprof_path, "--export=on", "--type=db", f"--output={self._cann_path}"] completed_analysis = subprocess.run(analyze_cmd_list, capture_output=True, shell=False) if completed_analysis.returncode != self.COMMAND_SUCCESS: raise RuntimeError("Failed to export CANN DB Profiling data." + prof_error(ErrCode.INTERNAL)) - if Constant.Text in ProfilerConfig().export_type: + if Constant.Text in self._export_type: # 避免老CANN包无type参数报错 analyze_cmd_list = [self.msprof_path, "--export=on", f"--output={self._cann_path}"] completed_analysis = subprocess.run(analyze_cmd_list, capture_output=True, shell=False) @@ -102,7 +102,7 @@ class CANNTimelineParser(BaseParser): if not os.path.isdir(self._cann_path): return Constant.SUCCESS, None ProfilerConfig().load_info(self._profiler_path) - if Constant.Text in ProfilerConfig().export_type: + if Constant.Text in self._export_type: output_path = os.path.join(self._cann_path, "mindstudio_profiler_output") while True: if os.path.exists(output_path): @@ -114,10 +114,10 @@ class CANNTimelineParser(BaseParser): except InterruptedError: return Constant.FAIL, None else: - patten = r'^ascend_pytorch_profiler\.db$' if ProfilerConfig().rank_id == -1 else r'^ascend_pytorch_profiler_\d+\.db$' + patten = r'^msprof_\d+\.db$' while True: - for file in os.listdir(self._output_path): - if re.match(patten, file) and os.path.isfile(os.path.join(self._output_path, file)): + for file in os.listdir(self._cann_path): + if re.match(patten, file) and os.path.isfile(os.path.join(self._cann_path, file)): return Constant.SUCCESS, None try: time.sleep(Constant.SLEEP_TIME) diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py index 811ce9220d98347b9a516e6b4297396842d93050..11afb53b4f14fcea475c2789552084c752d6c2d6 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py @@ -94,9 +94,9 @@ class CommunicationDbParser(CommunicationParser): }] def 
generate_view(self) -> None: - self.generate_communication_db(self._output_path) + self.generate_communication_db() - def generate_communication_db(self, output_path: str): + def generate_communication_db(self): db_files = CANNFileParser(self._profiler_path).get_file_list_by_type(CANNDataEnum.ANALYSIS_DB) if not db_files: return @@ -104,7 +104,7 @@ class CommunicationDbParser(CommunicationParser): band_width_data, matrix_data, time_data = \ self.set_step_and_type_info_for_db_data(band_width_data, matrix_data, time_data) matrix_data = self.reformat_matrix_db_data(matrix_data) - self.save_communication_db_data(band_width_data, matrix_data, time_data, output_path) + self.save_communication_db_data(band_width_data, matrix_data, time_data) def get_communication_db_data(self, db_path: str): # 在处理原analysis.db里的数据 diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py index a2c36511a1dd87b60b161b329c8ac35168e3aca0..89cc322980ad7fab77e707897700e556c71bf3be 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py @@ -45,7 +45,7 @@ class DbParser(BaseParser): AnalysisDb().init(os.path.join(self._output_path, DbConstant.DB_ANALYSIS)) parser_db_map = self.PYTORCH_DB_MAP - if ProfilerPathManager.get_cann_path(self._profiler_path) and ProfilerConfig().get_level() != "Level_none": + if ProfilerPathManager.get_cann_path(self._profiler_path) and ProfilerConfig().get_level() != Constant.LEVEL_NONE: parser_db_map = {**self.PYTORCH_DB_MAP, **self.ANALYSIS_DB_MAP} try: for name, parser in parser_db_map.items(): diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py index c9a7d5aff179a8ceee400ffe82ad6fe632f2a1f8..2ae2ac6474e707e27f1717298b9de3cac8d725b9 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py @@ -158,16 +158,16 @@ class FwkApiDbParser(BaseParser): sql = "select startNs, endNs, globalTid, connectionId from {} " \ "where name = {} and type = 10000 order by startNs" \ .format(DbConstant.TABLE_CANN_API, node_launch_str_id) # 10000 : node level - node_lauch_apis = TorchDb().fetch_all_data(sql) - if not node_lauch_apis: + node_launch_apis = TorchDb().fetch_all_data(sql) + if not node_launch_apis: raise RuntimeWarning("Failed to get node launch apis") torch_op_apis.sort(key=lambda x: x[TorchOpDataOri.START_NS.value]) torch_op_len = len(torch_op_apis) if task_enqueues and task_dequeues: self.get_torch_op_connection_ids_with_task_queue(task_enqueues, task_dequeues, torch_op_apis, torch_op_len, - node_lauch_apis) + node_launch_apis) else: - self.get_torch_op_connection_ids_without_task_queue(torch_op_apis, torch_op_len, node_lauch_apis) + self.get_torch_op_connection_ids_without_task_queue(torch_op_apis, torch_op_len, node_launch_apis) def get_torch_op_connection_ids_with_task_queue(self, task_enqueues: list, task_dequeues: list, torch_op_apis: list, torch_op_len: int, node_lauch_apis: list): enqueue_corr_ids = {task_enqueue[TaskQueueDataOri.CORRELATION_ID.value] for task_enqueue in task_enqueues} diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py index 
6afff95c707963c0ff19cdf2fe861ddecd1b0de8..34a5fc27f856530c83cb66ba93a63afe367aa746 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py @@ -65,6 +65,8 @@ class MemoryDbParser(BaseParser): @staticmethod def _combine_record(last_record, cur_record): + if cur_record[MemoryRecordTableRow.COMPONENT.value] == Str2IdManager().get_id_from_str(Constant.WORKSPACE): + return [cur_record] pta_ge_record_list = cur_record[:] pta_ge_record_list[MemoryRecordTableRow.COMPONENT.value] = Str2IdManager().get_id_from_str(Constant.PTA_GE) if last_record: @@ -175,13 +177,20 @@ class MemoryDbParser(BaseParser): TorchDb().create_table_with_headers(DbConstant.TABLE_OPERATOR_MEMORY, TableColumnsManager.TableColumns.get(DbConstant.TABLE_OPERATOR_MEMORY)) TorchDb().insert_data_into_table(DbConstant.TABLE_OPERATOR_MEMORY, self._pta_op_memory_data + self._ge_op_memory_data) - def get_pta_memort_record_list(self): + def get_pta_memory_record_list(self): if not self._pta_memory_bean_list: return for memory_bean in self._pta_memory_bean_list: + if memory_bean.component_type == Constant.WORKSPACE_TYPE: + self._pta_record_list.append([Str2IdManager().get_id_from_str(Constant.WORKSPACE), memory_bean.time_ns, + memory_bean.total_allocated_for_db, memory_bean.total_reserved_for_db, + memory_bean.total_active_for_db, memory_bean.stream_ptr, + memory_bean.device_index]) + continue self._pta_record_list.append([Str2IdManager().get_id_from_str(Constant.PTA), memory_bean.time_ns, memory_bean.total_allocated_for_db, memory_bean.total_reserved_for_db, - memory_bean.total_active_for_db, memory_bean.stream_ptr, memory_bean.device_index]) + memory_bean.total_active_for_db, memory_bean.stream_ptr, + memory_bean.device_index]) def get_pta_ge_record_list(self): """ @@ -203,7 +212,9 @@ class MemoryDbParser(BaseParser): if ge_record[1] >= pta_record[1]: self._record_list.extend(self._combine_record(last_ge_record, pta_record)) pta_ptr += 1 - last_pta_record = pta_record + if pta_record[MemoryRecordTableRow.COMPONENT.value] != \ + Str2IdManager().get_id_from_str(Constant.WORKSPACE): + last_pta_record = pta_record else: self._record_list.extend(self._combine_record(last_pta_record, ge_record)) ge_ptr += 1 @@ -218,7 +229,7 @@ class MemoryDbParser(BaseParser): pta_ptr += 1 def save_memory_record_data_to_db(self): - self.get_pta_memort_record_list() + self.get_pta_memory_record_list() self.get_pta_ge_record_list() if not self._record_list: return diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py index 97a164b73d67d83be92127668772130efcec7dc7..db82064fdefbc32b0d034f8b34db9b8276e18208 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from collections import defaultdict from enum import Enum from .._base_parser import BaseParser from ...prof_common_func._constant import Constant, print_warn_msg @@ -25,10 +26,11 @@ from ...prof_parse._fwk_file_parser import FwkFileParser __all__ = [] -class CommunicationOpIndex(Enum): +class OpIndex(Enum): OP_NAME = 0 START_NS = 1 END_NS = 2 + DEVICE_ID = 3 class TraceStepTimeDbParser(BaseParser): @@ -36,9 +38,8 @@ class TraceStepTimeDbParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] - self.string_id_map = {} - self.compute_task_info = {} - self.communication_op_info = [] + self.compute_task_info = defaultdict(list) + self.communication_op_info = defaultdict(list) ProfilerLogger.init(self._profiler_path, "TraceStepTimeDbParser") self.logger = ProfilerLogger.get_instance() @@ -86,28 +87,32 @@ class TraceStepTimeDbParser(BaseParser): {'step': None, 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, 'comun': 0, 'free': 0, 'stage': 0, 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0}) else: - # get step time - for cur_step in self.step_range: - save_info = { - 'step': cur_step.get(Constant.STEP_ID), 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, - 'comun': 0, 'free': 0, 'stage': 0, 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0 - } - origin_compute_data = self._get_compute_data_in_step(cur_step) - origin_communication_data, bubble_data = self._get_communication_data_in_step(cur_step) - compute_data = RangeCaculator.merge_continuous_intervals(origin_compute_data) - save_info['compute'] = sum(data.end_ts - data.start_ts for data in compute_data) - communication_data = RangeCaculator.merge_continuous_intervals(origin_communication_data) - save_info['comun'] = sum(data.end_ts - data.start_ts for data in communication_data) - pure_communication_data, free_data = \ - RangeCaculator.compute_pipeline_overlap(communication_data, compute_data) - save_info['comunNotOverlp'] = \ - sum(data.end_ts - data.start_ts for data in pure_communication_data) - save_info['free'] = sum(data.end_ts - data.start_ts for data in free_data) - save_info['bubble'] = sum(data.end_ts - data.start_ts for data in bubble_data) - save_info['stage'] = self.get_e2e_time(compute_data + communication_data) - save_info['bubble'] - first_task_start_ts = self._get_first_device_task_ts(compute_data, communication_data) - save_info['prepare'] = self.get_prepare_time(first_task_start_ts, cur_step) - save_time.append(save_info) + device_ids = list(set(self.compute_task_info.keys()) | set(self.communication_op_info.keys())) + device_ids.sort() + for device_id in device_ids: + # get step time + for cur_step in self.step_range: + save_info = { + 'step': cur_step.get(Constant.STEP_ID), 'compute': 0, 'comunNotOverlp': 0, 'Overlp': 0, + 'comun': 0, 'free': 0, 'stage': 0, 'bubble': 0, 'comunNotOverlpRec': 0, 'prepare': 0, + 'deviceId': device_id + } + origin_compute_data = self._get_compute_data_in_step(cur_step, device_id) + origin_communication_data, bubble_data = self._get_communication_data_in_step(cur_step, device_id) + compute_data = RangeCaculator.merge_continuous_intervals(origin_compute_data) + save_info['compute'] = sum(data.end_ts - data.start_ts for data in compute_data) + communication_data = RangeCaculator.merge_continuous_intervals(origin_communication_data) + save_info['comun'] = sum(data.end_ts - data.start_ts for data in communication_data) + pure_communication_data, free_data = \ + RangeCaculator.compute_pipeline_overlap(communication_data, 
compute_data) + save_info['comunNotOverlp'] = \ + sum(data.end_ts - data.start_ts for data in pure_communication_data) + save_info['free'] = sum(data.end_ts - data.start_ts for data in free_data) + save_info['bubble'] = sum(data.end_ts - data.start_ts for data in bubble_data) + save_info['stage'] = self.get_e2e_time(compute_data + communication_data) - save_info['bubble'] + first_task_start_ts = self._get_first_device_task_ts(compute_data, communication_data) + save_info['prepare'] = self.get_prepare_time(first_task_start_ts, cur_step) + save_time.append(save_info) for calc_time in save_time: calc_time['comunNotOverlpRec'] = calc_time['comunNotOverlp'] - calc_time['bubble'] @@ -116,7 +121,8 @@ class TraceStepTimeDbParser(BaseParser): for step in save_time: step_time_data = [step['compute'], step['comunNotOverlp'], step['Overlp'], step['comun'], step['free'], step['stage'], step['bubble'], step['comunNotOverlpRec'], step['prepare']] - reformat_time.append([step['step'], ] + [convert_ns2us_float(data) for data in step_time_data]) + reformat_time.append([step['deviceId'], step['step']] + \ + [convert_ns2us_float(data) for data in step_time_data]) self.save_step_trace_db_data(reformat_time) def _init_step_range(self, deps_data: dict): @@ -126,37 +132,75 @@ class TraceStepTimeDbParser(BaseParser): if not TorchDb().create_connect_db(): print_warn_msg(f"Failed to connect to db file: {TorchDb().get_db_path()}") return - if TorchDb().judge_table_exist(DbConstant.TABLE_STRING_IDS): - sql = "select id, value from {}".format(DbConstant.TABLE_STRING_IDS) - string_id_data = TorchDb().fetch_all_data(sql) - self.string_id_map = {data[0]: data[1] for data in string_id_data} + if not TorchDb().judge_table_exist(DbConstant.TABLE_STRING_IDS): + self.logger.error(f"{DbConstant.TABLE_STRING_IDS} does not exist.") + return if TorchDb().judge_table_exist(DbConstant.TABLE_COMPUTE_TASK_INFO): - sql = "select name, globalTaskId from {}".format(DbConstant.TABLE_COMPUTE_TASK_INFO) + sql = """ + SELECT + STRING_IDS.value, + task.startNs, + task.endNs, + task.deviceId + FROM COMPUTE_TASK_INFO AS comp + JOIN TASK AS task + ON comp.globalTaskId = task.globalTaskId + JOIN STRING_IDS + ON comp.name = STRING_IDS.id + """ compute_task_data = TorchDb().fetch_all_data(sql) - self.compute_task_info = {data[1]: data[0] for data in compute_task_data} + for item in compute_task_data: + self.compute_task_info[item[OpIndex.DEVICE_ID.value]].append(item) if TorchDb().judge_table_exist(DbConstant.TABLE_COMMUNICATION_OP): - sql = "select opName, startNs, endNs from {}".format(DbConstant.TABLE_COMMUNICATION_OP) - self.communication_op_info = TorchDb().fetch_all_data(sql) - - def _get_compute_data_in_step(self, step_info): + sql = """ + WITH comm_info AS ( + SELECT (SELECT value FROM STRING_IDS WHERE id = c.opName) AS opName, + startNs, + endNs, + connectionId + FROM COMMUNICATION_OP c + ) + SELECT + comm.opName, + comm.startNs, + comm.endNs, + t.deviceId + FROM comm_info comm + JOIN ( + SELECT + connectionId, + deviceId + FROM TASK + GROUP BY connectionId + HAVING COUNT(DISTINCT deviceId) = 1 + ) t + ON comm.connectionId = t.connectionId + """ + communication_op_data = TorchDb().fetch_all_data(sql) + for item in communication_op_data: + self.communication_op_info[item[OpIndex.DEVICE_ID.value]].append(item) + + def _get_compute_data_in_step(self, step_info, device_id): compute_data = [] - for task_id, task_info in step_info.get(Constant.TASK_INFO, {}).items(): - if task_id in self.compute_task_info: - compute_data.append( - 
RangeCaculator.generate_time_range(task_info.get("startNs"), task_info.get("endNs"))) + for op_info in self.compute_task_info[device_id]: + op_start_time = op_info[OpIndex.START_NS.value] + if not (step_info.get(Constant.START_TS) <= op_start_time < step_info.get(Constant.END_TS)): + continue + time_range = RangeCaculator.generate_time_range(op_start_time, op_info[OpIndex.END_NS.value]) + compute_data.append(time_range) return compute_data - def _get_communication_data_in_step(self, step_info): + def _get_communication_data_in_step(self, step_info, device_id): communication_data = [] bubble_data = [] - for op_info in self.communication_op_info: - op_start_time = op_info[CommunicationOpIndex.START_NS.value] + for op_info in self.communication_op_info[device_id]: + op_start_time = op_info[OpIndex.START_NS.value] if not (step_info.get(Constant.START_TS) <= op_start_time < step_info.get(Constant.END_TS)): continue time_range = RangeCaculator.generate_time_range( - op_start_time, op_info[CommunicationOpIndex.END_NS.value], class_range=CommunicationTimeRange) + op_start_time, op_info[OpIndex.END_NS.value], class_range=CommunicationTimeRange) communication_data.append(time_range) - op_name = self.string_id_map.get(op_info[CommunicationOpIndex.OP_NAME.value], '') + op_name = op_info[OpIndex.OP_NAME.value] if op_name.startswith('hcom_receive'): bubble_data.append(time_range) return communication_data, bubble_data diff --git a/torch_npu/profiler/experimental_config.py b/torch_npu/profiler/experimental_config.py index d9c3be7978c68243ee6f07556c63af1da3972226..78c9b5a353bfbdb876c84eae68bf16a2195c2f7e 100644 --- a/torch_npu/profiler/experimental_config.py +++ b/torch_npu/profiler/experimental_config.py @@ -12,7 +12,8 @@ __all__ = [ "supported_export_type", "ProfilerLevel", "AiCMetrics", - "ExportType" + "ExportType", + "HostSystem" ] @@ -54,6 +55,14 @@ class ExportType: Text = Constant.Text +class HostSystem: + CPU = Constant.CPU + MEM = Constant.MEM + DISK = Constant.DISK + NETWORK = Constant.NETWORK + OSRT = Constant.OSRT + + class _ExperimentalConfig: def __init__(self, profiler_level: int = Constant.LEVEL0, @@ -64,7 +73,12 @@ class _ExperimentalConfig: record_op_args: bool = False, op_attr: bool = False, gc_detect_threshold: float = None, - export_type: Union[str, list] = None): + export_type: Union[str, list] = None, + host_sys: list = None, + mstx_domain_include: list = None, + mstx_domain_exclude: list = None, + sys_io: bool = False, + sys_interconnection: bool = False): self._profiler_level = profiler_level self._aic_metrics = aic_metrics if self._profiler_level != Constant.LEVEL_NONE: @@ -75,9 +89,16 @@ class _ExperimentalConfig: self._data_simplification = data_simplification self.record_op_args = record_op_args self._export_type = self._conver_export_type_to_list(export_type) + self._host_sys = host_sys if host_sys else [] self._op_attr = op_attr self._gc_detect_threshold = gc_detect_threshold + self._mstx_domain_include = mstx_domain_include if mstx_domain_include else [] + self._mstx_domain_exclude = mstx_domain_exclude if mstx_domain_exclude else [] + self._sys_io = sys_io + self._sys_interconnection = sys_interconnection self._check_params() + self._check_mstx_domain_params() + self._check_host_sys_params() def __call__(self) -> torch_npu._C._profiler._ExperimentalConfig: return torch_npu._C._profiler._ExperimentalConfig(trace_level=self._profiler_level, @@ -85,7 +106,12 @@ class _ExperimentalConfig: l2_cache=self._l2_cache, record_op_args=self.record_op_args, msprof_tx=self._msprof_tx, 
- op_attr=self._op_attr) + op_attr=self._op_attr, + host_sys=self._host_sys, + mstx_domain_include=self._mstx_domain_include, + mstx_domain_exclude=self._mstx_domain_exclude, + sys_io=self._sys_io, + sys_interconnection=self._sys_interconnection) @property def export_type(self): @@ -163,3 +189,53 @@ self._gc_detect_threshold = None elif self._gc_detect_threshold == 0.0: print_info_msg("Parameter gc_detect_threshold is set to 0, it will collect all gc events.") + if not isinstance(self._sys_io, bool): + print_warn_msg("Invalid parameter sys_io, which must be of boolean type, reset it to False.") + self._sys_io = False + if not isinstance(self._sys_interconnection, bool): + print_warn_msg("Invalid parameter sys_interconnection, which must be of boolean type, reset it to False.") + self._sys_interconnection = False + + def _check_mstx_domain_params(self): + if not self._msprof_tx: + if self._mstx_domain_include or self._mstx_domain_exclude: + print_warn_msg("mstx_domain_include and mstx_domain_exclude are only valid when msprof_tx is True.") + self._mstx_domain_include = [] + self._mstx_domain_exclude = [] + return + if self._mstx_domain_include: + if not isinstance(self._mstx_domain_include, list): + print_warn_msg("Invalid parameter mstx_domain_include, which must be of list type, " \ + "reset it to default.") + self._mstx_domain_include = [] + if any(not isinstance(domain, str) for domain in self._mstx_domain_include): + print_warn_msg("Invalid parameter mstx_domain_include, whose contents must be of str type, " \ + "reset it to default.") + self._mstx_domain_include = [] + else: + self._mstx_domain_include = list(set(self._mstx_domain_include)) + if self._mstx_domain_exclude: + if not isinstance(self._mstx_domain_exclude, list): + print_warn_msg("Invalid parameter mstx_domain_exclude, which must be of list type, " \ + "reset it to default.") + self._mstx_domain_exclude = [] + if any(not isinstance(domain, str) for domain in self._mstx_domain_exclude): + print_warn_msg("Invalid parameter mstx_domain_exclude, whose contents must be of str type, " \ + "reset it to default.") + self._mstx_domain_exclude = [] + else: + self._mstx_domain_exclude = list(set(self._mstx_domain_exclude)) + if self._mstx_domain_include and self._mstx_domain_exclude: + print_warn_msg("Parameters mstx_domain_include and mstx_domain_exclude cannot both be set, " \ + "only mstx_domain_include will take effect.") + self._mstx_domain_exclude = [] + + def _check_host_sys_params(self): + if not isinstance(self._host_sys, list): + print_warn_msg("Invalid parameter host_sys, which must be of list type, reset it to empty.") + self._host_sys = [] + if not all(host_sys in [HostSystem.CPU, HostSystem.MEM, HostSystem.DISK, HostSystem.NETWORK, HostSystem.OSRT] + for host_sys in self._host_sys): + print_warn_msg("Invalid parameter host_sys, reset it to empty.") + self._host_sys = [] + self._host_sys = list(set(str(item) for item in self._host_sys)) \ No newline at end of file diff --git a/torch_npu/profiler/profiler.py b/torch_npu/profiler/profiler.py index 6bc2d80a22ef4527993c62ef0b0a6b7410230fb7..bfcf6edeca7e35226e4ea2f3e90f523dd11f5a06 100644 --- a/torch_npu/profiler/profiler.py +++ b/torch_npu/profiler/profiler.py @@ -1,7 +1,7 @@ import os.path import json from sys import getsizeof -from typing import Optional, Iterable, Callable, Any +from typing import Optional, Iterable, Callable, Any, Union import torch.autograd.profiler as prof import torch_npu.npu @@ -131,7 +131,6 @@ class _KinetoProfile: def 
export_memory_timeline(self, output_path: str, device: Optional[str] = None) -> None: if device is None: device = "npu:0" if torch_npu.npu.is_available() else "cpu" - missing = [] if not self.prof_if.record_shapes: missing.append("record_shapes=True") @@ -279,11 +278,24 @@ class profile(_KinetoProfile): @no_exception_func() -def analyse(profiler_path: str, max_process_number: int = Constant.DEFAULT_PROCESS_NUMBER): +def analyse(profiler_path: str, max_process_number: int = Constant.DEFAULT_PROCESS_NUMBER, + export_type: Union[str, list] = None): if not isinstance(max_process_number, int) or max_process_number <= 0: max_process_number = Constant.DEFAULT_PROCESS_NUMBER print_warn_msg("Invalid max_process_number, reset it to default!") if max_process_number > os.cpu_count(): max_process_number = os.cpu_count() print_warn_msg("max_process_number exceeds the number of cpu cores, reset it to the number of cpu cores!") - NpuProfiler.analyse(profiler_path, max_process_number=max_process_number) + if export_type is not None: + if isinstance(export_type, str): + export_type = [export_type] + elif isinstance(export_type, list): + export_type = list(set(export_type)) + else: + print_warn_msg(f"Invalid parameter export_type: {export_type}, reset it to None.") + export_type = None + if export_type is not None: + if not export_type or not all(_type in [Constant.Text, Constant.Db] for _type in export_type): + print_warn_msg(f"Invalid parameter export_type: {export_type}, reset it to None.") + export_type = None + NpuProfiler.analyse(profiler_path, max_process_number=max_process_number, export_type=export_type) diff --git a/torch_npu/utils/__init__.py b/torch_npu/utils/__init__.py index 8c33a26e4b5fb4cac2fd4a9cb102a2656b6a61ff..200325e8219446b2f8aa0a2f504f99b5b65694f9 100644 --- a/torch_npu/utils/__init__.py +++ b/torch_npu/utils/__init__.py @@ -1,3 +1,6 @@ +__all__ = ["save_async", "npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter", + "set_thread_affinity", "reset_thread_affinity"] + from torch_npu import _C from ._module import _apply_module_patch from .tensor_methods import _add_tensor_methods @@ -15,8 +18,8 @@ from .utils import _print_error_log, _print_warn_log, _print_info_log, _should_p from .clip_grad_norm_ import _apply_clip_grad_norm_patch from ._step import add_perf_dump_patch from .flops_count import _FlopsCounter as FlopsCounter - -__all__ = ["save_async", "npu_combine_tensors", "get_part_combined_tensor", "is_combined_tensor_valid", "FlopsCounter"] +from .affinity import _set_thread_affinity as set_thread_affinity +from .affinity import _reset_thread_affinity as reset_thread_affinity # init flopcount diff --git a/torch_npu/utils/_error_code.py b/torch_npu/utils/_error_code.py index 1057df814ed96ecc2b89c05ae79cf85d040fd57a..068d0890a80bd42752077f4883eec81d7a17c381 100644 --- a/torch_npu/utils/_error_code.py +++ b/torch_npu/utils/_error_code.py @@ -57,8 +57,9 @@ def _format_error_msg(submodule, error_code): return rank except Exception: return -1 - - error_msg = "\n[ERROR] {time} (PID:{pid}, Device:{device}, RankID:{rank}) {error_code} {submodule_name} {error_code_msg}" + error_msg = "" + if not get_env_compact_error_output(): + error_msg += "\n[ERROR] {time} (PID:{pid}, Device:{device}, RankID:{rank}) {error_code} {submodule_name} {error_code_msg}" return error_msg.format( time=time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()), @@ -90,6 +91,10 @@ def prof_error(error: ErrCode) -> str: return _format_error_msg(_SubModuleID.PROF, error) +def 
get_env_compact_error_output(): + return int(os.getenv("TORCH_NPU_COMPACT_ERROR_OUTPUT", "0")) + + class _NPUExceptionHandler(object): def __init__(self): self.exception = None @@ -123,7 +128,7 @@ class _NPUExceptionHandler(object): if self.force_stop_flag: raise RuntimeError("FORCE STOP." + pta_error(ErrCode.ACL)) if self._is_exception(self.npu_exception): - if self._is_exception(self.npu_timeout_exception): + if self._is_exception(self.npu_timeout_exception) or get_env_compact_error_output(): # if npu timeout, let other processes exit properly before elastic agent kills them. time.sleep(self.npu_timeout_exit_offset) else: diff --git a/torch_npu/utils/_module.py b/torch_npu/utils/_module.py index 6f701fa4d85f66c6955c92b4196743780d9f1afc..313736c3a176de58c81ea2fad222475c78b99a5a 100644 --- a/torch_npu/utils/_module.py +++ b/torch_npu/utils/_module.py @@ -17,6 +17,7 @@ from torch.nn.modules.batchnorm import _NormBase, _LazyNormBase from torch.nn.modules.module import Module from torch.nn.parallel._functions import _streams from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter +from torch.utils.data._utils import worker, pin_memory from torch._utils import _get_device_index, _get_all_device_indices, _get_available_device_type, ExceptionWrapper from torch.nn.parallel.parallel_apply import get_a_var from torch.nn.parallel.scatter_gather import gather, scatter_kwargs @@ -365,11 +366,11 @@ def _ddp_init_helper( def _mpdl_iter_init(self, *args, **kwargs): try: torch_npu.npu.synchronize() - except: - pass - torch_npu._C._npu_reset_threads_affinity() + except Exception as e: + print(e) + torch_npu._C._npu_set_thread_affinity(-1, -1) origin_mpdl_iter_init(self, *args, **kwargs) - torch_npu._C._npu_set_threads_affinity() + torch_npu._C._npu_reset_thread_affinity() def _parallel_apply( @@ -519,6 +520,6 @@ def _apply_module_patch(): torch.nn.Module.cast_weight = cast_weight torch.nn.modules.rnn.LSTM.forward = _lstm_forward torch.nn.modules.batchnorm.SyncBatchNorm.forward = _syncbn_forward - torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init torch.nn.parallel.DataParallel.parallel_apply = npu_parallel_apply torch.nn.parallel.data_parallel = npu_data_parallel + torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__init__ = _mpdl_iter_init diff --git a/torch_npu/utils/_step.py b/torch_npu/utils/_step.py index 5ec69c05ad2577de5adedd754980d80df6bc740c..b81d26bced88a07860926bc58abd27dd779566fe 100644 --- a/torch_npu/utils/_step.py +++ b/torch_npu/utils/_step.py @@ -2,6 +2,7 @@ import os import stat import logging from logging.handlers import RotatingFileHandler +from functools import wraps import uuid import time import glob @@ -11,12 +12,13 @@ from torch.nn import Module import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error -from torch_npu.asd.asd import _silent_fault_detector_v2, _silent_fault_detector_v3 +from torch_npu.asd.asd import _silent_check_decorator, silent_check, _matmul_silent_check_decorator, matmul_check original_call = Module.__call__ DEFAULT_FALGS = os.O_WRONLY | os.O_CREAT | os.O_TRUNC DEFAULT_PERMISSION = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP +loggerSilent = logging.getLogger("torch_npu.silent_check") class PerfDumpState: @@ -44,111 +46,6 @@ class PerfDumpState: perf_dump_state = PerfDumpState() perf_dump_enable = False -IS_IN_BACKWARD = False -loggerSilent = logging.getLogger("torch_npu.silent_check") - - -def input_hook(idx, asd_flag): - def hook(grad): - global IS_IN_BACKWARD - 
loggerSilent.debug(f"input_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to False. idx is {idx}, flag is {asd_flag}") - IS_IN_BACKWARD = False - torch_npu._C._npu_set_call_state("forward") - if torch_npu._C._get_silent_check_version() == 3: - _silent_fault_detector_v3.silent_fault_check(idx, asd_flag, grad) - else: - _silent_fault_detector_v2.silent_fault_check(idx, asd_flag, grad) - return - return hook - - -def output_hook(grad): - global IS_IN_BACKWARD - loggerSilent.debug(f"output_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to True.") - IS_IN_BACKWARD = True - torch_npu._C._npu_set_call_state("backward") - return grad - - -def _is_inner_module(module): - return len(module._modules) == 0 - - -class SilentCheckState: - def __init__(self): - self.init_param() - self.init_marks = {} - self.weight_hook_handles = {} - self.last_weight_hook_handles = {} - self.dtype_support = True - - def init_param(self): - self.first_forward = True - self.input_hook_flag = False - self.is_training = False - self.first_module_id = "" - self.first_weight = None - self.first_weight_id = None - self.last_weight = None - self.last_weight_id = None - - def init_module_info(self, module_id, training): - self.first_module_id = module_id - self.first_forward = False - self.is_training = training - if self.is_training: - torch_npu._C._npu_set_module_train_state("train") - else: - torch_npu._C._npu_set_module_train_state("infer") - - def check_tensor_dtype(self, tensor): - if not self.dtype_support: - return - if isinstance(tensor, torch.Tensor) and tensor.requires_grad and tensor.dtype == torch.float16: - self.dtype_support = False - - def check_dtype(self, module, *args): - for x in args: - self.check_tensor_dtype(x) - for param_name, param in module._parameters.items(): - self.check_tensor_dtype(param) - - def search_first_weight(self, module): - # Search the first weight - if not self.init_marks.get(self.first_module_id, False) and self.first_weight is None: - for param_name, param in module._parameters.items(): - if isinstance(param, torch.Tensor) and param.requires_grad: - self.first_weight = param - self.first_weight_id = id(param) - break - - def search_last_weight(self, module): - # Search the last weight (only in inner module) - if not self.init_marks.get(self.first_module_id, False) and _is_inner_module(module): - for param_name, param in module._parameters.items(): - if isinstance(param, torch.Tensor) and param.requires_grad: - self.last_weight = param - self.last_weight_id = id(param) - - def init_all_hook(self, asd_flag): - if self.is_training: - if self.last_weight is not None and self.first_weight is not None: - # Otherwise, there is only one weight in the outer module - if self.first_weight_id != self.last_weight_id: - loggerSilent.debug(f"init_all_hook: module init, first_module_id is {self.first_module_id}.") - if self.last_weight_hook_handles.get(self.first_module_id, None) is None: - last_weight_handle = self.last_weight.register_hook(output_hook) - self.last_weight_hook_handles[self.first_module_id] = last_weight_handle - if self.weight_hook_handles.get(self.first_module_id, None) is None: - first_weight_handle = self.first_weight.register_hook(input_hook(self.first_module_id, asd_flag)) - self.weight_hook_handles[self.first_module_id] = first_weight_handle - else: - loggerSilent.debug(f"init_all_hook: module only have one weight, first_module_id is {self.first_module_id}.") - self.init_marks[self.first_module_id] = True - - -silent_check = SilentCheckState() -asd_enable = 0 
class CustomRotatingFileHandler(RotatingFileHandler): @@ -222,90 +119,65 @@ def _setup_logger(name, path): logger.propagate = False -def _custom_call(self, *args, **kwargs): - global perf_dump_enable - global perf_dump_state - - global asd_enable - global silent_check - global IS_IN_BACKWARD - - if not torch.npu.is_initialized(): - return original_call(self, *args, **kwargs) - - if perf_dump_enable: - if not perf_dump_state.has_log: - perf_dump_path = _get_perf_dump_path() - pid = os.getpid() - device_id = torch_npu.npu.current_device() - delete_pref_pt_logs(perf_dump_path, device_id) - perf_dump_state.local_uuid = uuid.uuid4() - perf_dump_state.uuid = _get_uuid() - perf_dump_state.log_file_name = os.path.join(perf_dump_path, f"perf_pt_{pid}_{device_id}.log") - _setup_logger("perf_logger", perf_dump_state.log_file_name) - logger = logging.getLogger("perf_logger") - logger.info(f"[LOCALUUID]:{perf_dump_state.local_uuid}") - logger.info("[FRAMEWORK]:PyTorch") - logger.info(f"[UUID]:{perf_dump_state.uuid}") - os.chmod(perf_dump_state.log_file_name, DEFAULT_PERMISSION) - perf_dump_state.has_log = True - - if perf_dump_state.is_outer_call: - if not perf_dump_state.is_child_module(self) and not _is_loss_module(self): - current_time = int(time.time() * 1000) +def _perf_dump_decorator(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + global perf_dump_enable + global perf_dump_state + + if not torch.npu.is_initialized(): + return func(self, *args, **kwargs) + + if perf_dump_enable: + if not perf_dump_state.has_log: + perf_dump_path = _get_perf_dump_path() + pid = os.getpid() + device_id = torch_npu.npu.current_device() + delete_pref_pt_logs(perf_dump_path, device_id) + perf_dump_state.local_uuid = uuid.uuid4() + perf_dump_state.uuid = _get_uuid() + perf_dump_state.log_file_name = os.path.join(perf_dump_path, f"perf_pt_{pid}_{device_id}.log") + _setup_logger("perf_logger", perf_dump_state.log_file_name) logger = logging.getLogger("perf_logger") - if perf_dump_state.last_time is not None: - logger.info(f"[STEPTIME]:{perf_dump_state.last_time},{current_time}") - perf_dump_state.last_time = current_time - perf_dump_state.add_module_dict(self) - perf_dump_state.is_outer_call = False - self.visited = True - - if asd_enable and not IS_IN_BACKWARD: - if silent_check.first_forward: - silent_check.init_module_info(id(self), self.training) - self.outer = True - - if silent_check.is_training and not silent_check.init_marks.get(silent_check.first_module_id, False): - silent_check.check_dtype(self, *args) - if not silent_check.dtype_support: - for value in silent_check.weight_hook_handles.values(): - if value is not None: - value.remove() - for value in silent_check.last_weight_hook_handles.values(): - if value is not None: - value.remove() - asd_enable = 0 - warnings.warn(f"Warning: Module has unsupported dtype tensor, silent check will be closed.") - - tmp = original_call(self, *args, **kwargs) - - if asd_enable and silent_check.is_training and not IS_IN_BACKWARD: - # Search the first weight - silent_check.search_first_weight(self) - - # Search the last weight (only in inner module) - silent_check.search_last_weight(self) - - if perf_dump_enable: - if hasattr(self, "visited") and self.visited: - perf_dump_state.is_outer_call = True - self.visited = False - - if asd_enable and not IS_IN_BACKWARD: - if hasattr(self, "outer") and self.outer: - silent_check.init_all_hook(asd_enable) - silent_check.init_param() - self.outer = False - - return tmp - - -def _parse_perf_config(): - perf_dump_config = 
os.getenv("PERF_DUMP_CONFIG") + logger.info(f"[LOCALUUID]:{perf_dump_state.local_uuid}") + logger.info("[FRAMEWORK]:PyTorch") + logger.info(f"[UUID]:{perf_dump_state.uuid}") + os.chmod(perf_dump_state.log_file_name, DEFAULT_PERMISSION) + perf_dump_state.has_log = True + + if perf_dump_state.is_outer_call: + if not perf_dump_state.is_child_module(self) and not _is_loss_module(self): + current_time = int(time.time() * 1000) + logger = logging.getLogger("perf_logger") + if perf_dump_state.last_time is not None: + logger.info(f"[STEPTIME]:{perf_dump_state.last_time},{current_time}") + perf_dump_state.last_time = current_time + perf_dump_state.add_module_dict(self) + perf_dump_state.is_outer_call = False + self.visited = True + + tmp = func(self, *args, **kwargs) + + if perf_dump_enable: + if hasattr(self, "visited") and self.visited: + perf_dump_state.is_outer_call = True + self.visited = False + + return tmp + return wrapper + + +@_perf_dump_decorator +@_silent_check_decorator +@_matmul_silent_check_decorator +def _custom_call(self, *args, **kwargs): + return original_call(self, *args, **kwargs) + + +def _parse_config(config): config_dict = {} - if perf_dump_config: - config_items = perf_dump_config.split(',') + if config: + config_items = config.split(',') for item in config_items: key_value = item.split(':') if len(key_value) == 2: @@ -314,29 +186,104 @@ def _parse_perf_config(): return config_dict +def _prase_asd_config(asd_config): + # checksum + with_checksum_str = asd_config.get("with_checksum", "false") + if with_checksum_str not in ["true", "false"]: + raise ValueError("NPU_ASD_CONFIG-with_checksum should be true or false. For details, 0 as `with checksum closed`, 1 as `with checksum opened`." + pta_error(ErrCode.VALUE)) + with_checksum = with_checksum_str == "true" + matmul_check.set_with_checksum(with_checksum) + + # cooldown + cooldown = asd_config.get("cooldown", "5") + if cooldown.isdigit() and cooldown != "0": + matmul_check.set_cooldown(int(cooldown)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-cooldown is invalid, use the default value of 5.") + + # strikes_num + strikes_num = asd_config.get("strikes_num", "3") + if strikes_num.isdigit() and strikes_num != "0": + matmul_check.set_strikes_num(int(strikes_num)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-strikes_num is invalid, use the default value of 3.") + + # strikes_window + strikes_window = asd_config.get("strikes_window", "480") + if strikes_window.isdigit() and strikes_window != "0": + matmul_check.set_strikes_window(int(strikes_window)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-strikes_window is invalid, use the default value of 480.") + + # checksum_cooldown + checksum_cooldown = asd_config.get("checksum_cooldown", "180") + if checksum_cooldown.isdigit() and checksum_cooldown != "0": + matmul_check.set_checksum_cooldown(int(checksum_cooldown)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-checksum_cooldown is invalid, use the default value of 180.") + + # upper_thresh1 + upper_thresh1 = asd_config.get("upper_thresh1", "1000000") + if upper_thresh1.isdigit() and int(upper_thresh1) >= 3: + matmul_check.set_upper_thresh1(int(upper_thresh1)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-upper_thresh1 is invalid, use the default value of 1000000.") + + # upper_thresh2 + upper_thresh2 = asd_config.get("upper_thresh2", "100") + if upper_thresh2.isdigit() and int(upper_thresh2) >= 3: + matmul_check.set_upper_thresh2(int(upper_thresh2)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-upper_thresh2 
is invalid, use the default value of 100.") + + # grad_sample_interval + grad_sample_interval = asd_config.get("grad_sample_interval", "3") + if grad_sample_interval.isdigit() and grad_sample_interval != "0": + matmul_check.set_grad_sample_interval(int(grad_sample_interval)) + else: + warnings.warn(f"Warning: NPU_ASD_CONFIG-grad_sample_interval is invalid, use the default value of 3.") + + def add_perf_dump_patch(): global perf_dump_enable - global asd_enable - config_dict = _parse_perf_config() + perf_dump_config = os.getenv("PERF_DUMP_CONFIG") + config_dict = _parse_config(perf_dump_config) enable_value = config_dict.get("enable", "false") perf_dump_enable = enable_value.lower() == "true" - asd_value = os.getenv("NPU_ASD_ENABLE", "0") - if asd_value not in ["0", "1", "2", "3"]: - raise ValueError("NPU_ASD_ENABLE should be 0, 1, 2 or 3. For details, 0 as `ASD closed`, " - "1 as `ASD opened, print error logs` " - "2 as `ASD opened, print error logs and raise exception`, " - "3 as `ASD opened, print debug logs and raise exception`" + pta_error(ErrCode.VALUE)) - asd_enable = int(asd_value) - if asd_enable: + asd_enable = 0 + asd_config = os.getenv("NPU_ASD_CONFIG", None) + if asd_config is not None: + asd_config_dict = _parse_config(asd_config) + asd_config_enable = asd_config_dict.get("enable", "false") + if asd_config_enable not in ["true", "false"]: + raise ValueError("NPU_ASD_CONFIG-enable should be true or false. For details, false as `ASD closed`, true as `ASD opened`." + pta_error(ErrCode.VALUE)) + if asd_config_enable == "true": + warnings.warn(f'Silent data corruption check may take up 1.5GB device memory, please make sure there are enough free space in device') + _prase_asd_config(asd_config_dict) + asd_enable = 1 + matmul_check.set_matmul_hook_enable(asd_enable) + loggerSilent.info(f"Silent check 3.0 version will be enabled. The checksum enable is {matmul_check.get_with_checksum()}, " + f"cooldown is {matmul_check.get_cooldown()}, strikes_num is {matmul_check.get_strikes_num()}, strikes_window is {matmul_check.get_strikes_window()}, " + f"checksum_cooldown is {matmul_check.get_checksum_cooldown()}, upper_thresh1 is {matmul_check.get_upper_thresh1()}, " + f"upper_thresh2 is {matmul_check.get_upper_thresh2()}. grad_sample_interval is {matmul_check.get_grad_sample_interval()}.") + else: + asd_value = os.getenv("NPU_ASD_ENABLE", "0") if torch_npu._C._get_silent_check_version() == 1: - warnings.warn(f"Warning: CANN version lower than 8.0.RC3 and currently does not support silent check 2.0 version or later. It will switch to 1.0 version.") - asd_enable = 0 - elif torch_npu._C._get_silent_check_version() == 2: - warnings.warn(f"Warning: CANN version lower than 8.0.0 and currently does not support silent check 3.0 version. It will switch to 2.0 version. The asd_detect is {asd_enable}") + if asd_value == "1": + warnings.warn(f"Warning: CANN version lower than 8.0.RC3 and currently does not support silent check 2.0 version or later. It will switch to 1.0 version.") else: - loggerSilent.debug(f"Silent check 3.0 version will be enabled. The asd_detect is {asd_enable}") + if asd_value not in ["0", "1", "2", "3"]: + raise ValueError("NPU_ASD_ENABLE should be 0, 1, 2 or 3. 
For details, 0 as `ASD closed`, " + "1 as `ASD opened, print error logs`, " + "2 as `ASD opened, print error logs and raise exception`, " + "3 as `ASD opened, print debug logs and raise exception`" + pta_error(ErrCode.VALUE)) + asd_enable = int(asd_value) + if asd_enable: + warnings.warn(f"Warning: Silent check 2.0 version will be enabled. The asd_detect is {asd_enable}. It is recommended to enable silent check v3 using the NPU_ASD_CONFIG.\n" + "Silent data corruption check may take up 1.5GB device memory, please make sure there are enough free space in device. ") + silent_check.set_check_enable(asd_enable) if perf_dump_enable or asd_enable: Module.__call__ = _custom_call diff --git a/torch_npu/utils/affinity.py b/torch_npu/utils/affinity.py new file mode 100644 index 0000000000000000000000000000000000000000..37973f5bc79bc81af684a286603bb75e2c734332 --- /dev/null +++ b/torch_npu/utils/affinity.py @@ -0,0 +1,21 @@ +__all__ = [] + +from typing import List + +import torch_npu +from torch_npu.utils._error_code import ErrCode, pta_error + + +def _set_thread_affinity(core_range: List[int] = None): + if core_range is None: + torch_npu._C._npu_set_thread_affinity(-1, -1) + elif (len(core_range) == 2): + if core_range[0] < 0 or core_range[1] < 0: + raise ValueError("Core range should be nonnegative." + pta_error(ErrCode.PARAM)) + torch_npu._C._npu_set_thread_affinity(core_range[0], core_range[1]) + else: + raise ValueError("The length of input list of set_thread_affinity should be 2." + pta_error(ErrCode.PARAM)) + + +def _reset_thread_affinity(): + torch_npu._C._npu_reset_thread_affinity() \ No newline at end of file diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index f967fdc3a8cb8c71d7cd2dbb08ed3bdc21526090..37f0789431da0fca92ef5f39ec916c7a83c2b10a 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -1,14 +1,16 @@ import io import os +import sys import pickle +import tarfile import threading -from typing import Any, Optional +from typing import Dict, Any, Optional import torch from torch.serialization import _check_dill_version, _open_file_like, _is_zipfile, \ _open_zipfile_reader, _is_torchscript_zip, _weights_only_unpickler, \ _legacy_load, _load, FILE_LIKE, MAP_LOCATION, DEFAULT_PROTOCOL, \ - normalize_storage_type, location_tag, _open_zipfile_writer + normalize_storage_type, location_tag, _open_zipfile_writer, _check_seekable, closing, _should_read_directly import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error @@ -191,19 +193,34 @@ def load( overall_storage=overall_storage, **pickle_load_args) else: if mmap: - raise RuntimeError("mmap can only be used with files saved with `torch.save(_use_new_zipfile_serialization=True), ", + raise RuntimeError("mmap can only be used with files saved with " + "`torch.save(_use_new_zipfile_serialization=True), " "please torch.save your checkpoint with this option in order to use mmap." + pta_error(ErrCode.PARAM)) if weights_only: + _check_seekable(opened_file) + f_should_read_directly = _should_read_directly(opened_file) + if f_should_read_directly and opened_file.tell() == 0: + try: + with closing(tarfile.open(fileobj=opened_file, mode="r:", format=tarfile.PAX_FORMAT)): + raise pickle.UnpicklingError( + UNSAFE_MESSAGE + + "Cannot use ``weights_only=True`` with files saved in the legacy .tar format." 
+ + pta_error(ErrCode.NOT_SUPPORT) + ) from None + except tarfile.TarError: + # ignore TarError and pass opened_file to torch._legacy_load + opened_file.seek(0) try: return _legacy_load(opened_file, map_location, _weights_only_unpickler, **pickle_load_args) except RuntimeError as e: raise pickle.UnpicklingError(UNSAFE_MESSAGE + str(e) + pta_error(ErrCode.SYSCALL)) from None warn_massage = ( - "Warning: since the loaded file is not a zipfile, only \"torch.device\" and \"str\" type parameters are currently supported for parameter types of map_location" - "If parameter types of map_location is \"Callable[[torch.Tensor, str], torch.Tensor]\" or \"Dict[str, str]\", which is only support for zipfile," - "all tensors are currently loaded onto the CPU, which may introduce problems" + "Warning: since the loaded file is not a zipfile, only \"torch.device\" and \"str\" type parameters " + "are currently supported for parameter types of map_location. If parameter types of map_location is " + "\"Callable[[torch.Tensor, str], torch.Tensor]\" or \"Dict[str, str]\", which is only support for " + "zipfile, all tensors are currently loaded onto the CPU, which may introduce problems" ) _warn_legacy_serialization(warn_massage, "load") @@ -218,28 +235,90 @@ def load( return _legacy_load(opened_file, "cpu", pickle_module, **pickle_load_args) -def _get_npu_save_result( - obj: object, - f: FILE_LIKE, - pickle_module: Any = pickle, - pickle_protocol: int = DEFAULT_PROTOCOL, - _use_new_zipfile_serialization: bool = True, - _disable_byteorder_record: bool = False -) -> None: - cpu_nbytes = torch.storage.UntypedStorage.nbytes +def _npu_save(obj, zip_file, pickle_module, pickle_protocol, _disable_byteorder_record): + serialized_storages = {} + id_map: Dict[int, str] = {} - def npu_nbytes(self): - if self.device.type != 'cpu': - storage_tensor = torch_npu._C._tensor_construct_from_storage(self) - base_nbytes = storage_tensor.size().numel() * storage_tensor.element_size() - return base_nbytes - else: - return cpu_nbytes(self) + # Since loading storages that view the same data with different dtypes is + # not supported, we need to keep track of the dtype associated with each + # storage data_ptr and throw an error if the dtype is ever different. + storage_dtypes: Dict[int, torch.dtype] = {} - torch.storage.UntypedStorage.nbytes = npu_nbytes - result = torch.serialization.save(obj, f, pickle_module, pickle_protocol, True, _disable_byteorder_record) - torch.storage.UntypedStorage.nbytes = cpu_nbytes - return result + def persistent_id(obj): + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + + if isinstance(obj, torch.storage.TypedStorage): + storage = obj._untyped_storage + storage_dtype = obj.dtype + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() // obj._element_size() + else: + storage_numel = obj._size() + + else: + storage = obj + storage_dtype = torch.uint8 + storage_type = normalize_storage_type(type(obj)) + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() + else: + storage_numel = storage.nbytes() + + # If storage is allocated, ensure that any other saved storages + # pointing to the same data all have the same dtype. 
If storage is + # not allocated, don't perform this check + if storage.data_ptr() != 0: + if storage.data_ptr() in storage_dtypes: + if storage_dtype != storage_dtypes[storage.data_ptr()]: + raise RuntimeError( + 'Cannot save multiple tensors or storages that ' + 'view the same data as different types') + else: + storage_dtypes[storage.data_ptr()] = storage_dtype + + storage_key = id_map.setdefault(storage._cdata, str(len(id_map))) + location = location_tag(storage) + serialized_storages[storage_key] = storage + + return ('storage', + storage_type, + storage_key, + location, + storage_numel) + + return None + + # Write the pickle data for `obj` + data_buf = io.BytesIO() + pickler = pickle_module.Pickler(data_buf, protocol=pickle_protocol) + pickler.persistent_id = persistent_id + pickler.dump(obj) + data_value = data_buf.getvalue() + zip_file.write_record('data.pkl', data_value, len(data_value)) + + # Write byte order marker + if not _disable_byteorder_record: + if sys.byteorder not in ['little', 'big']: + raise ValueError('Unknown endianness type: ' + sys.byteorder) + + zip_file.write_record('byteorder', sys.byteorder, len(sys.byteorder)) + + # Write each tensor to a file named tensor/the_tensor_key in the zip archive + for key in sorted(serialized_storages.keys()): + name = f'data/{key}' + storage = serialized_storages[key] + # given that we copy things around anyway, we might use storage.cpu() + # this means to that to get tensors serialized, you need to implement + # .cpu() on the underlying Storage + if storage.device.type != 'cpu': + storage = storage.cpu() + # Now that it is on the CPU we can directly copy it into the zip file + num_bytes = storage.nbytes() + zip_file.write_record(name, storage.data_ptr(), num_bytes) def save( @@ -256,7 +335,7 @@ def save( "if it is necessary to use this, please convert the npu tensor to cpu tensor for saving" ) _warn_legacy_serialization(warn_massage, "save") - return _get_npu_save_result(obj, f, pickle_module, pickle_protocol, True, _disable_byteorder_record) + return torch.serialization.save(obj, f, pickle_module, pickle_protocol, True, _disable_byteorder_record) def save_async( @@ -313,7 +392,11 @@ def _save_data_thread(save_args, if storage.device.type != 'cpu': storage = storage.cpu() # Now that it is on the CPU we can directly copy it into the zip file - num_bytes = storage.nbytes() + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + num_bytes = storage_tensor.size().numel() * storage_tensor.element_size() + else: + num_bytes = storage.nbytes() storage_value.append((name, storage, num_bytes)) with _open_zipfile_writer(f) as opened_zipfile: @@ -340,13 +423,21 @@ def _save(obj, pickle_module, pickle_protocol): storage_dtype = obj.dtype storage_type_str = obj._pickle_storage_type() storage_type = getattr(torch, storage_type_str) - storage_numel = obj._size() + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() // obj._element_size() + else: + storage_numel = obj._size() else: storage = obj storage_dtype = torch.uint8 storage_type = normalize_storage_type(type(obj)) - storage_numel = storage.nbytes() + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() + else: + storage_numel = storage.nbytes() # If storage is allocated, ensure 
that any other saved storages # pointing to the same data all have the same dtype. If storage is @@ -390,3 +481,4 @@ def _save(obj, pickle_module, pickle_protocol): def _add_serialization_methods(): torch.save = save torch.load = load + torch.serialization._save = _npu_save
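
The hunks above add several user-facing switches to the profiler's `_ExperimentalConfig`: `host_sys` with the new `HostSystem` constants, `mstx_domain_include`/`mstx_domain_exclude` (only honoured when `msprof_tx` is enabled, and mutually exclusive with each other), plus the `sys_io` and `sys_interconnection` booleans. The sketch below is one way to exercise them, not an official sample; the import path for `HostSystem`/`ExportType`, the output directory, the `"default"` domain name and the toy workload are assumptions, not part of this patch.

```python
# Minimal sketch, assuming the classes are imported from the module that defines
# them (torch_npu/profiler/experimental_config.py). Paths and the workload are
# placeholders.
import torch
import torch_npu
from torch_npu.profiler import profile, ProfilerActivity, tensorboard_trace_handler
from torch_npu.profiler.experimental_config import (
    _ExperimentalConfig, ExportType, HostSystem,
)

experimental_config = _ExperimentalConfig(
    msprof_tx=True,                               # mstx_domain_* filters are ignored unless this is True
    mstx_domain_include=["default"],              # placeholder domain; include and exclude cannot both be set
    host_sys=[HostSystem.CPU, HostSystem.MEM],    # host-side CPU and memory sampling
    sys_io=True,                                  # host I/O counters
    sys_interconnection=True,                     # interconnection counters
    export_type=[ExportType.Text, ExportType.Db],
)

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.NPU],
    on_trace_ready=tensorboard_trace_handler("./profiling_result"),
    experimental_config=experimental_config,
) as prof:
    for _ in range(3):
        x = torch.randn(1024, 1024).npu()  # toy workload standing in for a training step
        _ = x @ x
        prof.step()
```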
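
`analyse()` in `torch_npu/profiler/profiler.py` now accepts an `export_type` override for offline parsing; a single string or a list is accepted, and values outside `Text`/`Db` are reset to `None` with a warning. A hedged usage sketch, assuming the module-level import path shown and a placeholder result directory:

```python
# Minimal sketch of offline re-parsing with the new export_type argument.
# "./profiling_result" stands in for a directory produced by an earlier run.
from torch_npu.profiler.profiler import analyse
from torch_npu.profiler.experimental_config import ExportType

analyse("./profiling_result", max_process_number=4, export_type=ExportType.Db)
```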
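
`torch_npu/utils/__init__.py` now re-exports `set_thread_affinity` and `reset_thread_affinity` from the new `affinity.py`. The helper accepts either no argument (which forwards `(-1, -1)` to the backend, i.e. the default binding) or a two-element list of nonnegative core indices, and raises `ValueError` otherwise. A minimal sketch with an illustrative core range; which host threads the backend actually re-binds is backend-defined:

```python
# Minimal sketch of the thread-affinity helpers; the core range [0, 7] is illustrative.
from torch_npu.utils import set_thread_affinity, reset_thread_affinity

set_thread_affinity([0, 7])   # constrain torch_npu host threads to cores 0-7
try:
    pass                      # latency-sensitive host-side work goes here
finally:
    reset_thread_affinity()   # restore the default binding
```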
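
Silent check v3 is now driven by the `NPU_ASD_CONFIG` environment variable (comma-separated `key:value` pairs) rather than `NPU_ASD_ENABLE`, and `_error_code.py` gains `TORCH_NPU_COMPACT_ERROR_OUTPUT`. The sketch below sets both from Python; the key/value pairs mirror the defaults handled by the parser in `_step.py` and are illustrative only. `NPU_ASD_CONFIG` is read when `torch_npu` is imported, so in practice it is usually exported in the launcher shell; `TORCH_NPU_COMPACT_ERROR_OUTPUT` is checked later, when an error message is formatted.

```python
# Minimal sketch of the environment switches introduced in this patch.
import os

# Silent check v3: must be in the environment before torch_npu is imported.
os.environ["NPU_ASD_CONFIG"] = (
    "enable:true,with_checksum:false,cooldown:5,strikes_num:3,"
    "strikes_window:480,checksum_cooldown:180,"
    "upper_thresh1:1000000,upper_thresh2:100,grad_sample_interval:3"
)
# Nonzero: drop the "[ERROR] <time> (PID/Device/RankID)" prefix from torch_npu
# error messages and take the delayed, quiet-exit path on NPU exceptions.
os.environ["TORCH_NPU_COMPACT_ERROR_OUTPUT"] = "1"

import torch_npu  # the silent-check configuration takes effect during this import
```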