From dbf1af3074145993a47c81472c50f4dc8e2a6c62 Mon Sep 17 00:00:00 2001
From: mei-feiyao
Date: Tue, 23 Jan 2024 11:44:04 +0800
Subject: [PATCH 01/62] add bind_core.py

---
 profiler/affinity_cpu_bind/README.md    |  34 ++++
 profiler/affinity_cpu_bind/bind_core.py | 227 ++++++++++++++++++++++++
 2 files changed, 261 insertions(+)
 create mode 100644 profiler/affinity_cpu_bind/README.md
 create mode 100644 profiler/affinity_cpu_bind/bind_core.py

diff --git a/profiler/affinity_cpu_bind/README.md b/profiler/affinity_cpu_bind/README.md
new file mode 100644
index 0000000000..832bd555e4
--- /dev/null
+++ b/profiler/affinity_cpu_bind/README.md
@@ -0,0 +1,34 @@
+### **Ascend affinity CPU core-binding tool**
+
+### **Introduction**
+The Ascend affinity CPU core-binding tool binds NPU task processes to their affinity CPU cores without any intrusive changes to the project: running the tool directly applies the affinity binding policy and improves inference or training performance.
+
+### **Usage**
+1. Run python3 bind_core.py -app/--application="inference/train cmd" (if the command contains multiple arguments, wrap it in double quotes).
+This mode launches the task, monitors the task process, and keeps the cores bound until the task process ends.
+
+2. If the inference or training task is already running, run python3 bind_core.py. This mode repeatedly looks for task processes using NPU cards and binds their cores.
+
+3. By default the binding log is not written to disk. To save it, add the -l/--log option when running the tool, for example: python3 bind_core.py -l/--log. The log is then saved as bind_core_xxx.txt in the current directory.
+
+### **Notes**
+1. After it is launched, the script searches for processes using NPU cards, 10 s per attempt, for 5 attempts in a loop. If no process is found, it times out and exits.
+
+2. Install the pstree tool before using this tool, for example: yum install -y psmisc or apt -y install psmisc.
+
+3. Run npu-smi info -t topo manually first. If output similar to the following appears, the environment supports core binding; otherwise upgrade the driver package to Ascend HDK 23.0.RC2 or later.
+
+       NPU0    NPU1    NPU2    NPU3    NPU4    NPU5    NPU6    NPU7    CPU Affinity
+NPU0   X       HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    xx-xx
+NPU1   HCCS    X       HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    xx-xx
+NPU2   HCCS    HCCS    X       HCCS    HCCS    HCCS    HCCS    HCCS    xx-xx
+NPU3   HCCS    HCCS    HCCS    X       HCCS    HCCS    HCCS    HCCS    xx-xx
+NPU4   HCCS    HCCS    HCCS    HCCS    X       HCCS    HCCS    HCCS    xx-xx
+NPU5   HCCS    HCCS    HCCS    HCCS    HCCS    X       HCCS    HCCS    xx-xx
+NPU6   HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    X       HCCS    xx-xx
+NPU7   HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    HCCS    X       xx-xx
+
+
+
+
diff --git a/profiler/affinity_cpu_bind/bind_core.py b/profiler/affinity_cpu_bind/bind_core.py
new file mode 100644
index 0000000000..a3f6d67a8e
--- /dev/null
+++ b/profiler/affinity_cpu_bind/bind_core.py
@@ -0,0 +1,227 @@
+#! /usr/bin/python3
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
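+
+# Illustrative usage (see the README above; -t/--time, -app/--application and
+# -l/--log are the options defined in args_parse() below, and the command
+# string here is only an example):
+#   python3 bind_core.py -app "python3 train.py"   # launch the job, monitor it and bind cores
+#   python3 bind_core.py -l                        # bind processes already running on NPUs, save the log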
+ +import subprocess +import re +import argparse +from datetime import datetime +from datetime import timezone +import time + +NPU_IDS = [] +RUNNING_PIDS = {} +NPU_CPU_AFFINITY_DICT = {} +SAVE_LOG_TO_FILE = False + +# binding core log file +nowtime = datetime.now(tz=timezone.utc) +BIND_CORE_RESULT_FILE = 'bind_core_' + \ + str(nowtime.year) + '_' + \ + str(nowtime.month) + '_' + \ + str(nowtime.day) + '_' + \ + str(nowtime.hour) + '_' + \ + str(nowtime.minute) + '_' + \ + str(nowtime.second) + '.txt' + + +# print log to logfile +def print_log_to_file(msg): + global SAVE_LOG_TO_FILE + if not SAVE_LOG_TO_FILE: + return + with open(file=BIND_CORE_RESULT_FILE, mode="a", encoding="utf-8") as f: + f.write(msg + '\n') + + +# launch training or inference process +def launch_process(cmd): + global RUNNING_CMD_PID + print_log_to_file('[INFO] Start to execute cmd: {}'.format(cmd)) + subprocess.Popen(cmd.split(), shell=False) + + +# parse input cmd +def args_parse(): + global SAVE_LOG_TO_FILE + bind_wait_core_time = 0 + parser = argparse.ArgumentParser(description='This is a sample program.') + parser.add_argument('-t', '--time', type=int, metavar='', nargs='+', help='Wait time before bind cores that you want to set. The unit is \'s\'') + parser.add_argument('-app', '--application', metavar='', nargs='+', help='Training or inference command that you want to run.') + parser.add_argument('-l', '--log', default=False, action='store_true', help='Switch to save running log to local file.') + args = parser.parse_args() + if args.application: + application_cmd = ' '.join(args.application) + launch_process(application_cmd) + time.sleep(10) + if args.time: + bind_wait_core_time = int(args.time[0]) + if args.log: + SAVE_LOG_TO_FILE = True + + # if time is set, wait for setting time before bind cores + if bind_wait_core_time != 0: + time.sleep(bind_wait_core_time) + + +# get npu affinity +def get_npu_affinity() -> bool: + global NPU_CPU_AFFINITY_DICT + global NPU_IDS + + get_npu_topo_cmd = 'npu-smi info -t topo' + p = subprocess.run(get_npu_topo_cmd.split(), shell=False, capture_output=True) + res = p.stdout.decode('utf-8').strip().split() + if not res: + print('[ERROR] Failed to run get npu affinity info, please check if driver version support cmd npu-smi info -t topo') + return False + + i = 0 + for v in res: + if '-' in v: + NPU_CPU_AFFINITY_DICT[NPU_IDS[i]] = v + i += 1 + for k in NPU_CPU_AFFINITY_DICT.keys(): + print_log_to_file('[INFO] Affinity CPU list {} for NPU {}'.format(NPU_CPU_AFFINITY_DICT[k], k)) + return True + + +# get total npu id +def get_total_npu_id() -> bool: + global NPU_IDS + get_npu_info_cmd = 'npu-smi info -l' + get_npu_info_process = subprocess.run(get_npu_info_cmd.split(), shell=False, capture_output=True) + get_npu_ids_cmd = 'grep ID' + get_npu_ids_process = subprocess.run(get_npu_ids_cmd.split(), shell=False, input=get_npu_info_process.stdout, capture_output=True) + res = get_npu_ids_process.stdout.decode('utf-8').strip().split() + for i in res: + if i.isdigit(): + NPU_IDS.append(int(i)) + if not NPU_IDS: + print('[ERROR] Failed to get total NPU id list, please make sure there is NPU on this device') + return False + print_log_to_file('[INFO] NPU total id list: {}'.format(NPU_IDS)) + return True + + +# get app pid on npu +def get_pid_on_npu() -> bool: + global RUNNING_PIDS + global NPU_IDS + print_log_to_file('[INFO] Begin to find running process on all NPUs') + RUNNING_PIDS.clear() + # get process pid on NPUs, retry times : 5 + for times in range(5): + for i in NPU_IDS: + 
get_npu_pids_cmd = 'npu-smi info -t proc-mem -i {} -c 0'.format(str(i)) + p = subprocess.run(get_npu_pids_cmd.split(), shell=False, capture_output=True) + res = p.stdout.decode('utf-8').strip().split() + + if 'Process' in res: + for v in res: + if v.startswith('id:'): + pid_on_npu = v.split(':')[1] + if i not in RUNNING_PIDS: + RUNNING_PIDS[i] = [int(pid_on_npu)] + else: + RUNNING_PIDS[i].append(int(pid_on_npu)) + + if RUNNING_PIDS: + break + print_log_to_file('[WARNING] Found no running process on all NPUs, retry times: {}, wait for 5 s'.format(times + 1)) + # wait 5 s for each time + time.sleep(5) + + # no running process on NPUs, stop + if not RUNNING_PIDS: + print_log_to_file('[INFO] Found no running process on all NPUs, stop bind cores') + print('[INFO] Now there is no running process on all NPUs, stop bind cores') + return False + + # delete repeat pid + for i in NPU_IDS: + if i not in RUNNING_PIDS: + continue + pids_npu = RUNNING_PIDS[i] + for n, pid in RUNNING_PIDS.items(): + if n != i and pid in pids_npu: + RUNNING_PIDS[n].remove(pid) + + for k in RUNNING_PIDS.keys(): + print_log_to_file('[INFO] Succeed to find running process {} on NPU {}'.format(RUNNING_PIDS[k], k)) + return True + + +# get device info +def get_dev_info() -> bool: + if not get_total_npu_id(): + return False + if not get_npu_affinity(): + return False + return True + + +# get process affinity +def get_process_affinity(pid): + get_affinity_cpu_cmd = 'taskset -pc {} '.format(pid) + p = subprocess.run(get_affinity_cpu_cmd.split(), shell=False, capture_output=True) + res = p.stdout.decode('utf-8').strip().split() + return res[len(res) - 1] + + +# run bind core +def run_bind_core(): + global NPU_IDS + global NPU_CPU_AFFINITY_DICT + for k, pid_list in RUNNING_PIDS.items(): + cpu_list = NPU_CPU_AFFINITY_DICT[k].split('-') + start_cpu_id = cpu_list[0] + end_cpu_id = cpu_list[1] + + for pid in pid_list: + get_child_pids_cmd = 'pstree {} -p -T'.format(pid) + p = subprocess.run(get_child_pids_cmd.split(), shell=False, capture_output=True) + res = p.stdout.decode('utf-8').strip().split() + for ele in res: + ele = re.sub(u"\\(|\\)", ",", ele) + ele_list = ele.split(',') + for sub_p in ele_list: + if sub_p.isdigit(): + sub_p = int(sub_p) + + # if process has set to right affinity, continue + current_affinity_cpu_list = get_process_affinity(sub_p) + if not current_affinity_cpu_list: + continue + current_cpu_list = current_affinity_cpu_list.split('-') + if current_cpu_list and current_cpu_list[0] == start_cpu_id and current_cpu_list[1] == end_cpu_id: + continue + print_log_to_file('[INFO] Begin to bind cores for process {} on NPU {}'.format(str(sub_p), k)) + set_affinity_cpu_cmd = 'taskset -pc {}-{} {}'.format(int(start_cpu_id), int(end_cpu_id), sub_p) + p = subprocess.run(set_affinity_cpu_cmd.split(), shell=False, capture_output=True) + print_log_to_file(p.stdout.decode('utf-8')) + + print_log_to_file('[INFO] Succeed to bind process {} on NPU {} with cpu cores list {}'.format(str(sub_p), k, NPU_CPU_AFFINITY_DICT[k])) + + +if __name__ == '__main__': + print("[INFO] Begin to run bind-cores script...") + args_parse() + if not get_dev_info(): + exit() + + while True: + if not get_pid_on_npu(): + exit() + run_bind_core() -- Gitee From 3463da38b42bc43d5f92fb4e356c02ede6129cc4 Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Wed, 28 Feb 2024 10:13:04 +0800 Subject: [PATCH 02/62] add kwargs --- .../advisor_backend/cluster_advice/kernel_cluster_advice.py | 2 +- .../advisor/advisor_backend/cluster_advice/slow_link_advice.py | 2 +- 
.../advisor/advisor_backend/cluster_advice/slow_rank_advice.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/profiler/advisor/advisor_backend/cluster_advice/kernel_cluster_advice.py b/profiler/advisor/advisor_backend/cluster_advice/kernel_cluster_advice.py index e2ca914a79..6fa83c765f 100644 --- a/profiler/advisor/advisor_backend/cluster_advice/kernel_cluster_advice.py +++ b/profiler/advisor/advisor_backend/cluster_advice/kernel_cluster_advice.py @@ -12,7 +12,7 @@ class KernelClusterAdvice(ClusterAdviceBase): COLUMNS_TO_CAL = ["Duration(us)"] CAL_FUN = ['mean', 'var', 'max', 'min', 'count', 'sum'] - def __init__(self, collection_path: str): + def __init__(self, collection_path: str, kwargs: dict = None): super().__init__(collection_path) self.all_kernel_data = pd.DataFrame() diff --git a/profiler/advisor/advisor_backend/cluster_advice/slow_link_advice.py b/profiler/advisor/advisor_backend/cluster_advice/slow_link_advice.py index e350e08f39..f8a625242f 100644 --- a/profiler/advisor/advisor_backend/cluster_advice/slow_link_advice.py +++ b/profiler/advisor/advisor_backend/cluster_advice/slow_link_advice.py @@ -33,7 +33,7 @@ class SlowLinkAdvice(ClusterAdviceBase): SDMA = "SDMA" RDMA = "RDMA" - def __init__(self, collection_path: str): + def __init__(self, collection_path: str, kwargs: dict = None): super().__init__(collection_path) self.rank_bw_dict = defaultdict(lambda: { self.RDMA_TIME_MS: 0, diff --git a/profiler/advisor/advisor_backend/cluster_advice/slow_rank_advice.py b/profiler/advisor/advisor_backend/cluster_advice/slow_rank_advice.py index 5165545832..4e789fb7fb 100644 --- a/profiler/advisor/advisor_backend/cluster_advice/slow_rank_advice.py +++ b/profiler/advisor/advisor_backend/cluster_advice/slow_rank_advice.py @@ -26,7 +26,7 @@ class SlowRankAdvice(ClusterAdviceBase): RATIO_THRESHOLD = 0.05 BOTTLENECK_LIST = ['Computing', 'Communication', "Free"] - def __init__(self, collection_path: str): + def __init__(self, collection_path: str, kwargs: dict = None): super().__init__(collection_path) def load_step_time(self): -- Gitee From 86134c1b3b21d9dc8459f86ad452f77e38a8c26c Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:42:59 +0000 Subject: [PATCH 03/62] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20pytorch=5Fadapter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/pytorch_adapter/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sample/pytorch_adapter/.keep diff --git a/sample/pytorch_adapter/.keep b/sample/pytorch_adapter/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 09b0ac36bb2999ad0eca7f67d0c41e84733bfa60 Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:43:17 +0000 Subject: [PATCH 04/62] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20jit=5Fmode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/pytorch_adapter/jit_mode/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sample/pytorch_adapter/jit_mode/.keep diff --git a/sample/pytorch_adapter/jit_mode/.keep b/sample/pytorch_adapter/jit_mode/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 61bb4f119f2d6d1e7debb29ace5b6347ab3c56e7 Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:43:47 +0000 Subject: [PATCH 05/62] rename sample/pytorch_adapter/jit_mode/.keep to sample/pytorch_adapter/jit_mode/add_adapter.cpp. 
Signed-off-by: lian --- sample/pytorch_adapter/jit_mode/.keep | 0 .../pytorch_adapter/jit_mode/add_adapter.cpp | 128 ++++++++++++++++++ 2 files changed, 128 insertions(+) delete mode 100644 sample/pytorch_adapter/jit_mode/.keep create mode 100644 sample/pytorch_adapter/jit_mode/add_adapter.cpp diff --git a/sample/pytorch_adapter/jit_mode/.keep b/sample/pytorch_adapter/jit_mode/.keep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/sample/pytorch_adapter/jit_mode/add_adapter.cpp b/sample/pytorch_adapter/jit_mode/add_adapter.cpp new file mode 100644 index 0000000000..6c65e60ec5 --- /dev/null +++ b/sample/pytorch_adapter/jit_mode/add_adapter.cpp @@ -0,0 +1,128 @@ +#include +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/framework/OpCommand.h" + +using torch::autograd::AutogradContext; +using torch::autograd::Function; +using tensor_list = std::vector; +using namespace at; + +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); + +// 为NPU设备注册前向实现 +at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other) +{ + // 创建输出内存 + at::Tensor result = at::Tensor(self); + // 将pytorch中的结构翻译成为CANN认识的数据类型和结构 + // 1. (重要)通过对tensor的shape分析,选择合适的tiling(该算子为了简化,固定了tiling,只有特定shape下计算才正确) + // 2. 对数据类型和格式转换 -- 此处无需数据格式处理,直接使用 + auto stream = c10_npu::getCurrentNPUStream().stream(false); + auto x = self.storage().data(); + auto y = other.storage().data(); + auto z = result.storage().data(); + + uint32_t blockDim = 8; + auto callback = [stream, blockDim, x, y, z]() -> int { + add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z); + return 0; // 此处可以通过某种方式获取算子执行结果,还未实现 + }; + // 下发算子 + at_npu::native::OpCommand cmd; + cmd.Name("my_add").SetCustomHandler(callback).Run(); + return result; +} + +// 为NPU设备注册反向实现 +std::tuple my_add_backward_impl_npu(const at::Tensor &self) +{ + at::Tensor result = at::Tensor(self); // 创建输出内存 + + return {result, result}; +} + +// 为Meta设备注册前向实现 +at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other) +{ + return empty_like(self); +} + +// 为Meta设备注册反向实现 +std::tuple my_add_backward_impl_meta(const at::Tensor &self) +{ + auto result = empty_like(self); + return std::make_tuple(result, result); +} + +// 寻找注册在该op上的不同设备的实现 +at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other) +{ + static auto op = + torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed(); + return op.call(self, other); +} +// 寻找注册在该op上的不同设备的实现 +std::tuple my_add_backward_impl(const at::Tensor &self) +{ + static auto op = torch::Dispatcher::singleton() + .findSchemaOrThrow("myaten::my_add_backward", "") + .typed(); + return op.call(self); +} + +// 在myaten命名空间里注册my_add和my_add_backward两个schema +TORCH_LIBRARY(myaten, m) +{ + m.def("my_add(Tensor self, Tensor other) -> Tensor"); + m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)"); +} + +// 通过继承torch::autograd::Function类实现前反向绑定 +class MyAddFunction : public torch::autograd::Function { +public: + static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other) + { + at::AutoDispatchBelowADInplaceOrView guard; + return my_add_impl(self, other); + } + + static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) + { + auto grad_output = grad_outputs[0]; + auto result = my_add_backward_impl(grad_output); + return {std::get<0>(result), std::get<1>(result)}; + } +}; + +at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other) +{ + return 
MyAddFunction::apply(self, other); +} + +// 给op绑定NPU的自动求导实现 +// 如果是pytorch 2.1以下的版本,AutogradPrivateUse1需要改成AutogradXLA +TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_autograd); +} + +// 为NPU设备注册前反向实现 +// NPU设备在pytorch 2.1及以上版本使用的设备名称是PrivateUse1,在2.1以下版本用的是XLA,如果是2.1以下版本PrivateUse1需要改成XLA +TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_npu); + m.impl("my_add_backward", &my_add_backward_impl_npu); +} + +// 为Meta设备注册前反向实现 +TORCH_LIBRARY_IMPL(myaten, Meta, m) +{ + m.impl("my_add", &my_add_impl_meta); + m.impl("my_add_backward", &my_add_backward_impl_meta); +} + +// 通过pybind将c++接口和python接口绑定 +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("my_add", &my_add_impl_autograd, "x + y"); +} -- Gitee From 4519eecd436c6ae853d97296f32e6890feaf937c Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:44:06 +0000 Subject: [PATCH 06/62] add sample/pytorch_adapter/jit_mode/add_kernel.cpp. Signed-off-by: lian --- .../pytorch_adapter/jit_mode/add_kernel.cpp | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 sample/pytorch_adapter/jit_mode/add_kernel.cpp diff --git a/sample/pytorch_adapter/jit_mode/add_kernel.cpp b/sample/pytorch_adapter/jit_mode/add_kernel.cpp new file mode 100644 index 0000000000..9164764efc --- /dev/null +++ b/sample/pytorch_adapter/jit_mode/add_kernel.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * + * Function : z = x + y + * This sample is a very basic sample that implements vector add on Ascend plaform. + * In this sample: + * Length of x / y / z is 8*2048. + * Num of vector core used in sample is 8. + * Length for each core to compute is 2048. + * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. 
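+ *
+ * Note: with double buffering (BUFFER_NUM = 2 below), TILE_LENGTH works out to
+ * 2048 / 8 / 2 = 128 elements, so each of the TILE_NUM * BUFFER_NUM = 16 loop
+ * iterations per core copies and adds 128 elements.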
+ * + */ +#include "kernel_operator.h" +using namespace AscendC; +constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data +constexpr int32_t USE_CORE_NUM = 8; // num of core used +constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core +constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer + +class KernelAdd { +public: + __aicore__ inline KernelAdd() + {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + // get start index for current core, core parallel + xGm.SetGlobalBuffer((__gm__ int16_t *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ int16_t *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ int16_t *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + // pipe alloc memory to queue, the unit is Bytes + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); + } + __aicore__ inline void Process() + { + // loop count need to be doubled, due to double buffer + constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; + // tiling strategy, pipeline parallel + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + // alloc tensor from queue memory + LocalTensor xLocal = inQueueX.AllocTensor(); + LocalTensor yLocal = inQueueY.AllocTensor(); + // copy progress_th tile from global tensor to local tensor + DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); + DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); + // enque input tensors to VECIN queue + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + // deque input tensors from VECIN queue + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = inQueueY.DeQue(); + LocalTensor zLocal = outQueueZ.AllocTensor(); + // call Add instr for computation + Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // enque the output tensor to VECOUT queue + outQueueZ.EnQue(zLocal); + // free input tensors for reuse + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + // deque output tensor from VECOUT queue + LocalTensor zLocal = outQueueZ.DeQue(); + // copy progress_th tile from local tensor to global tensor + DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); + // free output tensor for reuse + outQueueZ.FreeTensor(zLocal); + } + +private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueueX, inQueueY; + // create queue for output, in this case depth is equal to buffer num + TQue outQueueZ; + GlobalTensor xGm, yGm, zGm; +}; +// implementation of kernel function +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} + +// 包裹核函数,使得普通编译器能认识这个符号 +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom<<>>(x, y, z); +} \ No newline at end of file -- Gitee From b9c27456d92ba89f6ca78c5d103685274cd402b3 Mon Sep 17 00:00:00 2001 From: lian Date: 
Fri, 1 Mar 2024 12:44:24 +0000 Subject: [PATCH 07/62] add sample/pytorch_adapter/jit_mode/main.py. Signed-off-by: lian --- sample/pytorch_adapter/jit_mode/main.py | 50 +++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 sample/pytorch_adapter/jit_mode/main.py diff --git a/sample/pytorch_adapter/jit_mode/main.py b/sample/pytorch_adapter/jit_mode/main.py new file mode 100644 index 0000000000..47cf34f487 --- /dev/null +++ b/sample/pytorch_adapter/jit_mode/main.py @@ -0,0 +1,50 @@ +import os +import torch +import torch.utils.cpp_extension +import torch_npu + +PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) +CUR_PATH = os.getcwd() + + +def compile_kernels(): + os.system("make") # 由于pytorch中没有device编译的扩展,所以此处人工加make + + +def compile_host(): + extra_ldflags = [] + extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib") + extra_ldflags.append("-ltorch_npu") + extra_ldflags.append(f"-L{CUR_PATH}/") + extra_ldflags.append("-lcustom_kernels") + extra_include_paths = [] + extra_include_paths.append("./") + extra_include_paths.append(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include")) + extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc")) + + module = torch.utils.cpp_extension.load( + name="jit_extension", + sources=[ + "add_adapter.cpp" + ], + extra_include_paths=extra_include_paths, + extra_ldflags=extra_ldflags, + verbose=True) + return module + + +def test_add(module): + # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 + # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 + x = torch.arange(0, 100).short() + y = torch.arange(0, 100).short() + z = module.my_add(x.npu(), y.npu()) + print(z) + + +if __name__ == '__main__': + compile_kernels() + module = compile_host() + test_add(module) -- Gitee From b18932f4e5fec3c5a9fa9df42df5aa91f4e244f4 Mon Sep 17 00:00:00 2001 From: lian Date: Fri, 1 Mar 2024 12:44:46 +0000 Subject: [PATCH 08/62] add sample/pytorch_adapter/jit_mode/Makefile. 
Signed-off-by: lian --- sample/pytorch_adapter/jit_mode/Makefile | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 sample/pytorch_adapter/jit_mode/Makefile diff --git a/sample/pytorch_adapter/jit_mode/Makefile b/sample/pytorch_adapter/jit_mode/Makefile new file mode 100644 index 0000000000..ad17f3a514 --- /dev/null +++ b/sample/pytorch_adapter/jit_mode/Makefile @@ -0,0 +1,20 @@ +# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 +ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest + +COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 +COMPILER_FLAG := -xcce -O2 -std=c++17 +DYNAMIC_LIB_FLAG := -fPIC -shared +DAV_FLAG := --cce-aicore-arch=dav-c220-vec +ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 + +all: build + +build: libcustom_kernels.so + +# 后续如果要扩展,把多个kernel的cpp都加到后面 +libcustom_kernels.so: add_kernel.cpp + $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ + +.PHONY: clean +clean: + rm *.so \ No newline at end of file -- Gitee From 063268d82bf00342689eae952d165059177b97cf Mon Sep 17 00:00:00 2001 From: lian Date: Sat, 2 Mar 2024 05:26:43 +0000 Subject: [PATCH 09/62] rename sample/pytorch_adapter/.keep to sample/pytorch_adapter/README.md. Signed-off-by: lian --- sample/pytorch_adapter/{.keep => README.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sample/pytorch_adapter/{.keep => README.md} (100%) diff --git a/sample/pytorch_adapter/.keep b/sample/pytorch_adapter/README.md similarity index 100% rename from sample/pytorch_adapter/.keep rename to sample/pytorch_adapter/README.md -- Gitee From 0dd90976bfabc2e2561fc366bfe042706a052c06 Mon Sep 17 00:00:00 2001 From: lidonglei3 Date: Mon, 4 Mar 2024 19:10:23 +0800 Subject: [PATCH 10/62] =?UTF-8?q?[profiler\cluster=5Fanalyse]=E9=9B=86?= =?UTF-8?q?=E7=BE=A4=E5=88=86=E6=9E=90=E5=B7=A5=E5=85=B7=E6=94=AF=E6=8C=81?= =?UTF-8?q?db=E6=96=87=E4=BB=B6=E8=A7=A3=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/cluster_analyse/README.md | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/profiler/cluster_analyse/README.md b/profiler/cluster_analyse/README.md index 7cdb2d2c1e..2833d293ea 100644 --- a/profiler/cluster_analyse/README.md +++ b/profiler/cluster_analyse/README.md @@ -21,6 +21,12 @@ experimental_config = torch_npu.profiler._ExperimentalConfig( - ./ASCEND_PROFILER_OUTPUT/communication.json, - ./ASCEND_PROFILER_OUTPUT/communication_matrix.json +或者具备: + +- ascend_pytorch.db + +以上csv、json文件与db文件只能存在一类,否则集群分析工具解析异常。 + 确认这几个文件生成后,继续下面的集群分析。 ## 数据汇聚与集群解析 @@ -37,11 +43,11 @@ python3 cluster_analysis.py -d {cluster profiling data path} -m {mode} | --collection_path或-d | 性能数据汇集目录,运行分析脚本之后会在该目录下自动创建cluster_analysis_output文件夹,保存分析数据。 | 是 | | --mode或-m | 数据解析模式。取值为:communication_matrix(解析通信矩阵数据)、communication_time(解析通信耗时数据)、all(同时解析通信矩阵和通信耗时数据),默认值为all。 | 否 | -## 交付件 +### 交付件 集群分析工具的交付件通过Ascend Insight工具展示,详见《MindStudio 可视化调优工具指南(Ascend Insight)》。 -### cluster_step_trace_time.csv +#### cluster_step_trace_time.csv 数据解析模式为communication_matrix、communication_time或all时均生成。 @@ -79,7 +85,7 @@ K列:Communication(Not Overlapped and Exclude Receive)指剔除recieve算 以上时间理论上都应该处于持平状态,即最大值小于最小值5%,否则就可能出现慢卡。 -### 
cluster_communication_matrix.json +#### cluster_communication_matrix.json 数据解析模式为communication_matrix或all时生成。 @@ -99,8 +105,21 @@ K列:Communication(Not Overlapped and Exclude Receive)指剔除recieve算 - “HCCS”或“PCIE”是节点内片间拷贝,速度在18GB左右或以上比较正常。 - “RDMA”是节点间拷贝,910A速度在12GB左右或以上。 -### cluster_communication.json +#### cluster_communication.json 数据解析模式为communication_time或all时生成。 主要为通信耗时数据。 + +#### cluster_analysis.db + +解析ascend_pytorch.db生成的交付件,当前解析通信类数据,主要包含下面数据: + +- ClusterCommAnalyzerTime:集群通信时间信息。 +- ClusterCommAnalyzerBandwidth:集群通信带宽信息。 +- ClusterCommAnalyzerMatrix:集群通信矩阵数据。 +- CommunicationGroup:通信组信息。 +- ClusterStepTraceTime:集群迭代轨迹数据。 + + + -- Gitee From 22d13cc133d6e3d31755e52a1c0d8aa7ffe89bbe Mon Sep 17 00:00:00 2001 From: lian Date: Mon, 4 Mar 2024 13:50:51 +0000 Subject: [PATCH 11/62] update sample/pytorch_adapter/jit_mode/add_kernel.cpp. Signed-off-by: lian --- .../pytorch_adapter/jit_mode/add_kernel.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sample/pytorch_adapter/jit_mode/add_kernel.cpp b/sample/pytorch_adapter/jit_mode/add_kernel.cpp index 9164764efc..7e82e88820 100644 --- a/sample/pytorch_adapter/jit_mode/add_kernel.cpp +++ b/sample/pytorch_adapter/jit_mode/add_kernel.cpp @@ -26,13 +26,13 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) { // get start index for current core, core parallel - xGm.SetGlobalBuffer((__gm__ int16_t *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - yGm.SetGlobalBuffer((__gm__ int16_t *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - zGm.SetGlobalBuffer((__gm__ int16_t *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); // pipe alloc memory to queue, the unit is Bytes - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); } __aicore__ inline void Process() { @@ -50,8 +50,8 @@ private: __aicore__ inline void CopyIn(int32_t progress) { // alloc tensor from queue memory - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = inQueueY.AllocTensor(); + LocalTensor xLocal = inQueueX.AllocTensor(); + LocalTensor yLocal = inQueueY.AllocTensor(); // copy progress_th tile from global tensor to local tensor DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); @@ -62,13 +62,13 @@ private: __aicore__ inline void Compute(int32_t progress) { // deque input tensors from VECIN queue - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = inQueueY.DeQue(); - LocalTensor zLocal = outQueueZ.AllocTensor(); + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = inQueueY.DeQue(); + LocalTensor zLocal = outQueueZ.AllocTensor(); // call Add instr for computation Add(zLocal, xLocal, yLocal, TILE_LENGTH); // enque the output tensor to VECOUT queue - outQueueZ.EnQue(zLocal); + outQueueZ.EnQue(zLocal); // free input tensors for reuse inQueueX.FreeTensor(xLocal); inQueueY.FreeTensor(yLocal); @@ -76,7 
+76,7 @@ private: __aicore__ inline void CopyOut(int32_t progress) { // deque output tensor from VECOUT queue - LocalTensor zLocal = outQueueZ.DeQue(); + LocalTensor zLocal = outQueueZ.DeQue(); // copy progress_th tile from local tensor to global tensor DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); // free output tensor for reuse @@ -89,7 +89,7 @@ private: TQue inQueueX, inQueueY; // create queue for output, in this case depth is equal to buffer num TQue outQueueZ; - GlobalTensor xGm, yGm, zGm; + GlobalTensor xGm, yGm, zGm; }; // implementation of kernel function extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) -- Gitee From fb472514d735b98c82257c0cb8606d6f09d33fb8 Mon Sep 17 00:00:00 2001 From: lian Date: Mon, 4 Mar 2024 13:51:16 +0000 Subject: [PATCH 12/62] update sample/pytorch_adapter/jit_mode/main.py. Signed-off-by: lian --- sample/pytorch_adapter/jit_mode/main.py | 36 ++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/sample/pytorch_adapter/jit_mode/main.py b/sample/pytorch_adapter/jit_mode/main.py index 47cf34f487..388e0799fb 100644 --- a/sample/pytorch_adapter/jit_mode/main.py +++ b/sample/pytorch_adapter/jit_mode/main.py @@ -2,6 +2,7 @@ import os import torch import torch.utils.cpp_extension import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) CUR_PATH = os.getcwd() @@ -35,16 +36,33 @@ def compile_host(): return module -def test_add(module): - # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 - # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 - x = torch.arange(0, 100).short() - y = torch.arange(0, 100).short() - z = module.my_add(x.npu(), y.npu()) - print(z) +class TestCustomAdd(TestCase): + def test_add(self): + module = compile_host() + # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 + # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float16) + y = torch.rand(length, device='cpu', dtype=torch.float16) + + x_npu = x.npu() + y_npu = y.npu() + x_npu.requires_grad = True + y_npu.requires_grad = True + output = module.my_add(x_npu, y_npu) + # 反向能力验证 + output.backward(output) + + x.requires_grad = True + y.requires_grad = True + cpuout = torch.add(x, y) + cpuout.backward(cpuout) + + self.assertRtolEqual(output, cpuout) + self.assertRtolEqual(x_npu.grad, x.grad) + self.assertRtolEqual(y_npu.grad, y.grad) if __name__ == '__main__': compile_kernels() - module = compile_host() - test_add(module) + run_tests() -- Gitee From 0d97dba34e2e6d67d1a4515729a77dfc8d3a09f8 Mon Sep 17 00:00:00 2001 From: lian Date: Mon, 4 Mar 2024 13:51:39 +0000 Subject: [PATCH 13/62] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20with=5Fsetuptools?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/pytorch_adapter/with_setuptools/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sample/pytorch_adapter/with_setuptools/.keep diff --git a/sample/pytorch_adapter/with_setuptools/.keep b/sample/pytorch_adapter/with_setuptools/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 3f0756c4a47dbb40d640261f1ff3104ea7abe408 Mon Sep 17 00:00:00 2001 From: lian Date: Mon, 4 Mar 2024 13:52:12 +0000 Subject: [PATCH 14/62] rename setup.py. 
Signed-off-by: lian --- sample/pytorch_adapter/with_setuptools/.keep | 0 .../pytorch_adapter/with_setuptools/setup.py | 51 +++++++++++++++++++ 2 files changed, 51 insertions(+) delete mode 100644 sample/pytorch_adapter/with_setuptools/.keep create mode 100644 sample/pytorch_adapter/with_setuptools/setup.py diff --git a/sample/pytorch_adapter/with_setuptools/.keep b/sample/pytorch_adapter/with_setuptools/.keep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py new file mode 100644 index 0000000000..8f5ad6d9f8 --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/setup.py @@ -0,0 +1,51 @@ +import os +import torch +import torch_npu +from setuptools import setup, find_packages +from torch.utils.cpp_extension import BuildExtension +from torch_npu.utils.cpp_extension import NpuExtension + +PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) +CUR_PATH = os.getcwd() + + +def compile_kernels(): + os.system("make") # 由于pytorch中没有device编译的扩展,所以此处人工加make + return "libcustom_kernels.so" # 这个make出来的库名字 + + +def compile_adapter(): + exts = [] + ext1 = NpuExtension( + name="ascend_custom_kernels_lib", # import的库的名字 + # 如果还有其他cpp文件参与编译,需要在这里添加 + sources=["./add_adapter.cpp"], + extra_compile_args=[ + '-I' + os.path.join(os.path.join(os.path.join(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc"), + ], + library_dirs=[f"{CUR_PATH}"], # 编译时需要依赖的库文件的路径,相当于g++编译时的-L选项 + libraries=["custom_kernels"], # 编译时依赖的库文件,相当于-l选项 + ) + exts.append(ext1) + return exts + + +if __name__ == "__main__": + # 编译出含有算子的库,并以so的方式提供 + kernel_so = compile_kernels() + + # 编译出pytorch适配层的库,支持被框架集成 + exts = compile_adapter() + + # 将整体打包成wheel包 + setup( + name="ascend_custom_kernels", # package的名字 + version='1.0', + keywords='ascend_custom_kernels', + ext_modules=exts, + packages=find_packages(), + cmdclass={"build_ext": BuildExtension}, + data_files=[(".", [kernel_so])], + include_package_data=True, + ) -- Gitee From 436b6f337c6880484d025d0655be44107507b5d1 Mon Sep 17 00:00:00 2001 From: lian Date: Mon, 4 Mar 2024 13:52:30 +0000 Subject: [PATCH 15/62] add sample/pytorch_adapter/with_setuptools/test.py. 
Signed-off-by: lian --- .../pytorch_adapter/with_setuptools/test.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 sample/pytorch_adapter/with_setuptools/test.py diff --git a/sample/pytorch_adapter/with_setuptools/test.py b/sample/pytorch_adapter/with_setuptools/test.py new file mode 100644 index 0000000000..896eef2c0f --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/test.py @@ -0,0 +1,34 @@ +import torch +import torch_npu +import ascend_custom_kernels_lib +from torch_npu.testing.testcase import TestCase, run_tests + + +class TestCustomAdd(TestCase): + def test_add(self): + # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 + # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float16) + y = torch.rand(length, device='cpu', dtype=torch.float16) + + x_npu = x.npu() + y_npu = y.npu() + x_npu.requires_grad = True + y_npu.requires_grad = True + output = ascend_custom_kernels_lib.my_add(x_npu, y_npu) + # 反向能力验证 + output.backward(output) + + x.requires_grad = True + y.requires_grad = True + cpuout = torch.add(x, y) + cpuout.backward(cpuout) + + self.assertRtolEqual(output, cpuout) + self.assertRtolEqual(x_npu.grad, x.grad) + self.assertRtolEqual(y_npu.grad, y.grad) + + +if __name__ == "__main__": + run_tests() -- Gitee From 94f8046c834e18c909f7fb2925270d234985bc3c Mon Sep 17 00:00:00 2001 From: binlien Date: Mon, 4 Mar 2024 23:29:03 +0800 Subject: [PATCH 16/62] =?UTF-8?q?=E6=95=B4=E6=94=B9=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{jit_mode => jit_compile}/Makefile | 38 ++-- .../{jit_mode => jit_compile}/add_adapter.cpp | 0 .../{jit_mode => jit_compile}/add_kernel.cpp | 210 +++++++++--------- .../{jit_mode => jit_compile}/main.py | 136 ++++++------ sample/third_party/lib/libruntime.so.aarch64 | Bin 8584 -> 0 bytes sample/third_party/lib/libruntime.so.x86 | Bin 16096 -> 0 bytes .../lib/libruntime_camodel.so.aarch64 | Bin 8584 -> 0 bytes .../third_party/lib/libruntime_camodel.so.x86 | Bin 16096 -> 0 bytes 8 files changed, 192 insertions(+), 192 deletions(-) rename sample/pytorch_adapter/{jit_mode => jit_compile}/Makefile (98%) rename sample/pytorch_adapter/{jit_mode => jit_compile}/add_adapter.cpp (100%) rename sample/pytorch_adapter/{jit_mode => jit_compile}/add_kernel.cpp (97%) rename sample/pytorch_adapter/{jit_mode => jit_compile}/main.py (96%) delete mode 100644 sample/third_party/lib/libruntime.so.aarch64 delete mode 100644 sample/third_party/lib/libruntime.so.x86 delete mode 100644 sample/third_party/lib/libruntime_camodel.so.aarch64 delete mode 100644 sample/third_party/lib/libruntime_camodel.so.x86 diff --git a/sample/pytorch_adapter/jit_mode/Makefile b/sample/pytorch_adapter/jit_compile/Makefile similarity index 98% rename from sample/pytorch_adapter/jit_mode/Makefile rename to sample/pytorch_adapter/jit_compile/Makefile index ad17f3a514..ec9115f377 100644 --- a/sample/pytorch_adapter/jit_mode/Makefile +++ b/sample/pytorch_adapter/jit_compile/Makefile @@ -1,20 +1,20 @@ -# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 -ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest - -COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 -COMPILER_FLAG := -xcce -O2 -std=c++17 -DYNAMIC_LIB_FLAG := -fPIC -shared -DAV_FLAG := --cce-aicore-arch=dav-c220-vec -ASCENDC_INC_FLAG := 
-I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 - -all: build - -build: libcustom_kernels.so - -# 后续如果要扩展,把多个kernel的cpp都加到后面 -libcustom_kernels.so: add_kernel.cpp - $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ - -.PHONY: clean -clean: +# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 +ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest + +COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 +COMPILER_FLAG := -xcce -O2 -std=c++17 +DYNAMIC_LIB_FLAG := -fPIC -shared +DAV_FLAG := --cce-aicore-arch=dav-c220-vec +ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 + +all: build + +build: libcustom_kernels.so + +# 后续如果要扩展,把多个kernel的cpp都加到后面 +libcustom_kernels.so: add_kernel.cpp + $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ + +.PHONY: clean +clean: rm *.so \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_mode/add_adapter.cpp b/sample/pytorch_adapter/jit_compile/add_adapter.cpp similarity index 100% rename from sample/pytorch_adapter/jit_mode/add_adapter.cpp rename to sample/pytorch_adapter/jit_compile/add_adapter.cpp diff --git a/sample/pytorch_adapter/jit_mode/add_kernel.cpp b/sample/pytorch_adapter/jit_compile/add_kernel.cpp similarity index 97% rename from sample/pytorch_adapter/jit_mode/add_kernel.cpp rename to sample/pytorch_adapter/jit_compile/add_kernel.cpp index 7e82e88820..9aa62e0936 100644 --- a/sample/pytorch_adapter/jit_mode/add_kernel.cpp +++ b/sample/pytorch_adapter/jit_compile/add_kernel.cpp @@ -1,106 +1,106 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. - * - * Function : z = x + y - * This sample is a very basic sample that implements vector add on Ascend plaform. - * In this sample: - * Length of x / y / z is 8*2048. - * Num of vector core used in sample is 8. - * Length for each core to compute is 2048. - * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. 
- * - */ -#include "kernel_operator.h" -using namespace AscendC; -constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data -constexpr int32_t USE_CORE_NUM = 8; // num of core used -constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core -constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue -constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer - -class KernelAdd { -public: - __aicore__ inline KernelAdd() - {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) - { - // get start index for current core, core parallel - xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - // pipe alloc memory to queue, the unit is Bytes - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - } - __aicore__ inline void Process() - { - // loop count need to be doubled, due to double buffer - constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; - // tiling strategy, pipeline parallel - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); - } - } - -private: - __aicore__ inline void CopyIn(int32_t progress) - { - // alloc tensor from queue memory - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = inQueueY.AllocTensor(); - // copy progress_th tile from global tensor to local tensor - DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); - DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); - // enque input tensors to VECIN queue - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ inline void Compute(int32_t progress) - { - // deque input tensors from VECIN queue - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = inQueueY.DeQue(); - LocalTensor zLocal = outQueueZ.AllocTensor(); - // call Add instr for computation - Add(zLocal, xLocal, yLocal, TILE_LENGTH); - // enque the output tensor to VECOUT queue - outQueueZ.EnQue(zLocal); - // free input tensors for reuse - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - } - __aicore__ inline void CopyOut(int32_t progress) - { - // deque output tensor from VECOUT queue - LocalTensor zLocal = outQueueZ.DeQue(); - // copy progress_th tile from local tensor to global tensor - DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); - // free output tensor for reuse - outQueueZ.FreeTensor(zLocal); - } - -private: - TPipe pipe; - // create queues for input, in this case depth is equal to buffer num - TQue inQueueX, inQueueY; - // create queue for output, in this case depth is equal to buffer num - TQue outQueueZ; - GlobalTensor xGm, yGm, zGm; -}; -// implementation of kernel function -extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) -{ - KernelAdd op; - op.Init(x, y, z); - op.Process(); -} - -// 包裹核函数,使得普通编译器能认识这个符号 -extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) -{ - add_custom<<>>(x, y, z); +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. 
+ * + * Function : z = x + y + * This sample is a very basic sample that implements vector add on Ascend plaform. + * In this sample: + * Length of x / y / z is 8*2048. + * Num of vector core used in sample is 8. + * Length for each core to compute is 2048. + * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. + * + */ +#include "kernel_operator.h" +using namespace AscendC; +constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data +constexpr int32_t USE_CORE_NUM = 8; // num of core used +constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core +constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer + +class KernelAdd { +public: + __aicore__ inline KernelAdd() + {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + // get start index for current core, core parallel + xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + // pipe alloc memory to queue, the unit is Bytes + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + } + __aicore__ inline void Process() + { + // loop count need to be doubled, due to double buffer + constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; + // tiling strategy, pipeline parallel + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + // alloc tensor from queue memory + LocalTensor xLocal = inQueueX.AllocTensor(); + LocalTensor yLocal = inQueueY.AllocTensor(); + // copy progress_th tile from global tensor to local tensor + DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); + DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); + // enque input tensors to VECIN queue + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + // deque input tensors from VECIN queue + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = inQueueY.DeQue(); + LocalTensor zLocal = outQueueZ.AllocTensor(); + // call Add instr for computation + Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // enque the output tensor to VECOUT queue + outQueueZ.EnQue(zLocal); + // free input tensors for reuse + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + // deque output tensor from VECOUT queue + LocalTensor zLocal = outQueueZ.DeQue(); + // copy progress_th tile from local tensor to global tensor + DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); + // free output tensor for reuse + outQueueZ.FreeTensor(zLocal); + } + +private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueueX, inQueueY; + // create queue for output, in this case depth is equal to buffer num + TQue outQueueZ; + GlobalTensor xGm, yGm, zGm; +}; +// implementation of kernel function +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + 
op.Init(x, y, z); + op.Process(); +} + +// 包裹核函数,使得普通编译器能认识这个符号 +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom<<>>(x, y, z); } \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_mode/main.py b/sample/pytorch_adapter/jit_compile/main.py similarity index 96% rename from sample/pytorch_adapter/jit_mode/main.py rename to sample/pytorch_adapter/jit_compile/main.py index 388e0799fb..11f92600d9 100644 --- a/sample/pytorch_adapter/jit_mode/main.py +++ b/sample/pytorch_adapter/jit_compile/main.py @@ -1,68 +1,68 @@ -import os -import torch -import torch.utils.cpp_extension -import torch_npu -from torch_npu.testing.testcase import TestCase, run_tests - -PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) -CUR_PATH = os.getcwd() - - -def compile_kernels(): - os.system("make") # 由于pytorch中没有device编译的扩展,所以此处人工加make - - -def compile_host(): - extra_ldflags = [] - extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib") - extra_ldflags.append("-ltorch_npu") - extra_ldflags.append(f"-L{CUR_PATH}/") - extra_ldflags.append("-lcustom_kernels") - extra_include_paths = [] - extra_include_paths.append("./") - extra_include_paths.append(os.path.join( - PYTORCH_NPU_INSTALL_PATH, "include")) - extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join( - PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc")) - - module = torch.utils.cpp_extension.load( - name="jit_extension", - sources=[ - "add_adapter.cpp" - ], - extra_include_paths=extra_include_paths, - extra_ldflags=extra_ldflags, - verbose=True) - return module - - -class TestCustomAdd(TestCase): - def test_add(self): - module = compile_host() - # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 - # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 - length = [8, 2048] - x = torch.rand(length, device='cpu', dtype=torch.float16) - y = torch.rand(length, device='cpu', dtype=torch.float16) - - x_npu = x.npu() - y_npu = y.npu() - x_npu.requires_grad = True - y_npu.requires_grad = True - output = module.my_add(x_npu, y_npu) - # 反向能力验证 - output.backward(output) - - x.requires_grad = True - y.requires_grad = True - cpuout = torch.add(x, y) - cpuout.backward(cpuout) - - self.assertRtolEqual(output, cpuout) - self.assertRtolEqual(x_npu.grad, x.grad) - self.assertRtolEqual(y_npu.grad, y.grad) - - -if __name__ == '__main__': - compile_kernels() - run_tests() +import os +import torch +import torch.utils.cpp_extension +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + +PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) +CUR_PATH = os.getcwd() + + +def compile_kernels(): + os.system("make") # 由于pytorch中没有device编译的扩展,所以此处人工加make + + +def compile_host(): + extra_ldflags = [] + extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib") + extra_ldflags.append("-ltorch_npu") + extra_ldflags.append(f"-L{CUR_PATH}/") + extra_ldflags.append("-lcustom_kernels") + extra_include_paths = [] + extra_include_paths.append("./") + extra_include_paths.append(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include")) + extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join( + PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc")) + + module = torch.utils.cpp_extension.load( + name="jit_extension", + sources=[ + "add_adapter.cpp" + ], + extra_include_paths=extra_include_paths, + extra_ldflags=extra_ldflags, + verbose=True) + return module + + +class 
TestCustomAdd(TestCase): + def test_add(self): + module = compile_host() + # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 + # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float16) + y = torch.rand(length, device='cpu', dtype=torch.float16) + + x_npu = x.npu() + y_npu = y.npu() + x_npu.requires_grad = True + y_npu.requires_grad = True + output = module.my_add(x_npu, y_npu) + # 反向能力验证 + output.backward(output) + + x.requires_grad = True + y.requires_grad = True + cpuout = torch.add(x, y) + cpuout.backward(cpuout) + + self.assertRtolEqual(output, cpuout) + self.assertRtolEqual(x_npu.grad, x.grad) + self.assertRtolEqual(y_npu.grad, y.grad) + + +if __name__ == '__main__': + compile_kernels() + run_tests() diff --git a/sample/third_party/lib/libruntime.so.aarch64 b/sample/third_party/lib/libruntime.so.aarch64 deleted file mode 100644 index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8584 zcmeHMYitzP6+XM`V8F&+6CQ1#VL_-$lz7&LGcKyI6GA7-aidK<^@$T5WVju48 zEU`n>CIP80Ma2b$RBD2M_q*pc_ciC9{dVub7QfF2QUdTxpjK#+gG9Jv_7<)ni9jdRKBn2H=VN$w2GonbklbNnho7dvX8 z4k7On4J-P4HzLU=#v%eGH-9jd)pI(LJ$iK+rL5S2{oFt6K zssW#lSEl!@II^L=|2IGW{PYu7ntoM!?Y_UB{ae#VM=qQ?b^EGg$a38hfF&?fV^IWP z6$FFgv7>@op&Hs^h;68WuSG0W1Mge_e+=>E_*QrF{LvUuIgAhC5yn5GP=IeT9%P*C zQK9)+UgNz^D!c(6@#tTr@J2YmI5t)JIsPTPI~kYtW-$H%FB-86x;~o8W%aymShlW% zp30_d&_|GfzIAAuo-i$QG?lkaYiL_{I+rzvjN!ED^wsXsfW$+^7rNQpm>bRL;#dU?7%+bZR(n zC*pV9q2+Vh-PFvKYoqbFo~Kqj;c*A=8`#>_t#8uWJDhQ&zs}P4;o5cobNUH+5390| z4o^ObBk6~KINf;8_*5;x8+4AxdB%euWqj6yOMAB2(c?osw3nouap7_XP~UkM?%om? zT{xR`RPp}sd-N}M;cop`_n!Lc)c{Q5dOLJ=@9Cp60sWl#M05`O4*(o!xz<+9t> zSNFcC#^wUPL#Z2nd$AshZ`Q-~r2tF=@*y@!wo3c3Zyij(iFPg_kLugUeJoeOFS-KE zD|%i+J*&~i&8Vjtif^I5x0~74OAG6h`E<{vZ%yD!I{{!RK8LQAN@GZ0M!FwqU8z)h z3h4mS8Kn0k{T}>)d{186l^LR+`Uyuyq#Dlhi-OT?o^OMhn_b~oP#>r2_+xi&) z0Gs5CTMx0^C^tsn#t7UPf&YgQkod(Lui%lc%CDJ9TY>^sHj6RJuWzMAm^Y*_Hi{9zJ5k1`)$U(5FJv!VQ-Wq$mOsl9Jin4 z{vx-bC0+-z9xCyhK*qHaUj#BAm3TeKyjJ3iLFT^_U+CVt6l8oV`5Vf4 zl1jY#-beibSO!zF;9LR$Xe#FwTrl!C&u<)8M*t%#*?%q=`CCBtT_wI8g zL(Ji?o-YTl9)F587q+_uUXz1J`62on;VI92{pGrP8lY1SFxK;Tq~yOBrx=v*q=&y= zO-J>7Ek&I8<-BM05JKGN%Bv`G>~!>0uZOVWug1T|`sKSK`W-#h>up%+sm2}tW>_Hy zDJydD>UHuM>Zvs^4uAE2c|zHp$N8A$_0Ry4KdJP;8OA;SEHsbe|6)1#S<#fT+YDZu zx@-A=4EdqPzCNw=Tq$|?*B=zVvMN9Jmcpyo1D&Tn+(+L1>#YCk^Qwsaly~yxsr>lC z43Vpz(t`MMsNP4b5#Qk9rRJsu^b9IJHS%2x@b6`QZ(i<)jLZB=TycbPZ+`DN#=UvG zR}@}-Za_FUW{0h0IAQLwbGdZhv|BsERxW3U3wbL%oXUnZEsXi8Fs7@*hGE5%ci+`o z$nMVO9?G_+Q`y3J>u9zR?jSbjzc9^r#w|Ogk<&-hxnU!%C+wV+*Np-?oXd=*P1{Up9UYt7Ycr{= zq;6Q2F`;8B(VBn}%gC5|qL9fF^tr<9Ga9`HfNg}u2XHekV+?7QwdP5q>)d8mYB$*q(g1X z>DgmidCcfmG<58<%(OuQTpLT*h`ii`B8-dt6dMvM_a|gi zh1E$D`msiL8IhMbTj;IKDgFze&>h%McOz*_JT5fE^5j3+7XKNB1Xid|Ux|-bejG9KOTsa^{|c436IoOtU)?2-4Y$197lcOK%vhJa ztI%(Gn5kEO z$|GN%Ab!!tPk8^Ip**H{)!sl-z8}f&sNV6Cw$SHM25(zsdAUc;Z~{d5NH-N}3w;qW za#7Z&+_z>wL7sF`c^w~Gdm=A!o&3%%=ZK7(*p+!bkFsw25?{{HOJTXqA?3d!FZ2!U zam&j+@A9XV|1-*yF7aP}=MA!hVn-bCOeeg<_S=MksmM!SNxDVeO=ay0579QZf1Wos s<*&YikN6|!2Zj6MJ66kOF<)U{^(Ys+!q|e9TJmQ(VMg~ock#;q3uj)=)&Kwi diff --git a/sample/third_party/lib/libruntime.so.x86 b/sample/third_party/lib/libruntime.so.x86 deleted file mode 100644 index 6da21687dc7655cc6745003cfcbb6c3c0a8ceb34..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16096 zcmeHOZ){sv6~B&?HC@u!>qeKRm13rBT3hy-G)kLNb#)p$eXWwT^WqOoL=VS)NsRm_ zJU@5IL`7Mll7ULC_JJY|RY;TQhJ?lkG>Q;wm92opfC=$OVxVZWQb1@+1)BnzV$Qkm 
[... binary patch data omitted ...]
diff --git a/sample/third_party/lib/libruntime_camodel.so.aarch64 b/sample/third_party/lib/libruntime_camodel.so.aarch64
deleted file mode 100644
index 2c686dc3e0ab56768ec8c45cfac9f1fbb107888f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 8584
[... binary patch data omitted ...]
zN_NV!4MP~&Le>_>GzeiFpBOVTre)4#OSWlEj1A|C1#==bl`|b%V~dfVPho&mF8iD* zEc5+<#M3M%+p7~+geB#5;T+-^6o^ zz-C1*J5{nX={HFYToV5q|-(TwLZkt4t{L(_|aiwpSDjM;7o`nxC(3k;dL1h z?|}!OmU34XF3BdbN-Xy|L3)_-8|LQ+;lH8)oeIwS(hkUx3#RI$x8OU}9fjU2(3`$kee{P?>K6{b4V|QZq2*^F^@FQ=<+i;YLVg8R z2)Sjlx-_kYy1yg?PA*QbE1_-y{7dL~VF3M;`mQzL>o?YddSykwv3x`i+|=J*vpXT+ z5Cv?eS{9U|bn z>zymrY6j#dKzr1Bvg1u?3z>h`@X} zu%mrb>qWqO0K<2(ufAWc%2#C_;gOEc6QQd5{L!IppIEp52;dml<-1(A zkFVR`0s8^4cYFEItlK{VeE$aVk>WQ$1AYel4EP!FGvH^y&w!r+KLdUS{0#g*WPtZI zs{0t>r<-g)A<6o06@5w3aZe?-c1O4yEVq!(`&ze?&igF!^)oj7O^2<^5rX$y?j)V< zAFNf22z`b&PvX8}Y)iChlJ`VkCHwu3S>SJ1Y;BGZoh048mQe?go0y>+p};)uRr1IE z%h;H|R(0m%Y;K1)=cIGq;OC*7@2zHs_fOK@aUJ-7_Rag))d7zU4}UW9IPC1+6*-{w zYkiS@y?ukd{k{FFV*@=1J6w6cyVo3qeYJd^YVn}p^HYmAyXUbMZxNioS{%+cUjQCM zP;7F~p9hA$>2NlAYgs+h`98xCp@XUwrzY&So=O<5b_&LaN7NmcD zo-YI5X#Our|Ml}rw>iwEvHxq*KO|=HK$O-f{+rUL{yO??}=S8}hpUauHnbGzi>>p_CB#SkSlx3wB45+ZM7R0ob%9}=}oX;-+#iPmU zjv801T(GlwQ%ldy3FDd3zW&675lckEp=A~dz;vW-IhNC=Rf0;F8pQycW#&>y5SlF1(dKftHdC~P zW}EZyEwiUt#Z1ah3C*0PlRujQDr*kaxe6Qx_mPDoKma+LR6d)A85C_CMrc!|lF;Ca zlyyAHef6KJxSqo`6fM8pbpfvrdX&KYykuB|&WEr@Vm_}QSjNba*H64YWBwp?aE-@& zUT3fzg!>sAj*DI9^Lj1{7{+G)5)H(X*OA~N8--(4=+MJ8AoF?M!jjjwJb(6MnFKzr zDOq>dH?+=)&`9e12^JaXppE$v=8?Qgjy%44`~|S%G4VRuT`$r9!#sZqBgf)C%0Ixz z%jb0!OXa_FHN3DDKCi!6s(hY5%QFf;N(7ck<%Fs6&MAEU-NKUB|KKJY=GxmTpx_Xk ze-R-oN!4G~FM+$4-^@IcqhR+om45|{UOulQS?)xmr*YmGdKo&n_HwTeu5ZH|6PoJJ zI?L~a558|>KL3sj-y{D9aB+&v=k;snUHCwe?HBNk*DmvUoy+yS*Xj51B;h0qU74^NPf-`>gJRAOJKl0&!5+Eoy2E9=F{P@DsDp$ zF>-ft=BxKF^S#o&c3n2qHB-PaJxB0&{BHR0eE7KV{@}TL=t3WGv_JDeWK>l6CDWL{ P2N(Fz-gAWet@8g37k2|A -- Gitee From 0dec16361db1981a25c33120fcd0b4c9808aa308 Mon Sep 17 00:00:00 2001 From: binlien Date: Mon, 4 Mar 2024 23:31:23 +0800 Subject: [PATCH 17/62] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=BA=90=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pytorch_adapter/with_setuptools/Makefile | 20 +++ .../with_setuptools/add_adapter.cpp | 128 ++++++++++++++++++ .../with_setuptools/add_kernel.cpp | 106 +++++++++++++++ 3 files changed, 254 insertions(+) create mode 100644 sample/pytorch_adapter/with_setuptools/Makefile create mode 100644 sample/pytorch_adapter/with_setuptools/add_adapter.cpp create mode 100644 sample/pytorch_adapter/with_setuptools/add_kernel.cpp diff --git a/sample/pytorch_adapter/with_setuptools/Makefile b/sample/pytorch_adapter/with_setuptools/Makefile new file mode 100644 index 0000000000..ec9115f377 --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/Makefile @@ -0,0 +1,20 @@ +# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 +ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest + +COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 +COMPILER_FLAG := -xcce -O2 -std=c++17 +DYNAMIC_LIB_FLAG := -fPIC -shared +DAV_FLAG := --cce-aicore-arch=dav-c220-vec +ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 + +all: build + +build: libcustom_kernels.so + +# 后续如果要扩展,把多个kernel的cpp都加到后面 +libcustom_kernels.so: add_kernel.cpp + $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ + +.PHONY: clean +clean: + rm *.so \ No newline at end of file diff --git a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp 
b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp new file mode 100644 index 0000000000..6c65e60ec5 --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp @@ -0,0 +1,128 @@ +#include +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/framework/OpCommand.h" + +using torch::autograd::AutogradContext; +using torch::autograd::Function; +using tensor_list = std::vector; +using namespace at; + +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); + +// 为NPU设备注册前向实现 +at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other) +{ + // 创建输出内存 + at::Tensor result = at::Tensor(self); + // 将pytorch中的结构翻译成为CANN认识的数据类型和结构 + // 1. (重要)通过对tensor的shape分析,选择合适的tiling(该算子为了简化,固定了tiling,只有特定shape下计算才正确) + // 2. 对数据类型和格式转换 -- 此处无需数据格式处理,直接使用 + auto stream = c10_npu::getCurrentNPUStream().stream(false); + auto x = self.storage().data(); + auto y = other.storage().data(); + auto z = result.storage().data(); + + uint32_t blockDim = 8; + auto callback = [stream, blockDim, x, y, z]() -> int { + add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z); + return 0; // 此处可以通过某种方式获取算子执行结果,还未实现 + }; + // 下发算子 + at_npu::native::OpCommand cmd; + cmd.Name("my_add").SetCustomHandler(callback).Run(); + return result; +} + +// 为NPU设备注册反向实现 +std::tuple my_add_backward_impl_npu(const at::Tensor &self) +{ + at::Tensor result = at::Tensor(self); // 创建输出内存 + + return {result, result}; +} + +// 为Meta设备注册前向实现 +at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other) +{ + return empty_like(self); +} + +// 为Meta设备注册反向实现 +std::tuple my_add_backward_impl_meta(const at::Tensor &self) +{ + auto result = empty_like(self); + return std::make_tuple(result, result); +} + +// 寻找注册在该op上的不同设备的实现 +at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other) +{ + static auto op = + torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed(); + return op.call(self, other); +} +// 寻找注册在该op上的不同设备的实现 +std::tuple my_add_backward_impl(const at::Tensor &self) +{ + static auto op = torch::Dispatcher::singleton() + .findSchemaOrThrow("myaten::my_add_backward", "") + .typed(); + return op.call(self); +} + +// 在myaten命名空间里注册my_add和my_add_backward两个schema +TORCH_LIBRARY(myaten, m) +{ + m.def("my_add(Tensor self, Tensor other) -> Tensor"); + m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)"); +} + +// 通过继承torch::autograd::Function类实现前反向绑定 +class MyAddFunction : public torch::autograd::Function { +public: + static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other) + { + at::AutoDispatchBelowADInplaceOrView guard; + return my_add_impl(self, other); + } + + static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) + { + auto grad_output = grad_outputs[0]; + auto result = my_add_backward_impl(grad_output); + return {std::get<0>(result), std::get<1>(result)}; + } +}; + +at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other) +{ + return MyAddFunction::apply(self, other); +} + +// 给op绑定NPU的自动求导实现 +// 如果是pytorch 2.1以下的版本,AutogradPrivateUse1需要改成AutogradXLA +TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_autograd); +} + +// 为NPU设备注册前反向实现 +// NPU设备在pytorch 2.1及以上版本使用的设备名称是PrivateUse1,在2.1以下版本用的是XLA,如果是2.1以下版本PrivateUse1需要改成XLA +TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m) +{ + m.impl("my_add", &my_add_impl_npu); + m.impl("my_add_backward", &my_add_backward_impl_npu); +} + +// 
为Meta设备注册前反向实现 +TORCH_LIBRARY_IMPL(myaten, Meta, m) +{ + m.impl("my_add", &my_add_impl_meta); + m.impl("my_add_backward", &my_add_backward_impl_meta); +} + +// 通过pybind将c++接口和python接口绑定 +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("my_add", &my_add_impl_autograd, "x + y"); +} diff --git a/sample/pytorch_adapter/with_setuptools/add_kernel.cpp b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp new file mode 100644 index 0000000000..9aa62e0936 --- /dev/null +++ b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * + * Function : z = x + y + * This sample is a very basic sample that implements vector add on Ascend plaform. + * In this sample: + * Length of x / y / z is 8*2048. + * Num of vector core used in sample is 8. + * Length for each core to compute is 2048. + * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. + * + */ +#include "kernel_operator.h" +using namespace AscendC; +constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data +constexpr int32_t USE_CORE_NUM = 8; // num of core used +constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core +constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer + +class KernelAdd { +public: + __aicore__ inline KernelAdd() + {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + // get start index for current core, core parallel + xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); + // pipe alloc memory to queue, the unit is Bytes + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + } + __aicore__ inline void Process() + { + // loop count need to be doubled, due to double buffer + constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; + // tiling strategy, pipeline parallel + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + // alloc tensor from queue memory + LocalTensor xLocal = inQueueX.AllocTensor(); + LocalTensor yLocal = inQueueY.AllocTensor(); + // copy progress_th tile from global tensor to local tensor + DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); + DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); + // enque input tensors to VECIN queue + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + // deque input tensors from VECIN queue + LocalTensor xLocal = inQueueX.DeQue(); + LocalTensor yLocal = inQueueY.DeQue(); + LocalTensor zLocal = outQueueZ.AllocTensor(); + // call Add instr for computation + Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // enque the output tensor to VECOUT queue + outQueueZ.EnQue(zLocal); + // free input tensors for reuse + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + // deque output tensor from VECOUT 
queue + LocalTensor zLocal = outQueueZ.DeQue(); + // copy progress_th tile from local tensor to global tensor + DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); + // free output tensor for reuse + outQueueZ.FreeTensor(zLocal); + } + +private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueueX, inQueueY; + // create queue for output, in this case depth is equal to buffer num + TQue outQueueZ; + GlobalTensor xGm, yGm, zGm; +}; +// implementation of kernel function +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} + +// 包裹核函数,使得普通编译器能认识这个符号 +extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom<<>>(x, y, z); +} \ No newline at end of file -- Gitee From 57fa7205e6a9cc359f6b95c74e24bab9ba5600da Mon Sep 17 00:00:00 2001 From: binlien Date: Tue, 5 Mar 2024 00:10:11 +0800 Subject: [PATCH 18/62] =?UTF-8?q?=E5=8A=A0=E5=BC=BAREADME=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=EF=BC=8C=E6=8F=90=E4=BE=9B=E6=80=BB=E4=BD=93=E7=BB=93?= =?UTF-8?q?=E6=9E=84=E7=9A=84=E8=B0=83=E6=95=B4=E5=92=8C=E9=92=88=E5=AF=B9?= =?UTF-8?q?pytorch=20adapter=E7=9A=84=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/README.md | 76 ++++++++++++++++++++------------ sample/pytorch_adapter/README.md | 33 ++++++++++++++ 2 files changed, 82 insertions(+), 27 deletions(-) diff --git a/sample/README.md b/sample/README.md index 167b1a01cb..1f02b0609e 100644 --- a/sample/README.md +++ b/sample/README.md @@ -5,12 +5,61 @@ 如果考虑商用集成,推荐使用CANN软件包中的AscendC样例工程,比如:ascendc_kernel_cmake目录。本项目中的工程就是基于其进行简化仅用于快速验证。 +说明:该sample目录中,每个最小目录就是一个完整的样例工程。这些样例工程本身可能以为依赖的不同存在差异。 + ## 依赖说明 安装CANN包,并使能环境变量,并确保```ASCEND_HOME_PATH```生效,可以在CANN包安装目录下使能: ``` source set_env.sh ``` +## 目录介绍 +整体目录结构如下: +``` +- sample + |- build # 编译并运行所有样例内容(建议按需使用,此处命令可以参考 + |- normal_sample # 纯C/C++的AscendC单算子极简工程,可配合msdebug和msprof工具 + |- cube_only # 仅含aic的AscendC单算子极简工程 + |- mix # mix算子的AscendC单算子极简工程 + |- vec_only # 仅含aiv的AscendC单算子极简工程 + |- pytorch_adapter # 适配pytorch的AscendC单算子极简工程,可配合msdebug和msprof工具 + |- jit_compile # jit模式,运行时编译使用 + |- with_setuptools # 编译成wheel包安装使用 + |- sanitizer_sample # 异常样例,用于配合mssanitizer工具 + |- racecheck # 含竞争问题的样例 + |- xx # 其他异常样例 +``` + +如果你关注算子的框架适配,详见[此处](./pytorch_adapter/README.md) + + +## 算子调试 msdebug +若使用msdebug进行上板调试,还需要额外调整,具体如下: +1. 编译阶段:在```sample\normal_sample\vec_only```相对路径下的```Makefile```文件中修改如下内容: + + 调试信息增强,并扩大栈空间: + ``` + COMPILER_FLAG := -xcce -O2 -std=c++17 + 修改为: + COMPILER_FLAG := -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true + ``` + +2. 运行阶段: +``` +msdebug ./*.fatbin +``` + +## 内存检测 sanitizer +1. 编译阶段:在编译过程中添加```--cce-enable-sanitizer -g```参数, 在链接过程中添加```--cce-enable-sanitizer```参数。(现样例中已在Makefile中添加),执行如下命令: +``` +make +``` + +2. 运行阶段: +``` +mssanitizer ./*.fatbin # 默认进行memcheck检查 +``` + + ## 算子调优 算子调优工具可以支持上板和仿真算子的调优,下面将以vec_only中的算子为例,进行工具使用的实战命令讲解 @@ -84,30 +133,3 @@ source set_env.sh └── trace.json # 算子所有核的流水图 ``` 4. 更多指标信息请参考算子开发工具使用手册。 - -## 算子调试msdebug -若使用msdebug进行上板调试,还需要额外调整,具体如下: -1. 
编译阶段:在```sample\normal_sample\vec_only```相对路径下的```Makefile```文件中修改如下内容: - + 调试信息增强,并扩大栈空间: - ``` - COMPILER_FLAG := -xcce -O2 -std=c++17 - 修改为: - COMPILER_FLAG := -xcce -O0 -std=c++17 -g -mllvm -cce-aicore-function-stack-size=0x8000 -mllvm -cce-aicore-stack-size=0x8000 -mllvm -cce-aicore-jump-expand=true - -## 内存检测 sanitizer -### sanitizer_sample目录介绍 - -此目录下为sanitizer对应的样例库,包含竞争检测和内存检测相关的样例。 - -#### Racecheck目录介绍 - -Racecheck为竞争检测相关的样例。 - -raw_error_kernel.cpp文件为UB上先读后写竞争和GM上先写后读竞争问题的样例。 - - -运行阶段: - -``` -/usr/local/Ascend/ascend-toolkit/latest/tools/mssanitizer/bin/mssanitizer --tool=racecheck ./raw_error.fatbin -``` \ No newline at end of file diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md index e69de29bb2..588401bf34 100644 --- a/sample/pytorch_adapter/README.md +++ b/sample/pytorch_adapter/README.md @@ -0,0 +1,33 @@ +# 算子框架适配说明 + +## 简介 +昇腾提供丰富的算子接入框架的方式,此处将介绍最简单的一种,每个目录中都是一个独立的可使用的工程 + +## 依赖 +与业内pytorch的算子介入方式相同,算子接入框架需要保障设备上有正确的pytorch版本(我们还依赖torch_npu版本) + +pytorch版本可由pip安装,torch_npu版本详见[此处](https://gitee.com/ascend/pytorch/releases),请选择与pytorch适配的torch_npu版本。 + +## 工程使用 + +### jit_compile工程 +执行如下命令,就会在运行过程中,现场生成python模块并使用: +``` +python main.py +``` + +### setuptools工程 +针对with_setuptools工程,可以编译出可安装的wheel包,便于多机部署使用。 + + +1. 执行如下命令可以编译出软件包(setuptools可以支持多种方式,比如:build,install等,此处不一一展示): +``` +pytorch setup.py bdist_wheel # 编译出wheel包,在dist目录下 +``` + +2. 到```dist```目录下用pip命令安装对应软件包。 + +3. 执行测试脚本 +``` +python test.py +``` -- Gitee From 3d4775c95fa69c10228a189776c8cd76afffca74 Mon Sep 17 00:00:00 2001 From: binlien Date: Tue, 5 Mar 2024 00:12:04 +0800 Subject: [PATCH 19/62] . --- sample/README.md | 2 +- sample/pytorch_adapter/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sample/README.md b/sample/README.md index 1f02b0609e..6bd55a2f83 100644 --- a/sample/README.md +++ b/sample/README.md @@ -30,7 +30,7 @@ source set_env.sh |- xx # 其他异常样例 ``` -如果你关注算子的框架适配,详见[此处](./pytorch_adapter/README.md) +如果你关注自定义算子的pytorch框架适配,详见[此处](./pytorch_adapter/README.md) ## 算子调试 msdebug diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md index 588401bf34..4233303b69 100644 --- a/sample/pytorch_adapter/README.md +++ b/sample/pytorch_adapter/README.md @@ -1,4 +1,4 @@ -# 算子框架适配说明 +# 自定义算子的pytorch框架适配说明 ## 简介 昇腾提供丰富的算子接入框架的方式,此处将介绍最简单的一种,每个目录中都是一个独立的可使用的工程 -- Gitee From 3a90ae447c09ad625476cf2b0ea47bb41b99de02 Mon Sep 17 00:00:00 2001 From: lian Date: Tue, 5 Mar 2024 01:31:32 +0000 Subject: [PATCH 20/62] update sample/pytorch_adapter/jit_compile/main.py. 
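
下方改动将 os.system("make") 替换为 subprocess.run;相比 os.system,subprocess 更便于检查返回码、控制执行方式。以下是一个带返回码检查的构建调用示意,仅供参考,其中 build_kernel_lib 为假设的函数名,并非仓库中已有接口:

```
import subprocess


def build_kernel_lib(work_dir: str) -> None:
    # 示意:在指定目录下执行 make,并检查返回码
    result = subprocess.run(["make"], cwd=work_dir, shell=False)
    if result.returncode != 0:
        raise RuntimeError(f"make failed, return code: {result.returncode}")
```

也可以直接给 subprocess.run 传入 check=True,让构建失败时抛出 CalledProcessError。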
Signed-off-by: lian --- sample/pytorch_adapter/jit_compile/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sample/pytorch_adapter/jit_compile/main.py b/sample/pytorch_adapter/jit_compile/main.py index 11f92600d9..a0e17fb072 100644 --- a/sample/pytorch_adapter/jit_compile/main.py +++ b/sample/pytorch_adapter/jit_compile/main.py @@ -1,7 +1,8 @@ import os import torch -import torch.utils.cpp_extension import torch_npu +import torch.utils.cpp_extension +import subprocess from torch_npu.testing.testcase import TestCase, run_tests PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) @@ -9,7 +10,7 @@ CUR_PATH = os.getcwd() def compile_kernels(): - os.system("make") # 由于pytorch中没有device编译的扩展,所以此处人工加make + subprocess.run("make", shell=True) # 由于pytorch中没有昇腾device编译的扩展,所以此处人工加make def compile_host(): -- Gitee From 4891efb139b37e27805c8973b57d7a919d795fb8 Mon Sep 17 00:00:00 2001 From: lian Date: Tue, 5 Mar 2024 01:32:08 +0000 Subject: [PATCH 21/62] update sample/pytorch_adapter/with_setuptools/setup.py. Signed-off-by: lian --- sample/pytorch_adapter/with_setuptools/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py index 8f5ad6d9f8..2c75808fa6 100644 --- a/sample/pytorch_adapter/with_setuptools/setup.py +++ b/sample/pytorch_adapter/with_setuptools/setup.py @@ -1,6 +1,7 @@ import os import torch import torch_npu +import subprocess from setuptools import setup, find_packages from torch.utils.cpp_extension import BuildExtension from torch_npu.utils.cpp_extension import NpuExtension @@ -10,7 +11,7 @@ CUR_PATH = os.getcwd() def compile_kernels(): - os.system("make") # 由于pytorch中没有device编译的扩展,所以此处人工加make + subprocess.run("make", shell=True) # 由于pytorch中没有昇腾device编译的扩展,所以此处人工加make return "libcustom_kernels.so" # 这个make出来的库名字 -- Gitee From 5436a8f5d687e54d79c6571fb1ee9ed4efbeb907 Mon Sep 17 00:00:00 2001 From: lian Date: Tue, 5 Mar 2024 01:50:47 +0000 Subject: [PATCH 22/62] update sample/pytorch_adapter/README.md. Signed-off-by: lian --- sample/pytorch_adapter/README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md index 4233303b69..76a1c55e91 100644 --- a/sample/pytorch_adapter/README.md +++ b/sample/pytorch_adapter/README.md @@ -8,6 +8,23 @@ pytorch版本可由pip安装,torch_npu版本详见[此处](https://gitee.com/ascend/pytorch/releases),请选择与pytorch适配的torch_npu版本。 +## 工程介绍 +整体工程目录如下: +``` +- pytorch_adapter + |- jit_compile # 实时编译的接入方式 + |- add_adapter.cpp # 使用算子动态库接口完成算子在pytorch框架的适配 + |- add_kernel.cpp # 昇腾算子实现,并提供host侧的动态库接口 + |- main.py # python的入口,实现整体集成 + |- Makefile # 用以生成昇腾算子的host侧动态库的编译脚本 + |- with_setuptools # wheel包的接入方式 + |- add_adapter.cpp + |- add_kernel.cpp + |- Makefile + |- setup.py # setuptools的入口,支持编译并打包生成wheel包 + |- test.py # 测试wheel包功能的入口 +``` + ## 工程使用 ### jit_compile工程 -- Gitee From c0ea384ed34fafb384a5e76f2f456614c711ce1d Mon Sep 17 00:00:00 2001 From: lian Date: Tue, 5 Mar 2024 01:56:00 +0000 Subject: [PATCH 23/62] update sample/pytorch_adapter/README.md. 
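
下方 README 改动补充了关于静态 tiling 的说明:当前 kernel 按固定 shape(8*2048、float16)计算。作为补充示意,也可以在 Python 侧调用自定义算子前先做一层简单的输入校验(check_static_tiling 为假设的辅助函数,并非仓库内容;实际的 tiling 分析更适合放在 adapter 或算子实现中):

```
import torch

EXPECTED_SHAPE = (8, 2048)      # 与 add_kernel.cpp 中固定的 tiling 假设对应
EXPECTED_DTYPE = torch.float16


def check_static_tiling(x: torch.Tensor, y: torch.Tensor) -> None:
    # 示意:静态 tiling 下,输入的 shape/dtype 需要与 kernel 的假设匹配
    for t in (x, y):
        if tuple(t.shape) != EXPECTED_SHAPE or t.dtype != EXPECTED_DTYPE:
            raise ValueError(
                f"expect shape {EXPECTED_SHAPE} and dtype {EXPECTED_DTYPE}, "
                f"got {tuple(t.shape)} / {t.dtype}")
```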
Signed-off-by: lian --- sample/pytorch_adapter/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md index 76a1c55e91..a2b1ba6357 100644 --- a/sample/pytorch_adapter/README.md +++ b/sample/pytorch_adapter/README.md @@ -48,3 +48,6 @@ pytorch setup.py bdist_wheel # 编译出wheel包,在dist目录下 ``` python test.py ``` + +## 其他 +1. 此处样例使用的是静态tiling,如果使用动态tiling,则可以在adapter.cpp中对Tensor的shape进行分析,选择合适tiling。(这部分是流程中必须的,只是可能在不同位置,比如aclnn中,这部分在接口实现;此处,我们本身也可以对add_custom_do进行封装,将tiling内置。) \ No newline at end of file -- Gitee From f1cc62777cb2e631a565b88cf53592c4ce220072 Mon Sep 17 00:00:00 2001 From: lian Date: Tue, 5 Mar 2024 03:57:26 +0000 Subject: [PATCH 24/62] update sample/pytorch_adapter/with_setuptools/setup.py. Signed-off-by: lian --- sample/pytorch_adapter/with_setuptools/setup.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py index 2c75808fa6..92ab1d3c78 100644 --- a/sample/pytorch_adapter/with_setuptools/setup.py +++ b/sample/pytorch_adapter/with_setuptools/setup.py @@ -1,26 +1,26 @@ import os +import subprocess import torch import torch_npu -import subprocess from setuptools import setup, find_packages from torch.utils.cpp_extension import BuildExtension from torch_npu.utils.cpp_extension import NpuExtension PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) -CUR_PATH = os.getcwd() +CUR_PATH = os.path.abspath(os.path.dirname(__file__)) def compile_kernels(): - subprocess.run("make", shell=True) # 由于pytorch中没有昇腾device编译的扩展,所以此处人工加make + # 由于pytorch中没有昇腾device编译的扩展,所以此处人工加make + subprocess.run("make") return "libcustom_kernels.so" # 这个make出来的库名字 def compile_adapter(): - exts = [] - ext1 = NpuExtension( + ext = NpuExtension( name="ascend_custom_kernels_lib", # import的库的名字 # 如果还有其他cpp文件参与编译,需要在这里添加 - sources=["./add_adapter.cpp"], + sources=[f"{CUR_PATH}/add_adapter.cpp"], extra_compile_args=[ '-I' + os.path.join(os.path.join(os.path.join(os.path.join( PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc"), @@ -28,8 +28,7 @@ def compile_adapter(): library_dirs=[f"{CUR_PATH}"], # 编译时需要依赖的库文件的路径,相当于g++编译时的-L选项 libraries=["custom_kernels"], # 编译时依赖的库文件,相当于-l选项 ) - exts.append(ext1) - return exts + return [ext] if __name__ == "__main__": -- Gitee From 2d0acfe120c2429b90f4c9233c4f3d41778bf739 Mon Sep 17 00:00:00 2001 From: lian Date: Tue, 5 Mar 2024 03:57:44 +0000 Subject: [PATCH 25/62] update sample/pytorch_adapter/jit_compile/main.py. 
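
结合前面 with_setuptools 工程的 setup.py 与 add_adapter.cpp,安装 wheel 包后调用自定义算子的最小用法示意如下(并非仓库中 test.py 的原文;模块名 ascend_custom_kernels_lib 取自 setup.py 中 NpuExtension 的 name 参数,输入 shape 受 kernel 静态 tiling 限制):

```
import torch
import torch_npu
import ascend_custom_kernels_lib

# 示意:构造与静态 tiling 匹配的输入,调用 pybind 导出的 my_add
x = torch.rand((8, 2048), dtype=torch.float16).npu()
y = torch.rand((8, 2048), dtype=torch.float16).npu()
z = ascend_custom_kernels_lib.my_add(x, y)
print(z.shape, z.dtype)
```

如需验证反向,可与 main.py 的用例一样设置 requires_grad 并对输出调用 backward。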
Signed-off-by: lian --- sample/pytorch_adapter/jit_compile/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sample/pytorch_adapter/jit_compile/main.py b/sample/pytorch_adapter/jit_compile/main.py index a0e17fb072..847a51f1c4 100644 --- a/sample/pytorch_adapter/jit_compile/main.py +++ b/sample/pytorch_adapter/jit_compile/main.py @@ -1,16 +1,17 @@ import os +import subprocess import torch import torch_npu import torch.utils.cpp_extension -import subprocess from torch_npu.testing.testcase import TestCase, run_tests PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) -CUR_PATH = os.getcwd() +CUR_PATH = os.path.abspath(os.path.dirname(__file__)) def compile_kernels(): - subprocess.run("make", shell=True) # 由于pytorch中没有昇腾device编译的扩展,所以此处人工加make + # 由于pytorch中没有昇腾device编译的扩展,所以此处人工加make + subprocess.run("make") def compile_host(): -- Gitee From 98e6560a9086f52349115313a5d710654df38b8f Mon Sep 17 00:00:00 2001 From: user_10012209 <734267852@qq.com> Date: Tue, 5 Mar 2024 16:14:15 +0800 Subject: [PATCH 26/62] =?UTF-8?q?[filer\compare=5Ftools]=E6=80=A7=E8=83=BD?= =?UTF-8?q?=E6=AF=94=E5=AF=B9=E5=B7=A5=E5=85=B7=E6=AF=94=E5=AF=B9=E7=BB=93?= =?UTF-8?q?=E6=9E=9C=E8=A1=A5=E5=85=85=E5=90=8C=E6=AD=A5=E6=97=B6=E9=97=B4?= =?UTF-8?q?=E5=AD=97=E6=AE=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/compare_tools/README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/profiler/compare_tools/README.md b/profiler/compare_tools/README.md index 17d26d07e2..3793679b39 100644 --- a/profiler/compare_tools/README.md +++ b/profiler/compare_tools/README.md @@ -131,19 +131,19 @@ python performance_compare.py [基准性能数据文件] [比对性能数据文 总体性能比对结果以打屏的形式呈现。 -| 字段 | 说明 | -| ------------------------------- | ------------------------------------------------------------ | -| Cube Time(Num) | Cube算子总耗时,Num表示计算的次数。 | -| Vector Time(Num) | Vector算子总耗时,Num表示计算的次数。 | -| Other Time | AI CPU、DSA等其他非cube vector算子耗时。 | -| Flash Attention Time(Forward) | Flash Attention算子前向耗时。 | -| Flash Attention Time(Backward) | Flash Attention算子反向耗时。 | -| Computing Time | 计算流耗时,计算流所有event耗时总和。如果有多条并发计算,计算流耗时对重叠部分只会计算一次。 | -| Mem Usage | 内存使用。gpu上的内存使用可以使用nvidia-smi查看,npu上的内存使用可以使用npu-smi查看,Profiling信息采集时打开profile_memory=True开关,mem usage显示的是memory_record里面的最大resevered值,一般来说是进程级内存。 | -| Uncovered Communication Time | 通信未掩盖耗时。 | -| SDMA Time(Num) | 拷贝类任务耗时,Num表示计算的次数。 | -| Free Time | 调度耗时 = E2E耗时 - 算子耗时 - 通信不可掩盖耗时。Free的定义为Device侧既不在通信又不在计算的时间,因此包含拷贝时间(SDMA Time)。 | -| E2E Time(Not minimal profiling) | E2E总耗时,计算流端到端耗时。当存在Not minimal profiling时,表示该时间存在性能膨胀,会影响通信和调度耗时。 | +| 字段 | 说明 | +| --------------------------------------- | ------------------------------------------------------------ | +| Cube Time(Num) | Cube算子总耗时,Num表示计算的次数。 | +| Vector Time(Num) | Vector算子总耗时,Num表示计算的次数。 | +| Other Time | AI CPU、DSA等其他非cube vector算子耗时。 | +| Flash Attention Time(Forward) | Flash Attention算子前向耗时。 | +| Flash Attention Time(Backward) | Flash Attention算子反向耗时。 | +| Computing Time | 计算流耗时,计算流所有event耗时总和。如果有多条并发计算,计算流耗时对重叠部分只会计算一次。 | +| Mem Usage | 内存使用。gpu上的内存使用可以使用nvidia-smi查看,npu上的内存使用可以使用npu-smi查看,Profiling信息采集时打开profile_memory=True开关,mem usage显示的是memory_record里面的最大resevered值,一般来说是进程级内存。 | +| Uncovered Communication Time(Wait Time) | 通信未掩盖耗时,包含Wait Time为同步时间。 | +| SDMA Time(Num) | 拷贝类任务耗时,Num表示计算的次数。 | +| Free Time | 调度耗时 = E2E耗时 - 算子耗时 - 通信不可掩盖耗时。Free的定义为Device侧既不在通信又不在计算的时间,因此包含拷贝时间(SDMA Time)。 | +| E2E Time(Not 
minimal profiling) | E2E总耗时,计算流端到端耗时。当存在Not minimal profiling时,表示该时间存在性能膨胀,会影响通信和调度耗时。 | 可以采取最简性能数据采集的方式来减少E2E耗时的性能膨胀,示例代码如下: -- Gitee From 84a318c14fedd47edf111253ffc064e890613fcc Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Thu, 22 Feb 2024 11:08:00 +0800 Subject: [PATCH 27/62] nn_module_compare --- .../comparator/module_statistic_comparator.py | 45 +++++ .../compare_bean/module_statistic_bean.py | 98 +++++++++++ .../origin_data_bean/trace_event_bean.py | 15 ++ .../compare_backend/data_prepare/__init__.py | 0 .../data_prepare/module_data_prepare.py | 97 +++++++++++ .../data_prepare/operator_data_prepare.py | 19 +++ .../generator/detail_performance_generator.py | 124 ++++++-------- .../profiling_parser/base_profiling_parser.py | 24 +++ .../profiling_parser/gpu_profiling_parser.py | 3 + .../profiling_parser/npu_profiling_parser.py | 5 +- .../compare_backend/utils/common_func.py | 55 ++++++ .../compare_backend/utils/constant.py | 2 + .../compare_backend/utils/excel_config.py | 39 ++++- .../compare_backend/utils/module_node.py | 160 ++++++++++++++++++ .../compare_backend/utils/name_function.py | 9 + .../compare_backend/utils/torch_op_node.py | 11 +- .../compare_backend/utils/tree_builder.py | 33 +++- .../view/work_sheet_creator.py | 16 +- 18 files changed, 669 insertions(+), 86 deletions(-) create mode 100644 profiler/compare_tools/compare_backend/comparator/module_statistic_comparator.py create mode 100644 profiler/compare_tools/compare_backend/compare_bean/module_statistic_bean.py create mode 100644 profiler/compare_tools/compare_backend/data_prepare/__init__.py create mode 100644 profiler/compare_tools/compare_backend/data_prepare/module_data_prepare.py create mode 100644 profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py create mode 100644 profiler/compare_tools/compare_backend/utils/module_node.py diff --git a/profiler/compare_tools/compare_backend/comparator/module_statistic_comparator.py b/profiler/compare_tools/compare_backend/comparator/module_statistic_comparator.py new file mode 100644 index 0000000000..e09108f3cb --- /dev/null +++ b/profiler/compare_tools/compare_backend/comparator/module_statistic_comparator.py @@ -0,0 +1,45 @@ +from collections import OrderedDict + +from compare_backend.comparator.base_comparator import BaseComparator +from compare_backend.utils.common_func import update_order_id + + +class ModuleStatisticComparator(BaseComparator): + def __init__(self, origin_data: list, bean: any): + super().__init__(origin_data, bean) + + def _compare(self): + if not self._origin_data: + return + base_module_dict, comparison_module_dict = self._group_by_module_name() + for module_name, base_data in base_module_dict.items(): + comparison_data = comparison_module_dict.pop(module_name, []) + self._rows.extend(self._bean(module_name, base_data, comparison_data).rows) + for module_name, comparison_data in comparison_module_dict.items(): + self._rows.extend(self._bean(module_name, [], comparison_data).rows) + update_order_id(self._rows) + + def _group_by_module_name(self): + base_module_dict, comparison_module_dict = OrderedDict(), OrderedDict() + base_all_data = [data for data in self._origin_data if data[0]] # index 0 for base module + base_all_data.sort(key=lambda x: x[0].start_time) + base_none_data = [data for data in self._origin_data if not data[0]] # index 0 for base module + base_none_data.sort(key=lambda x: x[1].start_time) + index = 0 + for base_module, comparison_module in base_all_data: + 
base_module_dict.setdefault(base_module.module_name, []).append(base_module) + if not comparison_module: + continue + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + if module.start_time < comparison_module.start_time: + comparison_module_dict.setdefault(module.module_name, []).append(module) + index += 1 + else: + break + comparison_module_dict.setdefault(comparison_module.module_name, []).append(comparison_module) + while index < len(base_none_data): + module = base_none_data[index][1] # index 1 for comparison module + comparison_module_dict.setdefault(module.module_name, []).append(module) + index += 1 + return base_module_dict, comparison_module_dict diff --git a/profiler/compare_tools/compare_backend/compare_bean/module_statistic_bean.py b/profiler/compare_tools/compare_backend/compare_bean/module_statistic_bean.py new file mode 100644 index 0000000000..97fc98bdd3 --- /dev/null +++ b/profiler/compare_tools/compare_backend/compare_bean/module_statistic_bean.py @@ -0,0 +1,98 @@ +import re + +from compare_backend.utils.common_func import calculate_diff_ratio +from compare_backend.utils.constant import Constant +from compare_backend.utils.excel_config import ExcelConfig + + +class ModuleStatisticBean: + TABLE_NAME = Constant.MODULE_TOP_TABLE + HEADERS = ExcelConfig.HEADERS.get(TABLE_NAME) + OVERHEAD = ExcelConfig.OVERHEAD.get(TABLE_NAME) + + def __init__(self, name: str, base_data: list, comparison_data: list): + self._module_name = name.replace("nn.Module:", "") + pattern = re.compile('_[0-9]+$') + self._module_class = pattern.sub('', name.split("/")[-1]) + self._module_level = name.count("/") + self._base_info = ModuleStatisticInfo(base_data) + self._comparison_info = ModuleStatisticInfo(comparison_data) + + @property + def rows(self): + rows = [self.get_total_row()] + rows.extend(self.get_detail_rows()) + return rows + + @staticmethod + def _get_kernel_detail_rows(base_kernel_dict, com_kernel_dict): + base_kernel_detals = "" + com_kernel_details = "" + for kernel_name, base_dur_list in base_kernel_dict.items(): + base_dur = "%.3f" % sum(base_dur_list) + base_kernel_detals += f"{kernel_name}, [number: {len(base_dur_list)}], [duration_ms: {base_dur}]\n" + for kernel_name, com_dur_list in com_kernel_dict.items(): + com_dur = "%.3f" % sum(com_dur_list) + com_kernel_details += f"{kernel_name}, [number: {len(com_dur_list)}], [duration_ms: {com_dur}]\n" + return [base_kernel_detals, com_kernel_details] + + def get_total_row(self): + total_diff, total_ratio = calculate_diff_ratio(self._base_info.device_total_dur_ms, + self._comparison_info.device_total_dur_ms) + self_diff, _ = calculate_diff_ratio(self._base_info.device_self_dur_ms, + self._comparison_info.device_self_dur_ms) + row = [None, self._module_class, self._module_level, self._module_name, "[ TOTAL ]", None, + self._base_info.device_self_dur_ms, self._base_info.number, self._base_info.device_total_dur_ms, + None, self._comparison_info.device_self_dur_ms, self._comparison_info.number, + self._comparison_info.device_total_dur_ms, total_diff, self_diff, + total_ratio, self._base_info.call_stack, self._comparison_info.call_stack] + return row + + def get_detail_rows(self): + rows = [] + for op_name, base_dur_dict in self._base_info.api_dict.items(): + base_dur_list = base_dur_dict.get("total", []) + com_dur_dict = self._comparison_info.api_dict.pop(op_name, {}) + com_dur_list = com_dur_dict.get("total", []) + base_kernel_detals, com_kernel_details = 
self._get_kernel_detail_rows(base_dur_dict.get("detail", {}), + com_dur_dict.get("detail", {})) + self_diff, self_ratio = calculate_diff_ratio(sum(base_dur_list), sum(com_dur_list)) + row = [None, self._module_class, self._module_level, self._module_name, op_name, base_kernel_detals, + sum(base_dur_list), len(base_dur_list), None, com_kernel_details, sum(com_dur_list), + len(com_dur_list), None, None, self_diff, self_ratio, None, None] + rows.append(row) + + for op_name, com_dur_dict in self._comparison_info.api_dict.items(): + com_dur_list = com_dur_dict.get("total", []) + base_kernel_detals, com_kernel_details = self._get_kernel_detail_rows({}, com_dur_dict.get("detail", {})) + self_diff, self_ratio = calculate_diff_ratio(0, sum(com_dur_list)) + row = [None, self._module_class, self._module_level, self._module_name, op_name, base_kernel_detals, 0, 0, + None, com_kernel_details, sum(com_dur_list), len(com_dur_list), None, None, self_diff, + self_ratio, None, None] + rows.append(row) + return rows + + +class ModuleStatisticInfo: + def __init__(self, data_list: list): + self._data_list = data_list + self.device_self_dur_ms = 0 + self.device_total_dur_ms = 0 + self.call_stack = "" + self.number = len(data_list) + self.api_dict = {} + self._get_info() + + def _get_info(self): + if self._data_list: + self.call_stack = self._data_list[0].call_stack + for module in self._data_list: + self.device_self_dur_ms += module.device_self_dur / Constant.US_TO_MS + self.device_total_dur_ms += module.device_total_dur / Constant.US_TO_MS + for torch_op in module.toy_layer_api_list: + self.api_dict.setdefault(torch_op.name, {}).setdefault("total", []).append( + torch_op.device_dur / Constant.US_TO_MS) + for kernel in torch_op.kernel_list: + self.api_dict.setdefault(torch_op.name, {}).setdefault("detail", {}).setdefault(kernel.kernel_name, + []).append( + kernel.device_dur / Constant.US_TO_MS) diff --git a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py index 6ce91ba53c..b3491cfb13 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py +++ b/profiler/compare_tools/compare_backend/compare_bean/origin_data_bean/trace_event_bean.py @@ -187,6 +187,21 @@ class TraceEventBean: return True return False + def is_python_function(self): + return self.lower_cat == "python_function" + + def is_optimizer(self): + return self.lower_name.startswith("optimizer") + + def is_fwdbwd(self): + return self.lower_cat == "fwdbwd" + + def is_step_profiler(self): + return self.name.find("ProfilerStep#") != -1 + + def reset_name(self, name): + self._name = name + def init(self): if isinstance(self._event, dict): self._pid = self._event.get("pid", 0) diff --git a/profiler/compare_tools/compare_backend/data_prepare/__init__.py b/profiler/compare_tools/compare_backend/data_prepare/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/compare_tools/compare_backend/data_prepare/module_data_prepare.py b/profiler/compare_tools/compare_backend/data_prepare/module_data_prepare.py new file mode 100644 index 0000000000..6d45b98dd7 --- /dev/null +++ b/profiler/compare_tools/compare_backend/data_prepare/module_data_prepare.py @@ -0,0 +1,97 @@ +from queue import Queue + +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult 
+from compare_backend.utils.constant import Constant +from compare_backend.utils.module_node import ModuleNode +from compare_backend.utils.tree_builder import TreeBuilder + + +class ModuleDataPrepare: + def __init__(self, profiling_data: ProfilingResult): + self.profiling_data = profiling_data + self._nn_module_list = [] + self._call_function = [] + for event in profiling_data.python_function_data: + if event.lower_name.startswith("nn.module:"): + self._nn_module_list.append(event) + else: + self._call_function.append(event) + self._bwd_dict = {} + self._bwd_pid = self._get_bwd_pid() + + @staticmethod + def update_module_node_info(fwd_root_node, bwd_root_node, func_root_node): + queue = Queue() + queue.put(fwd_root_node) + queue.put(bwd_root_node) + while not queue.empty(): + module_node = queue.get() + module_node.update_torch_op_kernel_list() + call_function = func_root_node.find_module_call(module_node.start_time) + if call_function: + module_node.reset_call_stack(call_function.call_stack) + for sub_module_node in module_node.child_nodes: + queue.put(sub_module_node) + + def build_module_tree(self): + if not self._nn_module_list: + return [None, None] + self._dispatch_torch_op() + event_list = [TraceEventBean({"ts": ts}) for ts in self.profiling_data.kernel_dict.keys()] + self._nn_module_list.extend(event_list) + root_node = TreeBuilder.build_module_tree(self._nn_module_list, self.profiling_data.kernel_dict) + func_root_node = TreeBuilder.build_module_tree(self._call_function, {}) + bwd_module_list = self.get_bwd_module(root_node) + if bwd_module_list: + bwd_module_list.extend(event_list) + bwd_root_node = TreeBuilder.build_module_tree(bwd_module_list, self.profiling_data.kernel_dict) + self.match_torch_op(root_node, bwd_root_node) + self.update_module_node_info(root_node, bwd_root_node, func_root_node) + return [root_node, bwd_root_node] + + def get_bwd_module(self, root_node: ModuleNode): + bwd_module_list = [] + for flow in self.profiling_data.fwdbwd_dict.values(): + start_point = flow.get("start") + end_point = flow.get("end") + if not start_point or not end_point: + continue + end_event = self._bwd_dict.get(end_point.start_time) + if not end_event: + continue + call_module = root_node.find_module_call(start_point.start_time) + if call_module: + end_event.reset_name(f"[ BACKWARD ]{call_module.module_name}") + bwd_module_list.append(end_event) + return bwd_module_list + + def match_torch_op(self, fwd_root_node, bwd_root_node): + torch_op_list = sorted(self.profiling_data.torch_op_data, key=lambda x: x.start_time) + for torch_op in torch_op_list: + if torch_op.is_optimizer(): + continue + if torch_op.is_step_profiler(): + continue + matched_module = fwd_root_node.find_module_call(torch_op.start_time) + if matched_module: + matched_module.find_torch_op_call(torch_op) + continue + matched_module = bwd_root_node.find_module_call(torch_op.start_time) + if matched_module: + matched_module.find_torch_op_call(torch_op) + + def _dispatch_torch_op(self): + for torch_op in self.profiling_data.torch_op_data: + if torch_op.is_optimizer(): + self._nn_module_list.append(torch_op) + continue + if torch_op.pid == self._bwd_pid: + self._bwd_dict[torch_op.start_time] = torch_op + + def _get_bwd_pid(self): + for flow in self.profiling_data.fwdbwd_dict.values(): + end_point = flow.get("end") + if end_point: + return end_point.pid + return Constant.INVALID_VALUE diff --git a/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py 
b/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py new file mode 100644 index 0000000000..fdce23c6ab --- /dev/null +++ b/profiler/compare_tools/compare_backend/data_prepare/operator_data_prepare.py @@ -0,0 +1,19 @@ +from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult +from compare_backend.utils.tree_builder import TreeBuilder + + +class OperatorDataPrepare: + def __init__(self, profiling_data: ProfilingResult): + self.profiling_data = profiling_data + + def get_top_layer_ops(self) -> any: + root_node = TreeBuilder.build_tree(self.profiling_data.torch_op_data, self.profiling_data.kernel_dict, + self.profiling_data.memory_list) + level1_child_nodes = root_node.child_nodes + result_data = [] + for level1_node in level1_child_nodes: + if level1_node.is_step_profiler(): + result_data.extend(level1_node.child_nodes) + else: + result_data.append(level1_node) + return result_data diff --git a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py index 72ce3ba868..4ae6ebcec2 100644 --- a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py +++ b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py @@ -1,23 +1,26 @@ import os from collections import deque from datetime import datetime - -import numpy as np +from queue import Queue from compare_backend.comparator.communication_comparator import CommunicationComparator +from compare_backend.comparator.module_statistic_comparator import ModuleStatisticComparator from compare_backend.comparator.operator_comparator import OperatorComparator from compare_backend.comparator.operator_statistic_comparator import OperatorStatisticComparator from compare_backend.compare_bean.communication_bean import CommunicationBean from compare_backend.compare_bean.memory_compare_bean import MemoryCompareBean from compare_backend.compare_bean.memory_statistic_bean import MemoryStatisticBean +from compare_backend.compare_bean.module_statistic_bean import ModuleStatisticBean from compare_backend.compare_bean.operator_compare_bean import OperatorCompareBean from compare_backend.compare_bean.operator_statistic_bean import OperatorStatisticBean +from compare_backend.data_prepare.module_data_prepare import ModuleDataPrepare +from compare_backend.data_prepare.operator_data_prepare import OperatorDataPrepare from compare_backend.generator.base_generator import BaseGenerator -from compare_backend.profiling_parser.base_profiling_parser import ProfilingResult +from compare_backend.utils.common_func import longest_common_subsequence_matching from compare_backend.utils.constant import Constant +from compare_backend.utils.module_node import ModuleNode from compare_backend.utils.name_function import NameFunction from compare_backend.utils.torch_op_node import TorchOpNode -from compare_backend.utils.tree_builder import TreeBuilder from compare_backend.view.excel_view import ExcelView @@ -46,6 +49,8 @@ class DetailPerformanceGenerator(BaseGenerator): comparator_list = [] if self._args.enable_operator_compare or self._args.enable_memory_compare: op_compare_result = self.match_torch_op() + if self._args.enable_operator_compare: + module_compare_result = self.match_nn_module() if self._args.enable_communication_compare: communication_data = { @@ -56,87 +61,24 @@ class DetailPerformanceGenerator(BaseGenerator): if self._args.enable_operator_compare: 
comparator_list.append(OperatorComparator(op_compare_result, OperatorCompareBean)) comparator_list.append(OperatorStatisticComparator(op_compare_result, OperatorStatisticBean)) - + if module_compare_result: + comparator_list.append(ModuleStatisticComparator(module_compare_result, ModuleStatisticBean)) if self._args.enable_memory_compare: comparator_list.append(OperatorComparator(op_compare_result, MemoryCompareBean)) comparator_list.append(OperatorStatisticComparator(op_compare_result, MemoryStatisticBean)) return comparator_list def match_torch_op(self) -> list: - base_ops = self._get_top_layer_ops(self._profiling_data_dict.get(Constant.BASE_DATA)) - comparison_ops = self._get_top_layer_ops(self._profiling_data_dict.get(Constant.COMPARISON_DATA)) + base_ops = OperatorDataPrepare(self._profiling_data_dict.get(Constant.BASE_DATA)).get_top_layer_ops() + comparison_ops = OperatorDataPrepare( + self._profiling_data_dict.get(Constant.COMPARISON_DATA)).get_top_layer_ops() if not base_ops and not comparison_ops: return [] name_func = NameFunction(self._args).get_name_func() - compare_result_data = self._matching_op(base_ops, comparison_ops, name_func) + op_compare_result = longest_common_subsequence_matching(base_ops, comparison_ops, name_func) if self._args.max_kernel_num is not None: - compare_result_data = self._drill_down(compare_result_data, name_func) - return compare_result_data - - @classmethod - def _matching_op(cls, base_ops: list, comparison_ops: list, name_func: any) -> list: - if not comparison_ops: - result_data = [None] * len(base_ops) - for index, value in enumerate(base_ops): - result_data[index] = [value, None] - return result_data - - result_data = [] - comparison_len, base_len = len(comparison_ops), len(base_ops) - dp = [[0] * (base_len + 1) for _ in range(comparison_len + 1)] - for comparison_index in range(1, comparison_len + 1): - for base_index in range(1, base_len + 1): - if name_func(base_ops[base_index - 1]) == name_func( - comparison_ops[comparison_index - 1]): - dp[comparison_index][base_index] = dp[comparison_index - 1][base_index - 1] + 1 - else: - dp[comparison_index][base_index] = max(dp[comparison_index][base_index - 1], - dp[comparison_index - 1][base_index]) - matched_op = [] - comparison_index, base_index = comparison_len, base_len - while comparison_index > 0 and base_index > 0: - if name_func(base_ops[base_index - 1]) == name_func( - comparison_ops[comparison_index - 1]): - matched_op.append([comparison_index - 1, base_index - 1]) - comparison_index -= 1 - base_index -= 1 - continue - if dp[comparison_index][base_index - 1] > dp[comparison_index - 1][base_index]: - base_index -= 1 - else: - comparison_index -= 1 - if not matched_op: - matched_base_index_list = [] - else: - matched_op.reverse() - matched_op = np.array(matched_op) - matched_base_index_list = list(matched_op[:, 1]) - curr_comparison_index = 0 - for base_index, base_api_node in enumerate(base_ops): - if base_index not in matched_base_index_list: - result_data.append([base_api_node, None]) - continue - matched_comparison_index = matched_op[matched_base_index_list.index(base_index), 0] - for comparison_index in range(curr_comparison_index, matched_comparison_index): - result_data.append([None, comparison_ops[comparison_index]]) - result_data.append([base_api_node, comparison_ops[matched_comparison_index]]) - curr_comparison_index = matched_comparison_index + 1 - if curr_comparison_index < len(comparison_ops): - for comparison_index in range(curr_comparison_index, len(comparison_ops)): - 
result_data.append([None, comparison_ops[comparison_index]]) - return result_data - - def _get_top_layer_ops(self, profiling_data: ProfilingResult) -> any: - root_node = TreeBuilder.build_tree(profiling_data.torch_op_data, profiling_data.kernel_dict, - profiling_data.memory_list) - level1_child_nodes = root_node.child_nodes - result_data = [] - for level1_node in level1_child_nodes: - if level1_node.is_step_profiler(): - result_data.extend(level1_node.child_nodes) - else: - result_data.append(level1_node) - return result_data + op_compare_result = self._drill_down(op_compare_result, name_func) + return op_compare_result def _drill_down(self, compare_result_data: list, name_func: any) -> list: drill_down_result = [] @@ -152,9 +94,39 @@ class DetailPerformanceGenerator(BaseGenerator): if max(base_op.kernel_num, comparison_op.kernel_num) <= self._args.max_kernel_num: drill_down_result.append(match_data) continue - match_list = self._matching_op(base_op.child_nodes, comparison_op.child_nodes, name_func) + match_list = longest_common_subsequence_matching(base_op.child_nodes, comparison_op.child_nodes, name_func) match_list.reverse() for data in match_list: op_deque.append(data) return drill_down_result + + def match_nn_module(self) -> list: + module_compare_result = [] + base_root_node = ModuleDataPrepare(self._profiling_data_dict.get(Constant.BASE_DATA)).build_module_tree() + comparison_root_node = ModuleDataPrepare( + self._profiling_data_dict.get(Constant.COMPARISON_DATA)).build_module_tree() + for index, base_node in enumerate(base_root_node): + comparison_node = comparison_root_node[index] if index < len(comparison_root_node) else None + module_compare_result.extend(self._matching_all_modules(base_node, comparison_node)) + return module_compare_result + + def _matching_all_modules(self, base_node: ModuleNode, comparison_node: ModuleNode): + all_matched_modules = [] + matched_queue = Queue() + matched_queue.put([base_node, comparison_node]) + while not matched_queue.empty(): + matched_base_node, matched_comparison_node = matched_queue.get() + matched_node_list = self._matching_common_subsequence(matched_base_node, matched_comparison_node) + all_matched_modules.extend(matched_node_list) + for matched_node in matched_node_list: + matched_queue.put(matched_node) + return all_matched_modules + + def _matching_common_subsequence(self, base_node: ModuleNode, comparison_node: ModuleNode): + base_modules = base_node.child_nodes if base_node else [] + comparison_modules = comparison_node.child_nodes if comparison_node else [] + if not base_modules and not comparison_modules: + return [] + name_func = NameFunction(self._args).get_module_name + return longest_common_subsequence_matching(base_modules, comparison_modules, name_func) diff --git a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py index 4c0b51272b..3adcaa99d6 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/base_profiling_parser.py @@ -18,11 +18,19 @@ class ProfilingResult: self.memory_list = [] self.communication_dict = {} self.overall_metrics = ProfilingInfo(profiling_type) + self.python_function_data = [] + self.fwdbwd_dict = {} def update_torch_op_data(self, event: TraceEventBean): event.is_torch_op = True self.torch_op_data.append(event) + def update_python_function_data(self, event: TraceEventBean): + 
self.python_function_data.append(event) + + def update_fwdbwd_data(self, flow_type: str, event: TraceEventBean): + self.fwdbwd_dict.setdefault(event.id, {})[flow_type] = event + def update_kernel_dict(self, start_time: Decimal, kernel_event: TraceEventBean): self.kernel_dict.setdefault(start_time, []).append(KernelEvent(kernel_event, self._profiling_type)) @@ -53,6 +61,7 @@ class BaseProfilingParser(ABC): self._result_data = ProfilingResult(self._profiling_type) self._memory_events = [] self._flow_dict = {} + self._fwdbwd_dict = {} self._all_kernels = {} self._comm_task_list = [] self._comm_list = [] @@ -134,6 +143,21 @@ class BaseProfilingParser(ABC): return True return False + def _picking_python_function_event(self, event: TraceEventBean): + if event.is_python_function(): + self._result_data.update_python_function_data(event) + return True + return False + + def _picking_fwdbwd_flow_event(self, event: TraceEventBean): + if event.is_fwdbwd(): + if event.is_flow_start(): + self._result_data.update_fwdbwd_data("start", event) + elif event.is_flow_end(): + self._result_data.update_fwdbwd_data("end", event) + return True + return False + def _update_kernel_dict(self): if self._profiling_type == Constant.NPU: for comm in self._comm_list: diff --git a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py index 2ad2e1a557..77785d0261 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py @@ -136,6 +136,9 @@ class GPUProfilingParser(BaseProfilingParser): func_set.add(self._picking_torch_op_event) if self._enable_communication_compare: func_set.add(self._picking_kernel_event) + if self._enable_operator_compare: + func_set.add(self._picking_python_function_event) + func_set .add(self._picking_fwdbwd_flow_event) if self._enable_operator_compare or self._args.max_kernel_num: func_set.add(self._picking_kernel_event) func_set.add(self._picking_flow_event) diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py index f872e52a53..1e2f115934 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py @@ -41,6 +41,9 @@ class NPUProfilingParser(BaseProfilingParser): if self._enable_operator_compare or self._args.max_kernel_num: func_list.add(self._picking_kernel_event) func_list.add(self._picking_flow_event) + if self._enable_operator_compare: + func_list.add(self._picking_python_function_event) + func_list.add(self._picking_fwdbwd_flow_event) if self._enable_memory_compare: func_list.add(self._picking_task_queue_data) if self._enable_communication_compare: @@ -235,7 +238,7 @@ class NPUProfilingParser(BaseProfilingParser): sdma_dict.setdefault(stream_id, []).append(event.dur) elif event.is_compute_event(): ai_core_stream.add(stream_id) - compute_stream = event_wait_stream & ai_core_stream + compute_stream = event_wait_stream & ai_core_stream if event_wait_stream else ai_core_stream for stream in compute_stream: dur_list = sdma_dict.get(stream, []) self._result_data.overall_metrics.update_sdma_info(sum(dur_list), len(dur_list)) diff --git a/profiler/compare_tools/compare_backend/utils/common_func.py 
b/profiler/compare_tools/compare_backend/utils/common_func.py index 26584626cd..f8f0af97b1 100644 --- a/profiler/compare_tools/compare_backend/utils/common_func.py +++ b/profiler/compare_tools/compare_backend/utils/common_func.py @@ -1,5 +1,7 @@ from decimal import Decimal +import numpy as np + def calculate_diff_ratio(base_value: float, comparison_value: float): if not base_value and not comparison_value: @@ -31,3 +33,56 @@ def convert_to_decimal(data: any) -> Decimal: print('[ERROR] Invalid profiling data which failed to convert data to decimal.') return 0.0 return decimal_value + + +def longest_common_subsequence_matching(base_ops: list, comparison_ops: list, name_func: any) -> list: + if not comparison_ops: + result_data = [None] * len(base_ops) + for index, value in enumerate(base_ops): + result_data[index] = [value, None] + return result_data + + result_data = [] + comparison_len, base_len = len(comparison_ops), len(base_ops) + dp = [[0] * (base_len + 1) for _ in range(comparison_len + 1)] + for comparison_index in range(1, comparison_len + 1): + for base_index in range(1, base_len + 1): + if name_func(base_ops[base_index - 1]) == name_func( + comparison_ops[comparison_index - 1]): + dp[comparison_index][base_index] = dp[comparison_index - 1][base_index - 1] + 1 + else: + dp[comparison_index][base_index] = max(dp[comparison_index][base_index - 1], + dp[comparison_index - 1][base_index]) + matched_op = [] + comparison_index, base_index = comparison_len, base_len + while comparison_index > 0 and base_index > 0: + if name_func(base_ops[base_index - 1]) == name_func( + comparison_ops[comparison_index - 1]): + matched_op.append([comparison_index - 1, base_index - 1]) + comparison_index -= 1 + base_index -= 1 + continue + if dp[comparison_index][base_index - 1] > dp[comparison_index - 1][base_index]: + base_index -= 1 + else: + comparison_index -= 1 + if not matched_op: + matched_base_index_list = [] + else: + matched_op.reverse() + matched_op = np.array(matched_op) + matched_base_index_list = list(matched_op[:, 1]) + curr_comparison_index = 0 + for base_index, base_api_node in enumerate(base_ops): + if base_index not in matched_base_index_list: + result_data.append([base_api_node, None]) + continue + matched_comparison_index = matched_op[matched_base_index_list.index(base_index), 0] + for comparison_index in range(curr_comparison_index, matched_comparison_index): + result_data.append([None, comparison_ops[comparison_index]]) + result_data.append([base_api_node, comparison_ops[matched_comparison_index]]) + curr_comparison_index = matched_comparison_index + 1 + if curr_comparison_index < len(comparison_ops): + for comparison_index in range(curr_comparison_index, len(comparison_ops)): + result_data.append([None, comparison_ops[comparison_index]]) + return result_data diff --git a/profiler/compare_tools/compare_backend/utils/constant.py b/profiler/compare_tools/compare_backend/utils/constant.py index d44f9fea93..d021a730c7 100644 --- a/profiler/compare_tools/compare_backend/utils/constant.py +++ b/profiler/compare_tools/compare_backend/utils/constant.py @@ -53,6 +53,8 @@ class Constant(object): MEMORY_TOP_TABLE = "MemoryCompareStatistic" COMMUNICATION_TABLE = "CommunicationCompare" PERFORMANCE_TABLE = "Model Profiling Time Distribution" + MODULE_TABLE = "ModuleCompare" + MODULE_TOP_TABLE = "ModuleCompareStatistic" # memory SIZE = "Size(KB)" diff --git a/profiler/compare_tools/compare_backend/utils/excel_config.py b/profiler/compare_tools/compare_backend/utils/excel_config.py index 
50b2e6329e..965c229ede 100644 --- a/profiler/compare_tools/compare_backend/utils/excel_config.py +++ b/profiler/compare_tools/compare_backend/utils/excel_config.py @@ -14,6 +14,10 @@ class CellFormatType: 'bold': True} # 字符串,无背景色,字体加粗 BLUE_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.BLUE_COLOR, 'align': 'left', 'valign': 'vcenter', 'bold': True, 'border': True} # 蓝色背景,加粗 + GREEN_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.GREEN_COLOR, 'align': 'left', + 'valign': 'vcenter', 'bold': True, 'border': True} # 绿色背景,加粗 + YELLOW_BOLD = {"font_name": "Arial", 'font_size': 11, 'fg_color': Constant.YELLOW_COLOR, 'align': 'left', + 'valign': 'vcenter', 'bold': True, 'border': True} # 黄色背景,加粗 class ExcelConfig(object): @@ -46,6 +50,19 @@ class ExcelConfig(object): AVG_DURATION = "Avg Duration(us)" MAX_DURATION = "Max Duration(us)" MIN_DURATION = "Min Duration(us)" + MODULE_CLASS = "Module Class" + MODULE_NAME = "Module Name" + HOST_SELF_TIME = "Host Self Time(ms)" + HOST_TOTAL_TIME = "Host Total Time(ms)" + DEVICE_SELF_TIME = "Device Self Time(ms)" + DEVICE_TOTAL_TIME = "Device Total Time(ms)" + DIFF_SELF_TIME = "Device Self Time Diff" + DIFF_TOTAL_RATIO = "Total Diff Ratio" + DIFF_TOTAL_TIME = "Device Total Time Diff" + NUMBER = "Number" + MODULE_LEVEL = "Module Level" + BASE_CALL_STACK = "Base Call Stack" + COMPARISON_CALL_STACK = "Comparison Call Stack" HEADERS = { Constant.OPERATOR_TABLE: [ @@ -118,9 +135,29 @@ class ExcelConfig(object): {"name": MIN_DURATION, "type": CellFormatType.DEFAULT_FLOAT, "width": 17}, {"name": DIFF_DUR, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, {"name": DIFF_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 20} + ], + Constant.MODULE_TOP_TABLE: [ + {"name": ORDER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": MODULE_CLASS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": MODULE_LEVEL, "type": CellFormatType.DEFAULT, "width": 15}, + {"name": MODULE_NAME, "type": CellFormatType.DEFAULT, "width": 35}, + {"name": OPERATOR_NAME, "type": CellFormatType.DEFAULT, "width": 25}, + {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": DEVICE_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": DEVICE_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": KERNEL_DETAILS, "type": CellFormatType.DEFAULT, "width": 20}, + {"name": DEVICE_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": NUMBER, "type": CellFormatType.DEFAULT, "width": 10}, + {"name": DEVICE_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_TOTAL_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_SELF_TIME, "type": CellFormatType.DEFAULT_FLOAT, "width": 20}, + {"name": DIFF_TOTAL_RATIO, "type": CellFormatType.DEFAULT_RATIO, "width": 15}, + {"name": BASE_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30}, + {"name": COMPARISON_CALL_STACK, "type": CellFormatType.DEFAULT, "width": 30} ] } OVERHEAD = {Constant.OPERATOR_TABLE: ["B1:F1", "G1:K1"], Constant.MEMORY_TABLE: ["B1:F1", "G1:K1"], Constant.COMMUNICATION_TABLE: ["B1:H1", "I1:O1"], Constant.OPERATOR_TOP_TABLE: ["C1:D1", "E1:F1"], - Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"]} + Constant.MEMORY_TOP_TABLE: ["C1:E1", "F1:H1"], Constant.MODULE_TOP_TABLE: ["F1:I1", "J1:M1"]} diff --git a/profiler/compare_tools/compare_backend/utils/module_node.py 
b/profiler/compare_tools/compare_backend/utils/module_node.py new file mode 100644 index 0000000000..0c9331b2cd --- /dev/null +++ b/profiler/compare_tools/compare_backend/utils/module_node.py @@ -0,0 +1,160 @@ +from math import ceil + +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.utils.torch_op_node import TorchOpNode + + +class ModuleNode: + ts = "ts" + kernels = "kernels" + + def __init__(self, event: TraceEventBean, parent_node=None): + self._event = event + self._parent_node = parent_node + self._child_nodes = [] + self._module_name = f"{parent_node.module_name}/{event.name}" if parent_node else event.name + self._kernel_self_list = [] + self._kernel_total_list = [] + self._call_stack = f"{parent_node.call_stack};\n{event.name}" if parent_node and parent_node.call_stack \ + else event.name + self._root_torch_op_node = TorchOpNode() + self._cur_torch_op_node = self._root_torch_op_node + + @property + def module_name(self): + return self._module_name + + @property + def name(self): + return self._event.name + + @property + def parent_node(self): + return self._parent_node + + @property + def child_nodes(self): + return self._child_nodes + + @property + def dur(self): + return self._event.dur + + @property + def start_time(self): + return self._event.start_time + + @property + def end_time(self): + return self._event.end_time + + @property + def host_self_dur(self): + return self.dur - sum([node.dur for node in self.child_nodes]) + + @property + def device_self_dur(self): + dur = 0 + for kernel_dict in self._kernel_self_list: + kernel_list = kernel_dict.get(self.kernels, []) + dur += sum([kernel.device_dur for kernel in kernel_list]) + return dur + + @property + def device_total_dur(self): + dur = 0 + for kernel_dict in self._kernel_total_list: + kernel_list = kernel_dict.get(self.kernels, []) + dur += sum([kernel.device_dur for kernel in kernel_list]) + return dur + + @property + def kernel_details(self): + kernel_details = "" + for kernel_dict in self._kernel_self_list: + kernel_list = kernel_dict.get(self.kernels, []) + for kernel in kernel_list: + kernel_details += kernel.kernel_details + return kernel_details + + @property + def toy_layer_api_list(self): + return self._root_torch_op_node.child_nodes + + @property + def call_stack(self): + return self._call_stack + + @staticmethod + def _binary_search(ts_time, parent_node): + if not parent_node.child_nodes: + return None + right = len(parent_node.child_nodes) - 1 + left = 0 + while right > left: + mid = left + ceil((right - left) / 2) + if ts_time >= parent_node.child_nodes[mid].start_time: + left = mid + else: + right = mid - 1 + if parent_node.child_nodes[left].start_time < ts_time < parent_node.child_nodes[left].end_time: + return parent_node.child_nodes[left] + return None + + def reset_call_stack(self, call_stack): + self._call_stack = call_stack + + def update_child_nodes(self, node): + self._child_nodes.append(node) + + def update_kernel_list(self, ts, kernel_list: list): + self._update_kernel_self_list(ts, kernel_list) + node = self + while node.parent_node: + node._update_kernel_total_list(ts, kernel_list) + node = node.parent_node + + def _update_kernel_self_list(self, ts, kernel_list: list): + self._kernel_self_list.append({self.ts: ts, self.kernels: kernel_list}) + + def _update_kernel_total_list(self, ts, kernel_list: list): + self._kernel_total_list.append({self.ts: ts, self.kernels: kernel_list}) + + def find_module_call(self, ts_time): + call_module = 
self._binary_search(ts_time, self) + while call_module: + module = self._binary_search(ts_time, call_module) + if not module: + return call_module + call_module = module + return call_module + + def find_torch_op_call(self, event): + while self._cur_torch_op_node: + if self._cur_torch_op_node != self._root_torch_op_node and \ + event.start_time > self._cur_torch_op_node.end_time: + self._cur_torch_op_node = self._cur_torch_op_node.parent + continue + tree_node = TorchOpNode(event, self._cur_torch_op_node) + self._cur_torch_op_node.add_child_node(tree_node) + self._cur_torch_op_node = tree_node + break + + def update_torch_op_kernel_list(self): + top_node_list = self._root_torch_op_node.child_nodes + if not top_node_list: + return + top_node_list.sort(key=lambda x: x.start_time) + cur_index = 0 + self._kernel_self_list.sort(key=lambda x: x.get(self.ts, 0)) + for kernel_dict in self._kernel_self_list: + ts = kernel_dict.get(self.ts, 0) + kernel_list = kernel_dict.get(self.kernels, []) + while cur_index < len(top_node_list): + if ts > top_node_list[cur_index].end_time: + cur_index += 1 + continue + if ts < top_node_list[cur_index].start_time: + break + top_node_list[cur_index].update_kernel_list(kernel_list) + break diff --git a/profiler/compare_tools/compare_backend/utils/name_function.py b/profiler/compare_tools/compare_backend/utils/name_function.py index d83f9e4291..cd79e8a03f 100644 --- a/profiler/compare_tools/compare_backend/utils/name_function.py +++ b/profiler/compare_tools/compare_backend/utils/name_function.py @@ -1,3 +1,4 @@ +from compare_backend.utils.module_node import ModuleNode from compare_backend.utils.torch_op_node import TorchOpNode @@ -41,3 +42,11 @@ class NameFunction: input_shape = ';\r\n'.join(data) return f'{self.args.op_name_map.get(op_node.name, op_node.name)}{input_shape}' return f'{self.args.op_name_map.get(op_node.name, op_node.name)}{op_node.input_shape}' + + def get_module_name(self, module: ModuleNode) -> str: + if not self.args.op_name_map: + return module.module_name + module = module.module_name + for old_name, new_name in self.args.op_name_map.items(): + module.replace(old_name, new_name) + return module diff --git a/profiler/compare_tools/compare_backend/utils/torch_op_node.py b/profiler/compare_tools/compare_backend/utils/torch_op_node.py index 45b9299ba0..690c46cd51 100644 --- a/profiler/compare_tools/compare_backend/utils/torch_op_node.py +++ b/profiler/compare_tools/compare_backend/utils/torch_op_node.py @@ -60,6 +60,10 @@ class TorchOpNode: def memory_allocated(self): return self._memory_allocated_list + @property + def device_dur(self): + return sum([kernel.device_dur for kernel in self._kernel_list]) + def add_child_node(self, child_node): self._child_nodes.append(child_node) @@ -73,11 +77,16 @@ class TorchOpNode: cur_node._kernel_num += kernel_num cur_node = cur_node._parent_node + def update_kernel_list(self, kernel_list: list): + if not kernel_list: + return + self._kernel_list.extend(kernel_list) + def set_memory_allocated(self, memory_allocated: MemoryEvent): self._memory_allocated_list.append(memory_allocated) def is_step_profiler(self) -> bool: - return self.name.find("ProfilerStep#") != -1 + return self._event.is_step_profiler() def get_op_info(self) -> list: return [self.name, self.input_shape, self.input_type, self.call_stack] diff --git a/profiler/compare_tools/compare_backend/utils/tree_builder.py b/profiler/compare_tools/compare_backend/utils/tree_builder.py index f621453d1a..34c1fe1a1f 100644 --- 
a/profiler/compare_tools/compare_backend/utils/tree_builder.py +++ b/profiler/compare_tools/compare_backend/utils/tree_builder.py @@ -1,5 +1,7 @@ from queue import Queue +from compare_backend.compare_bean.origin_data_bean.trace_event_bean import TraceEventBean +from compare_backend.utils.module_node import ModuleNode from compare_backend.utils.torch_op_node import TorchOpNode @@ -7,10 +9,12 @@ class TreeBuilder: @classmethod def build_tree(cls, event_list: list, kernel_dict: dict, memory_list: list) -> TorchOpNode: root_node = TorchOpNode() - event_list.extend(memory_list) - event_list.sort(key=lambda x: x.start_time) + all_event_list = [] + all_event_list.extend(event_list) + all_event_list.extend(memory_list) + all_event_list.sort(key=lambda x: x.start_time) last_node = root_node - for event in event_list: + for event in all_event_list: while last_node: if last_node != root_node and event.start_time > last_node.end_time: last_node = last_node.parent @@ -53,3 +57,26 @@ class TreeBuilder: for child_node in tree_node.child_nodes: node_queue.put(child_node) return result_list + + @classmethod + def build_module_tree(cls, event_list: list, kernel_dict: dict): + root_node = ModuleNode(TraceEventBean({})) + event_list.sort(key=lambda x: x.start_time) + last_node = root_node + for event in event_list: + while last_node: + if last_node != root_node and event.start_time > last_node.end_time: + last_node = last_node.parent_node + continue + if event.is_x_mode(): + tree_node = ModuleNode(event, last_node) + last_node.update_child_nodes(tree_node) + last_node = tree_node + break + if last_node == root_node: + break + kernel_list = kernel_dict.get(event.start_time, []) + if kernel_list: + last_node.update_kernel_list(event.start_time, kernel_list) + break + return root_node diff --git a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py index c5e56c2f8b..7a33168da3 100644 --- a/profiler/compare_tools/compare_backend/view/work_sheet_creator.py +++ b/profiler/compare_tools/compare_backend/view/work_sheet_creator.py @@ -23,20 +23,28 @@ class WorkSheetCreator: self._write_data() def _write_headers(self): - header_format = self._work_book.add_format(CellFormatType.BLUE_BOLD) + base_header_format = self._work_book.add_format(CellFormatType.GREEN_BOLD) + com_header_format = self._work_book.add_format(CellFormatType.YELLOW_BOLD) + com_index_range = [-1, -1] overhead = self._data.get("overhead", []) if overhead: base_path = f"Base Profiling: {self._args.base_profiling_path}" - self._work_sheet.merge_range(overhead[0], base_path, header_format) + self._work_sheet.merge_range(overhead[0], base_path, base_header_format) + com_index_range = [self._col_ids.index(overhead[1].split(":")[0][0]), + self._col_ids.index(overhead[1].split(":")[1][0])] comparison_path = f"Comparison Profiling: {self._args.comparison_profiling_path}" - self._work_sheet.merge_range(overhead[1], comparison_path, header_format) + self._work_sheet.merge_range(overhead[1], comparison_path, com_header_format) self._row_id += 2 for index, header in enumerate(self._data.get("headers")): + if index in range(com_index_range[0], com_index_range[1] + 1): + header_format = com_header_format + else: + header_format = base_header_format col_id = self._col_ids[index] self._work_sheet.set_column(f"{col_id}:{col_id}", header.get("width")) self._work_sheet.write(f"{col_id}{self._row_id}", header.get("name"), header_format) self._field_format[index] = 
self._work_book.add_format(header.get("type")) - if header.get("name") == ExcelConfig.DIFF_RATIO: + if header.get("name") in (ExcelConfig.DIFF_RATIO, ExcelConfig.DIFF_TOTAL_RATIO): self._diff_ratio_index = index self._row_id += 1 -- Gitee From 5598bcc7f1077f136c89d122c9e9f9fb8382d86e Mon Sep 17 00:00:00 2001 From: wangqihui01 Date: Tue, 5 Mar 2024 17:55:45 +0800 Subject: [PATCH 28/62] =?UTF-8?q?=E5=BC=82=E5=B8=B8=E6=80=A7=E8=83=BD?= =?UTF-8?q?=E7=AE=97=E5=AD=90=E8=AF=86=E5=88=AB=EF=BC=8C=E4=BB=A5=E5=8F=8A?= =?UTF-8?q?call=20stack=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../advice_factory/compute_advice_factory.py | 2 + .../common_func_advisor/constant.py | 115 +++++++ .../common_func_advisor/trace_view_json.py | 31 +- .../compute_advice/npu_fused/csv_analyzer.py | 18 +- .../compute_advice/npu_fused/op_perf.py | 120 ++++++- .../compute_advice/npu_slow_advice.py | 82 +++++ profiler/advisor/compute_perf_analysis.ipynb | 295 ++++++++++++++++-- .../compute_advice/test_npu_slow_advice.py | 210 +++++++++++++ 8 files changed, 835 insertions(+), 38 deletions(-) create mode 100644 profiler/advisor/advisor_backend/compute_advice/npu_slow_advice.py create mode 100644 profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py diff --git a/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py b/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py index 2b6e5270f2..336bef7dd8 100644 --- a/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py +++ b/profiler/advisor/advisor_backend/advice_factory/compute_advice_factory.py @@ -15,11 +15,13 @@ from common_func_advisor.constant import Constant from advice_factory.advice_factory import AdviceFactory from compute_advice.npu_fused_advice import NpuFusedAdvice +from compute_advice.npu_slow_advice import NpuSlowAdvice class ComputeAdviceFactory(AdviceFactory): ADVICE_LIB = { Constant.NPU_FUSED: NpuFusedAdvice, + Constant.NPU_SLOW: NpuSlowAdvice, } def __init__(self, collection_path: str): diff --git a/profiler/advisor/advisor_backend/common_func_advisor/constant.py b/profiler/advisor/advisor_backend/common_func_advisor/constant.py index 34879db9f2..46a7fb24c2 100644 --- a/profiler/advisor/advisor_backend/common_func_advisor/constant.py +++ b/profiler/advisor/advisor_backend/common_func_advisor/constant.py @@ -15,11 +15,104 @@ from enum import Enum +class CsvTitle: + MODEL_NAME = "Model Name" + MODEL_ID = "Model ID" + TASK_ID = "Task ID" + STREAM_ID = "Stream ID" + INFER_ID = "Infer ID" + TASK_START_TIME = "Task Start Time(us)" + TASK_WAIT_TIME = "Task Wait Time(us)" + BLOCK_DIM = "Block Dim" + MIX_BLOCK_DIM = "Mix Block Dim" + HF32_ELIGIBLE = "HF32 Eligible" + INPUT_SHAPES = "Input Shapes" + INPUT_DATA_TYPES = "Input Data Types" + INPUT_FORMATS = "Input Formats" + OUTPUT_SHAPES = "Output Shapes" + OUTPUT_DATA_TYPES = "Output Data Types" + OUTPUT_FORMATS = "Output Formats" + CONTEXT_ID = "Context ID" + AICORE_TIME = "aicore_time(us)" + AIC_TOTAL_CYCLES = "aic_total_cycles" + AIC_MAC_TIME = "aic_mac_time(us)" + AIC_MAC_RATIO = "aic_mac_ratio" + AIC_SCALAR_TIME = "aic_scalar_time(us)" + AIC_SCALAR_RATIO = "aic_scalar_ratio" + AIC_MTE1_TIME = "aic_mte1_time(us)" + AIC_MTE1_RATIO = "aic_mte1_ratio" + AIC_MTE2_TIME = "aic_mte2_time(us)" + AIC_MTE2_RATIO = "aic_mte2_ratio" + AIC_FIXPIPE_TIME = "aic_fixpipe_time(us)" + AIC_FIXPIPE_RATIO = "aic_fixpipe_ratio" + AIC_ICACHE_MISS_RATE = "aic_icache_miss_rate" + AIV_TIME = 
"aiv_time(us)" + AIV_TOTAL_CYCLES = "aiv_total_cycles" + AIV_VEC_TIME = "aiv_vec_time(us)" + AIV_VEC_RATIO = "aiv_vec_ratio" + AIV_SCALAR_TIME = "aiv_scalar_time(us)" + AIV_SCALAR_RATIO = "aiv_scalar_ratio" + AIV_MTE2_TIME = "aiv_mte2_time(us)" + AIV_MTE2_RATIO = "aiv_mte2_ratio" + AIV_MTE3_TIME = "aiv_mte3_time(us)" + AIV_MTE3_RATIO = "aiv_mte3_ratio" + AIV_ICACHE_MISS_RATE = "aiv_icache_miss_rate" + CUBE_UTILIZATION = "cube_utilization( %)" + TASK_DURATION_SUM = "Task Duration Sum(us)" + TASK_DURATION_MEAN = "Task Duration Mean(us)" + TASK_DURATION_STD = "Task Duration Std(us)" + TASK_DURATION_RATIO = "Task Duration Ratio(100%)" + SIZE = "size(MB)" + THROUGHPUT = "throughput(GB/s)" + COLOR = "color" + GAP = "Gap(us)" + DURATION_SUM = "Duration Sum(us)" + COUNT = "Count" + MAX_DURATION = "Max Duration(us)" + MIN_DURATION = "Min Duration(us)" + AVG_DURATION = "Avg Duration(us)" + DURATION_RATIO = "Duration Ratio" + INDEX = "Index" + + +# 定义CSV_TITILE_V1类,继承自CSV_TITILE类, 适配旧版csv +class CsvTitleV1(CsvTitle): + OP_NAME = "Op Name" + OP_TYPE = "OP Type" + TASK_TYPE = "Task Type" + TASK_DURATION = "Task Duration(us)" + + +# 定义CSV_TITILE_V1类,继承自CSV_TITILE类, 适配新版csv +class CsvTitleV2(CsvTitle): + OP_NAME = "Name" + OP_TYPE = "Type" + TASK_TYPE = "Accelerator Core" + TASK_DURATION = "Duration(us)" + + class Constant: + DTYPE_SIZE_MAP = {"int8": 1, "uint8": 1, + "int16": 2, "uint16": 2, + "int32": 4, "uint32": 4, + "int64": 8, "uint64": 8, + "float16": 2, + "bfloat16": 2, + "bf16": 2, + "dt_bf16": 2, + "float32": 4, + "float": 4, + "float64": 8, + "complex64": 8, + "complex128": 16, + "bool": 1} + TP_THRESHOLD = 1150 MAX_INPUT_MODE_LEN = 30 MAX_INPUT_ADVICE_LEN = 30 SMALL_OP_DUR_RATIO = 0.2 SMALL_OP_NUM_RATIO = 0.2 + BYTE_UNIT_TRANS = 1024 + UNIT_TRANS = 1000 # mode list COMPUTE = "compute" @@ -35,6 +128,7 @@ class Constant: # compute NPU_FUSED = "npu_fused" + NPU_SLOW = "npu_slow" # timeline OPTIM = "optimizer" @@ -108,3 +202,24 @@ class Constant: ("Cast", "Mul", "MaskedFill", "SoftmaxV2", "Cast"): "torch_npu.npu_scaled_masked_softmax", ("Mul", "Slice", "Neg", "Slice", "ConcatD", "Mul"): "torch_npu.npu_rotary_mul", ("Cast", "Square", "ReduceMeanD", "Add", "Rsqrt", "Mul", "Cast", "Mul"): "torch_npu.npu_rms_norm"} + TITLE = CsvTitleV2 + + @classmethod + def update_title(cls): + cls.TITLE = CsvTitleV1 + + +class CoreType: + AIV = "AI_VECTOR_CORE" + AIC = "AI_CORE" + AICPU = "AI_CPU" + MIX_AIV = "MIX_AIV" + MIX_AIC = "MIX_AIC" + HCCL = "HCCL" + + +class PerfColor(Enum): + WHITE = 0 + GREEN = 1 + YELLOW = 2 + RED = 3 diff --git a/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py b/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py index 08ef028765..8171f06ee2 100644 --- a/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py +++ b/profiler/advisor/advisor_backend/common_func_advisor/trace_view_json.py @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import os from abc import abstractmethod from dataclasses import dataclass from dataclasses import field from typing import Dict from typing import List +import pandas as pd + from common_func.file_manager import FileManager @@ -89,9 +91,34 @@ class TraceViewJson: self.cann_dur_events: Dict[str, DurationEvent] = dict() self.ascend_hardware_dur_events: Dict[str, DurationEvent] = dict() self.torch_2_npu_flow_events: Dict[str, FlowEvent] = dict() - traces = FileManager.read_json_file(path) self._load_obj(traces) + + def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str: + if ts_col not in data.columns.tolist(): + print("[ERROR] No {} col found in data columns.".format(ts_col)) + return "" + row = data.loc[index_id] + timestamp = row[ts_col] + flow_event = self.get_torch_2_npu_flow_event(timestamp) + if not flow_event.valid(): + print("[ERROR] Get flow event failed for pattern {}.".format(row['pattern'])) + return "" + flow_event_s_key = flow_event.s_point_ts + python_dur_events = self.get_python_dur_events_contain_ts(flow_event_s_key) + if not python_dur_events: + print("[ERROR] No python dur event found for pattern {}.".format(row['pattern'])) + return "" + # 保持新老版本callstack兼容性 + if python_dur_events[0].args.get("Call stack"): + # 旧版本 + call_stack_list = python_dur_events[0].args.get("Call stack").split(";") + else: + python_dur_events.sort(key=lambda e: e.ts) + # 新版本 + call_stack_list = [event.name for event in python_dur_events if event.cat == "python_function"] + call_stack = "\n".join(call_stack_list) + return call_stack def get_torch_2_npu_flow_event(self, end_time) -> FlowEvent: if not self.torch_2_npu_flow_events or not self.torch_2_npu_flow_events.get(end_time): diff --git a/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py b/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py index 5411610a7f..c85c14d618 100644 --- a/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py +++ b/profiler/advisor/advisor_backend/compute_advice/npu_fused/csv_analyzer.py @@ -28,18 +28,10 @@ class CSVAnalyzer: def process(self): df = pd.read_csv(self._path, dtype={"Start Time(us)": str}) - - - pool = multiprocessing.Pool(multiprocessing.cpu_count()) - # 数据预解析 - result = pool.map(self.update_op_row, df.iterrows()) - pool.close() - - preparse_df = pd.DataFrame(result) # 分析是否存在可融合的算子 - op_type_list = preparse_df["Type"].tolist() - duration_list = preparse_df["Duration(us)"].tolist() - start_times = preparse_df["Start Time(us)"].tolist() + op_type_list = df["Type"].tolist() + duration_list = df["Duration(us)"].tolist() + start_times = df["Start Time(us)"].tolist() # 去除末尾的\t分隔符 start_times = [start_time[:-1] for start_time in start_times] result_list = [] @@ -50,10 +42,6 @@ class CSVAnalyzer: "index", "first_timestamp"] return data_frame - @staticmethod - def update_op_row(row): - return OpPerfFactory.build(row[1]).update() - @staticmethod def find_all_sub_lists(op_type_list, duration_list, start_times, expect_sub_list): # 创建一个空字典,用来存储子列表和它们的出现次数和起始位置 diff --git a/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py b/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py index 2442807fd1..a194ce5e40 100644 --- a/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py +++ b/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py @@ -12,19 +12,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - +import functools from typing import Dict + from common_func_advisor.constant import Constant +from common_func_advisor.constant import CoreType +from common_func_advisor.constant import PerfColor class OpPerfFactory: @classmethod def build(cls, op_row: Dict): - return OpPerf(op_row) + if op_row.get(Constant.TITLE.TASK_TYPE) in (CoreType.AIV, CoreType.MIX_AIV): + return VecOpPerf(op_row) + elif op_row.get(Constant.TITLE.TASK_TYPE) in (CoreType.AIC, CoreType.MIX_AIC): + return CubeOpPerf(op_row) + else: + return OpPerf(op_row) class OpPerf: def __init__(self, op_row: Dict): + if "OP Type" in op_row.keys(): + Constant.update_title() self.row = op_row self.model_name = op_row.get("Model Name") self.model_id = op_row.get("Model ID") @@ -75,6 +85,112 @@ class OpPerf: self.aiv_mte3_ratio = op_row.get("aiv_mte3_ratio") self.aiv_icache_miss_rate = op_row.get("aiv_icache_miss_rate") self.cube_utilization = op_row.get("cube_utilization( %)") + + @staticmethod + def get_dtype_size(dtype_str: str): + return Constant.DTYPE_SIZE_MAP.get(dtype_str.lower(), 0) + + @staticmethod + def get_element_count(shape: list): + return functools.reduce(lambda x, y: int(x) * int(y), shape) + + @staticmethod + def shape_to_tuple(shape_str: str) -> tuple: + if not isinstance(shape_str, str): + return [] + shape_str = shape_str.strip('"') + split_shape = shape_str.strip(';') + if not split_shape: + return [] + pairs = split_shape.split(';') + shape_result = [] + for pair in pairs: + pair = pair.strip(";") + elements = pair.split(',') + elements = tuple(int(element) if "" != element else 0 for element in elements) + shape_result.append(elements) + return tuple(shape_result) + + @staticmethod + def dtype_to_tuple(dtypes_str: str) -> tuple: + if not isinstance(dtypes_str, str): + return [] + dtypes_str = dtypes_str.strip('"') + split_dtypes = dtypes_str.strip(';') + if not split_dtypes: + return [] + pairs = split_dtypes.split(';') + return tuple(pairs) + + def get_mac_ratio(self): + return self.aic_mac_ratio + + def get_size(self, shapes_str, dtypes_str): + shapes = self.shape_to_tuple(shapes_str) + dtypes = self.dtype_to_tuple(dtypes_str) + if len(shapes) > len(dtypes): + print(f"[ERROR] The size of shape is greater than that of dtypes.") + return 0 + if len(shapes) < len(dtypes): + shapes = list(shapes) + shapes.extend([(1,)] * (len(dtypes) - len(shapes))) + all_size = 0 + for index, shape in enumerate(shapes): + element_count = self.get_element_count(shape) + dtype_size = self.get_dtype_size(dtypes[index]) + all_size += element_count * dtype_size + return all_size + + def get_calc_size(self): + # input and output bytes (MB) + if not self.input_shapes or not self.output_shapes: + print("[ERROR] There is no tensor data, do not assess vector op performance.") + return 0 + intput_size = self.get_size(self.input_shapes, self.input_data_types) + output_size = self.get_size(self.output_shapes, self.output_data_types) + return (intput_size + output_size) / (Constant.BYTE_UNIT_TRANS * Constant.BYTE_UNIT_TRANS) + + def get_throughput(self): + # throughput(GB/s) + if not self.task_duration or abs(self.task_duration) < 1e-6: + print("[ERROR] There is no task_duration, do not assess vector op performance.") + return 0 + return self.row[Constant.TITLE.SIZE] / Constant.BYTE_UNIT_TRANS / self.task_duration * Constant.UNIT_TRANS * Constant.UNIT_TRANS + + def get_perf_color(self): + return PerfColor.WHITE def update(self): + 
self.row[Constant.TITLE.SIZE] = self.get_calc_size() + self.row[Constant.TITLE.THROUGHPUT] = self.get_throughput() + self.row[Constant.TITLE.COLOR] = self.get_perf_color().name return self.row + + +class VecOpPerf(OpPerf): + def get_perf_color(self) -> PerfColor: + throughput = self.row[Constant.TITLE.THROUGHPUT] + op_duration = self.task_duration + tp_threshold = Constant.TP_THRESHOLD + if throughput == 0: + return PerfColor.WHITE + if throughput < tp_threshold / 2 and op_duration > 20: + return PerfColor.RED + elif tp_threshold / 2 <= throughput < tp_threshold: + return PerfColor.YELLOW + else: + return PerfColor.GREEN + + +class CubeOpPerf(OpPerf): + def get_perf_color(self) -> PerfColor: + aic_mac_ratio = self.get_mac_ratio() + if not aic_mac_ratio: + print("[WARNING] There is no aic_mac_ratio, do not assess cube op performance.") + return PerfColor.WHITE + elif aic_mac_ratio < 0.6: + return PerfColor.RED + elif 0.6 <= aic_mac_ratio < 0.8: + return PerfColor.YELLOW + else: + return PerfColor.GREEN diff --git a/profiler/advisor/advisor_backend/compute_advice/npu_slow_advice.py b/profiler/advisor/advisor_backend/compute_advice/npu_slow_advice.py new file mode 100644 index 0000000000..caff1c792c --- /dev/null +++ b/profiler/advisor/advisor_backend/compute_advice/npu_slow_advice.py @@ -0,0 +1,82 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from abc import ABC +import multiprocessing + +import pandas as pd + +from compute_advice.compute_advice_base import ComputeAdviceBase +from compute_advice.npu_fused.op_perf import OpPerfFactory +from common_func_advisor.constant import Constant +from common_func_advisor.constant import PerfColor +from advisor_backend.common_func_advisor.trace_view_json import TraceViewJson + + +class NpuSlowAdvice(ComputeAdviceBase, ABC): + OP_PERF_SHEET = "op_perf" + + def __init__(self, collection_path: str): + super().__init__(collection_path) + self.kernel_details_path = "" + self.data = pd.DataFrame() + + @staticmethod + def save_to_excel(data: pd.DataFrame, file_path: str) -> None: + writer = pd.ExcelWriter(file_path, engine="xlsxwriter", mode="w") + data.index.name = Constant.TITLE.INDEX + data.to_excel(writer, index=True, sheet_name=NpuSlowAdvice.OP_PERF_SHEET) + NpuSlowAdvice.color_sheet(data, writer.book, writer.sheets[NpuSlowAdvice.OP_PERF_SHEET]) + writer.sheets[NpuSlowAdvice.OP_PERF_SHEET].freeze_panes = "A2" + writer.close() + + @staticmethod + def color_sheet(data: pd.DataFrame, workbook, worksheet): + color_rgb = { + PerfColor.GREEN.name: workbook.add_format({'bg_color': '#C6EFCE'}), + PerfColor.YELLOW.name: workbook.add_format({'bg_color': '#FFEB9C'}), + PerfColor.RED.name: workbook.add_format({'bg_color': '#FFC7CE'}), + } + for row in data.iterrows(): + color = row[1][Constant.TITLE.COLOR] + fill_format = color_rgb.get(color) + if not fill_format: + continue + worksheet.set_row(row[0] + 1, None, fill_format) + + @staticmethod + def update_op_row(row: tuple): + return OpPerfFactory.build(row[1]).update() + + def get_call_stack(self, data: pd.DataFrame, index_id: int, ts_col: str) -> str: + if not self.has_callstack(): + print("There is no call stack info, please set 'with_stack=True'") + return "" + trace_json = TraceViewJson(self.trace_view_path) + return trace_json.get_call_stack(data, index_id, ts_col) + + def run(self): + if not self.path_check(): + return self.data + self.process() + return self.data + + def process(self): + self.data = pd.read_csv(self.kernel_details_path, dtype={"Start Time(us)": str}) + # 去除末尾的\t分隔符 + self.data["Start Time(us)"] = self.data["Start Time(us)"].apply(lambda x: x[:-1]) + pool = multiprocessing.Pool(multiprocessing.cpu_count()) + result = pool.map(self.update_op_row, self.data.iterrows()) + pool.close() + self.data = pd.DataFrame(result) diff --git a/profiler/advisor/compute_perf_analysis.ipynb b/profiler/advisor/compute_perf_analysis.ipynb index 27c9caf37b..86a1fb5b7e 100644 --- a/profiler/advisor/compute_perf_analysis.ipynb +++ b/profiler/advisor/compute_perf_analysis.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2024-02-21T09:19:13.937531900Z", @@ -11,6 +11,7 @@ }, "outputs": [], "source": [ + "import os\n", "import pandas as pd\n", "\n", "from advisor_backend.interface import Interface\n", @@ -24,15 +25,18 @@ "# 算子调优分析\n", "## 1. 算子分析的数据准备\n", "当前算子分析工具支持分析Ascend Pyorch Profiler方式生成的ascend_pt目录\n", - "## 2. 算子分析解决的问题\n", + "## 2. 融合算子分析\n", "当前支持分析模型中存在可融合的小算子,并给出优化建议。\n", "\n", - "\"更多融合算子信息,请查阅 https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/700alpha003/processormodel/hardwaredesc_0001.html" + "\"更多融合算子信息,请查阅 https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/700alpha003/processormodel/hardwaredesc_0001.html\n", + "\n", + "## 3. 
异常性能算子分析\n", + "支持分析模型中性能异常的计算算子" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2024-02-22T08:41:17.455567500Z", @@ -44,18 +48,75 @@ "name": "stdout", "output_type": "stream", "text": [ - "[INFO] Start to analyse the target file: C:\\data\\ascend_pt\\ASCEND_PROFILER_OUTPUT\\kernel_details.csv\n", - " pattern_name pattern len count duration sum(us) op durations(us) index\n", - "18 torch_npu.npu_swiglu (Slice, Slice, Swish, Mul) 4 1 12.56 [3.14, 3.14, 3.14, 3.14] [0]\n", + "[INFO] Start to analyse the target file: D:\\work\\ascend_pt\\ASCEND_PROFILER_OUTPUT\\kernel_details.csv\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pattern_namepatternlencountduration sum(us)op durations(us)index
18torch_npu.npu_swiglu(Slice, Slice, Swish, Mul)4127.53[21.2, 0.05, 3.14, 3.14][0]
\n", + "
" + ], + "text/plain": [ + " pattern_name pattern len count duration sum(us) op durations(us) index\n", + "18 torch_npu.npu_swiglu (Slice, Slice, Swish, Mul) 4 1 27.53 [21.2, 0.05, 3.14, 3.14] [0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\n", "\n", - "The computing time of fusable op is 12.56 ms.\n", + "The computing time of fusable op is 27.53 ms.\n", "\n", "\n", "Advice 0:\n", "Replace [Slice, Slice, Swish, Mul] with torch_npu.npu_swiglu. This pattern first happened in: \n", - "torch/nn/modules/module.py(1513): _call_impl\n", - "profiler_main.py(116):forward\n" + "/root/torch/module.py\n", + "/root/test/slice.py(116)\n" ] } ], @@ -66,7 +127,7 @@ "data = interface.get_data('compute', 'npu_fused')\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.width', 900)\n", - "print(data['data'].iloc[:, :-2])\n", + "display(data['data'].iloc[:, :-2])\n", "print('\\n')\n", "print(data['bottleneck'])\n", "print('\\n')\n", @@ -75,21 +136,217 @@ }, { "cell_type": "code", - "outputs": [], + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] Start to analyse the target file: D:\\work\\ascend_pt\\ASCEND_PROFILER_OUTPUT\\kernel_details.csv\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Step IdModel IDTask IDStream IDNameTypeAccelerator CoreStart Time(us)Duration(us)Wait Time(us)Block DimMix Block DimInput ShapesInput Data TypesInput FormatsOutput ShapesOutput Data TypesOutput FormatsContext IDaicore_time(us)aic_total_cyclesaic_mac_ratioaic_mac_int8_ratioaic_cube_fopsaic_vector_fopsaiv_time(us)aiv_total_cyclesaiv_vec_fp32_ratioaiv_vec_fp16_ratioaiv_vec_int32_ratioaiv_vec_misc_ratioaiv_cube_fopsaiv_vector_fopssize(MB)throughput(GB/s)color
014294967295126516Slice1SliceAI_VECTOR_CORE169952962310675021.20261.56904,1025INT64FORMAT_ND4,1025INT32FORMAT_NDNaN0.00.00.00.00.00.01.7729508.00.00.00.00620.00.05856.00.0469212.161371RED
414294967295126516Add1AddAI_CORE16995296231067543.14261.56904,1025INT64FORMAT_ND4,1025INT32FORMAT_NDNaN2.328888.00.20.10.10.71.7729508.00.00.00.00620.00.05856.00.04692114.592698RED
\n", + "
" + ], + "text/plain": [ + " Step Id Model ID Task ID Stream ID Name Type Accelerator Core Start Time(us) Duration(us) Wait Time(us) Block Dim Mix Block Dim Input Shapes Input Data Types Input Formats Output Shapes Output Data Types Output Formats Context ID aicore_time(us) aic_total_cycles aic_mac_ratio aic_mac_int8_ratio aic_cube_fops aic_vector_fops aiv_time(us) aiv_total_cycles aiv_vec_fp32_ratio aiv_vec_fp16_ratio aiv_vec_int32_ratio aiv_vec_misc_ratio aiv_cube_fops aiv_vector_fops size(MB) throughput(GB/s) color\n", + "0 1 4294967295 1265 16 Slice1 Slice AI_VECTOR_CORE 1699529623106750 21.20 261.56 9 0 4,1025 INT64 FORMAT_ND 4,1025 INT32 FORMAT_ND NaN 0.0 0.0 0.0 0.0 0.0 0.0 1.77 29508.0 0.0 0.0 0.0062 0.0 0.0 5856.0 0.046921 2.161371 RED\n", + "4 1 4294967295 1265 16 Add1 Add AI_CORE 1699529623106754 3.14 261.56 9 0 4,1025 INT64 FORMAT_ND 4,1025 INT32 FORMAT_ND NaN 2.3 28888.0 0.2 0.1 0.1 0.7 1.77 29508.0 0.0 0.0 0.0062 0.0 0.0 5856.0 0.046921 14.592698 RED" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ + "# 异常性能算子识别\n", + "from advisor_backend.compute_advice.npu_slow_advice import NpuSlowAdvice\n", "\n", - "\n" + "npu_slow_advice = NpuSlowAdvice(compute_path)\n", + "data = interface.get_data('compute', 'npu_slow')\n", + "slow_op_data = data[data[\"color\"] == \"RED\"]\n", + "display(slow_op_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "NpuSlowAdvice.save_to_excel(data, file_path=os.path.join(compute_path, \"slow_op.xlsx\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "call stack: \n", + "/root/torch/module.py\n", + "/root/test/slice.py(116)\n" + ] + } ], - "metadata": { - "collapsed": false - } + "source": [ + "# 异常性能算子call stack\n", + "call_stack = npu_slow_advice.get_call_stack(data, index_id=0, ts_col=\"Start Time(us)\")\n", + "print(\"call stack: \")\n", + "print(call_stack)" + ] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -101,7 +358,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.11.5" } }, "nbformat": 4, diff --git a/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py b/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py new file mode 100644 index 0000000000..41da4b3c27 --- /dev/null +++ b/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py @@ -0,0 +1,210 @@ +import json +import os +import shutil +import stat +import csv +import unittest + +from advisor_backend.interface import Interface +from advisor_backend.compute_advice.npu_slow_advice import NpuSlowAdvice +from advisor_backend.common_func_advisor.trace_view_json import TraceViewJson + + +class TestNpuSlowAdvice(unittest.TestCase): + ASCEND_PT_DIR = "./ascend_pt" + OUTPUT_DIR = "./ascend_pt/ASCEND_PROFILER_OUTPUT" + interface = None + err_interface = None + + def tearDown(self): + if os.path.exists(TestNpuSlowAdvice.ASCEND_PT_DIR): + shutil.rmtree(TestNpuSlowAdvice.ASCEND_PT_DIR) + + def setUp(self): + if os.path.exists(TestNpuSlowAdvice.ASCEND_PT_DIR): + shutil.rmtree(TestNpuSlowAdvice.ASCEND_PT_DIR) + if not os.path.exists(TestNpuSlowAdvice.ASCEND_PT_DIR): + 
os.makedirs(TestNpuSlowAdvice.ASCEND_PT_DIR) + if not os.path.exists(TestNpuSlowAdvice.OUTPUT_DIR): + os.makedirs(TestNpuSlowAdvice.OUTPUT_DIR) + + @classmethod + def get_basic_trace_view(cls): + # Python pid + py_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 1, "args": {"name": "Python"}} + # ascend pid + ascend_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 4, "args": {"name": "Ascend Hardware"}} + # ascend pid + cann_pid_data = {"ph": "M", "name": "process_name", "tid": 0, "pid": 5, "args": {"name": "CANN"}} + # ascend hardware ops + ah_event1 = {"ph": "X", "name": "Slice1", "ts": "1699529623106750", "dur": 100, "tid": 3, "pid": 4, "args": {}} + ah_event2 = {"ph": "X", "name": "Slice2", "ts": "1699529623106751", "dur": 80, "tid": 3, "pid": 4, "args": {}} + # flow event + flow_event_s = {"ph": "s", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "200", "args": {}} + flow_event_e = {"ph": "f", "name": "link1", "id": 1, "tid": 3, "pid": 1, "ts": "1699529623106750", "args": {}} + return [py_pid_data, ascend_pid_data, cann_pid_data, ah_event1, ah_event2, flow_event_s, flow_event_e] + + @classmethod + def create_profiler_info_json(cls): + info = { + "config": { + "common_config": { + "with_stack": True, + "activities": ["ProfilerActivity.CPU", "ProfilerActivity.NPU"] + } + } + } + with os.fdopen(os.open(f"{TestNpuSlowAdvice.ASCEND_PT_DIR}/profiler_info_0.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(info)) + + @classmethod + def create_old_version_trace_view(cls): + basic_info = cls.get_basic_trace_view() + + # python ops + py_event1 = {"ph": "X", "cat": "python_function", "name": "aten::slice", "ts": "200", "dur": 100, "tid": 2, + "pid": 1, + "args": {"Call stack": "/root/test/slice.py(116);\r\n/root/torch/module.py"}} + py_event2 = {"ph": "X", "cat": "python_function", "name": "slice", "ts": "199", "dur": 200, "tid": 2, "pid": 1, + "args": {"Call stack": "/root/test/slice.py(116);\r\n/root/torch/module.py"}} + raw_data = [ + *basic_info, py_event1, py_event2 + ] + + with os.fdopen(os.open(f"{TestNpuSlowAdvice.OUTPUT_DIR}/trace_view.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(raw_data)) + + @classmethod + def create_new_version_trace_view(cls): + basic_info = cls.get_basic_trace_view() + # python ops + py_event1 = {"ph": "X", "name": "aten::slice", "ts": "200", "dur": 100, "tid": 2, "pid": 1, "args": {}} + py_event2 = {"ph": "X", "name": "slice", "ts": "199", "dur": 105, "tid": 2, "pid": 1, "args": {}} + py_event3 = {"ph": "X", "cat": "python_function", "name": "/root/test/slice.py(116)", "ts": "198", "dur": 120, + "tid": 2, "pid": 1, + "args": {}} + py_event4 = {"ph": "X", "cat": "python_function", "name": "/root/torch/module.py", "ts": "197", "dur": 150, + "tid": 2, "pid": 1, "args": {}} + + raw_data = [ + *basic_info, py_event1, py_event2, py_event3, py_event4 + ] + + with os.fdopen(os.open(f"{TestNpuSlowAdvice.OUTPUT_DIR}/trace_view.json", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + fp.write(json.dumps(raw_data)) + + @classmethod + def create_kernel_details(cls): + # create csv files + csv_header = ['Step Id', 'Model ID', 'Task ID', 'Stream ID', 'Name', 'Type', 'Accelerator Core', + 'Start Time(us)', + 'Duration(us)', 'Wait Time(us)', 'Block Dim', 'Mix Block Dim', 'Input Shapes', 'Input Data Types', + 'Input Formats', 'Output Shapes', 'Output Data Types', 'Output Formats', 'Context ID', + 'aicore_time(us)', + 
'aic_total_cycles', 'aic_mac_ratio', 'aic_mac_int8_ratio', 'aic_cube_fops', + 'aic_vector_fops', + 'aiv_time(us)', 'aiv_total_cycles', 'aiv_vec_fp32_ratio', 'aiv_vec_fp16_ratio', + 'aiv_vec_int32_ratio', + 'aiv_vec_misc_ratio', 'aiv_cube_fops', 'aiv_vector_fops'] + # red: size=0.0492 MB, throughput=2.32 GB/s, task_duration=21.2us + csv_row1 = [1, 4294967295, 1265, 16, 'Slice1', 'Slice', 'AI_VECTOR_CORE', "1699529623106750\t", 21.2, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 0, 0, 0, 0, 0, 0, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # YELLOW: size=0.0492 MB, throughput=984 GB/s, task_duration=0.05us + csv_row2 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'MIX_AIV', "1699529623106751\t", 0.05, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 28888, 0.4, 0.1, 0.1, 0.7, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # WHITE: AI_CPU + csv_row3 = [1, 4294967295, 1265, 16, 'Swish1', 'Swish', 'AI_CPU', "1699529623106752\t", 3.14, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', + 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'] + # GREEN: size=0.0492 MB, throughput=15.67 GB/s, task_duration = 3.14us + csv_row4 = [1, 4294967295, 1265, 16, 'Mul1', 'Mul', 'AI_VECTOR_CORE', "1699529623106753\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 0, 0, 0, 0, 0, 0, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # RED: aic_mac_ratio=0.2 + csv_row5 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'AI_CORE', "1699529623106754\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 28888, 0.2, 0.1, 0.1, 0.7, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # GREEN: aic_mac_ratio=0.85 + csv_row6 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'MIX_AIC', "1699529623106754\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 38888, 0.85, 0.1, 0.1, 0.7, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # YELLOW: aic_mac_ratio=0.64 + csv_row7 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'MIX_AIC', "1699529623106754\t", 3.14, 261.56, 9, 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 48888, 0.64, 0.1, 0.1, 0.7, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + with os.fdopen(os.open(f"{TestNpuSlowAdvice.OUTPUT_DIR}/kernel_details.csv", + os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: + csv_writer = csv.writer(fp) + csv_writer.writerow(csv_header) + csv_writer.writerow(csv_row1) + csv_writer.writerow(csv_row2) + csv_writer.writerow(csv_row3) + csv_writer.writerow(csv_row4) + csv_writer.writerow(csv_row5) + csv_writer.writerow(csv_row6) + csv_writer.writerow(csv_row7) + + def test_run_should_return_empty_when_ascend_pt_path_not_exist(self): + interface = Interface("") + data = interface.get_data('compute', 'npu_slow') + self.assertEqual(0, len(data)) + + def test_run_should_return_empty_when_there_is_no_kernel_details(self): + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + self.assertEqual(0, len(data)) + + def test_run_should_return_7_data_without_call_stack_when_json_not_exist(self): + self.create_kernel_details() + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") + 
self.assertEqual(7, len(data)) + self.assertEqual("", call_stack) + + def test_run_should_return_7_data_with_call_stack_when_new_trace_view_exists(self): + self.create_profiler_info_json() + self.create_kernel_details() + self.create_new_version_trace_view() + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + slow_op_data = data[data["color"] == "RED"] + NpuSlowAdvice.save_to_excel(data, file_path=os.path.join(self.ASCEND_PT_DIR, "slow_op.xlsx")) + call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") + self.assertEqual(7, len(data)) + self.assertEqual(2, len(slow_op_data)) + print(call_stack) + call_stack_res = "/root/torch/module.py\n" \ + "/root/test/slice.py(116)" + self.assertEqual(call_stack_res, call_stack) + + def test_run_should_return_7_data_with_call_stack_when_old_trace_view_exists(self): + self.create_profiler_info_json() + self.create_kernel_details() + self.create_old_version_trace_view() + interface = Interface(self.ASCEND_PT_DIR) + data = interface.get_data('compute', 'npu_slow') + slow_op_data = data[data["color"] == "RED"] + NpuSlowAdvice.save_to_excel(data, file_path=os.path.join(self.ASCEND_PT_DIR, "slow_op.xlsx")) + call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") + self.assertEqual(7, len(data)) + self.assertEqual(2, len(slow_op_data)) + print(call_stack) + call_stack_res = "/root/test/slice.py(116)\n\r\n" \ + "/root/torch/module.py" + self.assertEqual(call_stack_res, call_stack) -- Gitee From 81e7e0b2bfe5f08493dc0ef37480a91977d000cb Mon Sep 17 00:00:00 2001 From: c30056266 Date: Thu, 7 Mar 2024 12:10:31 +0800 Subject: [PATCH 29/62] wait_time_added --- .../overall_performance_comparator.py | 9 ++- .../compare_bean/profiling_info.py | 5 ++ .../profiling_parser/npu_profiling_parser.py | 60 +++++++++++++++++++ 3 files changed, 71 insertions(+), 3 deletions(-) diff --git a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py index bfc631c66c..7ad66c0fa7 100644 --- a/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py +++ b/profiler/compare_tools/compare_backend/comparator/overall_performance_comparator.py @@ -37,9 +37,12 @@ class OverallPerformanceComparator(BaseComparator): self._headers.append('Mem Usage') base_col.append(f'{base_profiling_info.memory_used:.2f}G') comp_col.append(f'{comp_profiling_info.memory_used:.2f}G') - self._headers.extend(['Uncovered Communication Time']) - base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s']) - comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s']) + self._headers.extend(['Uncovered Communication Time(Wait Time)']) + if base_profiling_info.wait_time: + base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s({base_profiling_info.wait_time:.3f}s']) + else: + base_col.extend([f'{base_profiling_info.communication_not_overlapped: .3f}s( / )']) + comp_col.extend([f'{comp_profiling_info.communication_not_overlapped: .3f}s({comp_profiling_info.wait_time:.3f}s)']) if base_profiling_info.sdma_time or comp_profiling_info.sdma_time: self._headers.append('SDMA Time(Num)') base_col.append(f'{base_profiling_info.sdma_time:.3f}s({base_profiling_info.sdma_num})') diff --git a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py 
b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py index 9184c790b7..b100e7ba98 100644 --- a/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py +++ b/profiler/compare_tools/compare_backend/compare_bean/profiling_info.py @@ -18,6 +18,7 @@ class ProfilingInfo: self.fa_num_bwd = 0 self.compute_time = 0.0 self.communication_not_overlapped = 0.0 + self.wait_time = 0.0 self.memory_used = 0.0 self.e2e_time = 0.0 self.sdma_time = 0.0 @@ -33,6 +34,7 @@ class ProfilingInfo: self.vec_time = self.vec_time / 10 ** 6 self.compute_time = self.compute_time / 10 ** 6 self.communication_not_overlapped = self.communication_not_overlapped / 10 ** 6 + self.wait_time = self.wait_time / 10 ** 6 self.e2e_time = self.e2e_time / 10 ** 6 self.sdma_time = self.sdma_time / 10 ** 6 self.scheduling_time = self.scheduling_time / 10 ** 6 @@ -84,6 +86,9 @@ class ProfilingInfo: def update_comm_not_overlap(self, time: float): self.communication_not_overlapped += time + def update_comm_not_overlap_wait_time(self, time: float): + self.wait_time = time + def set_memory_used(self, memory: float): self.memory_used = memory diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py index 1e2f115934..5759b0719a 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py @@ -51,6 +51,7 @@ class NPUProfilingParser(BaseProfilingParser): if self._enable_profiling_compare: func_list.add(self._picking_overlap_analysis_data) func_list.add(self._picking_kernel_event) + func_list.add(self._picking_hccl_event) return list(func_list) def _update_memory_list(self): @@ -101,10 +102,69 @@ class NPUProfilingParser(BaseProfilingParser): self.__parse_kernel_csv() self.__add_sdma_time() self.__add_overlap_analysis_time() + self._picking_notify_wait_event_and_not_overlap_event() + self.__add_overlap_wait_time() self._result_data.overall_metrics.calculate_other_time() self._result_data.overall_metrics.calculate_schedule_time() self._result_data.overall_metrics.trans_time_to_s() + def _picking_notify_wait_event_and_not_overlap_event(self): + self.notify_event_cache = [] + self._not_overlaped_commu_event = [] + for event in self._commu_task_list: + if event.name == 'Notify_Wait' and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_CHECK' \ + and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_ACK': + self.notify_event_cache.append(event) + for event in self._overlap_analysis: + if event.is_comm_not_overlap(): + self._not_overlaped_commu_event.append(event) + self._not_overlaped_commu_event.sort(key=lambda x: x.start_time) + + def __add_overlap_wait_time(self): + notify_wait_event_dict = dict() + for notify_event in self.notify_event_cache: + if notify_event.tid in notify_wait_event_dict: + notify_wait_event_dict[notify_event.tid].append(notify_event) + else: + notify_wait_event_dict[notify_event.tid] = [notify_event] + total_time = 0 + for commu_event in self._not_overlaped_commu_event: + wait_time_list = [] + commu_event_start_time = float(commu_event.start_time) + commu_event_end_time = float(commu_event.start_time) + commu_event.dur + + for plane_id, events in notify_wait_event_dict.items(): + wait_time = 0 + idx = 0 + for notify_event in events: + notify_event_start_time = float(notify_event.start_time) + notify_event_end_time = float(notify_event.start_time) + notify_event.dur + 
if notify_event_start_time < commu_event_start_time and notify_event_end_time > \ + commu_event_end_time: + wait_time = commu_event_end_time - commu_event_start_time + break + elif notify_event_start_time < commu_event_start_time <= notify_event_end_time <= \ + commu_event_end_time: + wait_time += notify_event_end_time - commu_event_start_time + idx += 1 + elif commu_event_start_time <= notify_event_start_time <= commu_event_end_time < \ + notify_event_end_time: + wait_time += commu_event_end_time - notify_event_start_time + break + elif notify_event_start_time >= commu_event_start_time and notify_event_end_time <= \ + commu_event_end_time: + wait_time += notify_event_end_time - notify_event_start_time + idx += 1 + elif notify_event_end_time < commu_event_start_time: + idx += 1 + else: + break + + wait_time_list.append(wait_time) + notify_wait_event_dict[plane_id] = notify_wait_event_dict[plane_id][idx:] + total_time += max(wait_time_list) + self._result_data.overall_metrics.update_comm_not_overlap_wait_time(total_time) + def _picking_hccl_event(self, event: TraceEventBean): if event.pid != self._hccl_pid or not event.is_x_mode(): return False -- Gitee From 8f8a40f3606b55d1e311fc98bff8a55cca1433b0 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Wed, 6 Mar 2024 16:31:09 +0800 Subject: [PATCH 30/62] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=A4=9A=E6=AC=A1?= =?UTF-8?q?=E8=B0=83=E7=94=A8start=E5=92=8Cstop=20dump=E6=8A=A5=E9=94=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../python/ptdbg_ascend/debugger/precision_debugger.py | 8 +++++--- .../ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py | 5 ++--- .../ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py | 1 + 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py index f71a9d4c54..2d1613c47b 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/debugger/precision_debugger.py @@ -1,8 +1,9 @@ import os +from concurrent.futures import ThreadPoolExecutor import torch from ..common.utils import Const, check_switch_valid, generate_compare_script, check_is_npu, print_error_log, \ CompareException, print_warn_log -from ..dump.dump import DumpUtil, acc_cmp_dump, write_to_disk, get_pkl_file_path, reset_module_count, GLOBAL_THREAD_POOL +from ..dump.dump import DumpUtil, acc_cmp_dump, write_to_disk, get_pkl_file_path, reset_module_count from ..dump.utils import set_dump_path, set_dump_switch_print_info, generate_dump_path_str, \ set_dump_switch_config, set_backward_input from ..overflow_check.utils import OverFlowUtil @@ -108,6 +109,7 @@ class PrecisionDebugger: register_hook_core(instance.hook_func, instance.model) instance.first_start = False DumpUtil.dump_switch = "ON" + DumpUtil.dump_thread_pool = ThreadPoolExecutor() OverFlowUtil.overflow_check_switch = "ON" dump_path_str = generate_dump_path_str() set_dump_switch_print_info("ON", DumpUtil.dump_switch_mode, dump_path_str) @@ -130,8 +132,8 @@ class PrecisionDebugger: dump_path_str = generate_dump_path_str() set_dump_switch_print_info("OFF", DumpUtil.dump_switch_mode, dump_path_str) write_to_disk() - if DumpUtil.is_single_rank: - GLOBAL_THREAD_POOL.shutdown(wait=True) + if DumpUtil.is_single_rank and DumpUtil.dump_thread_pool: + 
DumpUtil.dump_thread_pool.shutdown(wait=True) if check_is_npu() and DumpUtil.dump_switch_mode in [Const.ALL, Const.API_STACK, Const.LIST, Const.RANGE, Const.API_LIST]: generate_compare_script(DumpUtil.dump_data_dir, get_pkl_file_path(), DumpUtil.dump_switch_mode) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py index a6b769ff2a..ea61be8636 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py @@ -47,7 +47,6 @@ pkl_name = "" rank = os.getpid() + 100000 multi_output_apis = ["_sort_", "npu_flash_attention"] module_count = {} -GLOBAL_THREAD_POOL = ThreadPoolExecutor() class APIList(list): @@ -186,12 +185,12 @@ def dump_data(prefix, data_info): def thread_dump_data(prefix, data_info): - GLOBAL_THREAD_POOL.submit(dump_data, prefix, data_info) + DumpUtil.dump_thread_pool.submit(dump_data, prefix, data_info) def dump_data_by_rank_count(dump_step, prefix, data_info): print_info_log(f"ptdbg is analyzing rank{rank} api: {prefix}" + " " * 10, end='\r') - if DumpUtil.is_single_rank: + if DumpUtil.is_single_rank and DumpUtil.dump_thread_pool: thread_dump_data(prefix, data_info) else: dump_data(prefix, data_info) diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py index bb0ae82c41..bea18501ae 100644 --- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py +++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/utils.py @@ -87,6 +87,7 @@ class DumpUtil(object): need_replicate = False summary_mode = "all" is_single_rank = None + dump_thread_pool = None @staticmethod -- Gitee From 86f13983b26fa860a2907bb5102d96040e7cc4b9 Mon Sep 17 00:00:00 2001 From: caishangqiu Date: Thu, 7 Mar 2024 14:46:31 +0800 Subject: [PATCH 31/62] =?UTF-8?q?[att=5Fprofiler]=E6=80=A7=E8=83=BD?= =?UTF-8?q?=E6=AF=94=E5=AF=B9=E5=B7=A5=E5=85=B7=E6=94=AF=E6=8C=81nn.module?= =?UTF-8?q?=E7=BA=A7=E5=88=AB=E7=9A=84=E6=AF=94=E5=AF=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/cluster_analyse/README.md | 4 ++-- profiler/compare_tools/README.md | 35 ++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/profiler/cluster_analyse/README.md b/profiler/cluster_analyse/README.md index 2833d293ea..f7646f67c4 100644 --- a/profiler/cluster_analyse/README.md +++ b/profiler/cluster_analyse/README.md @@ -23,7 +23,7 @@ experimental_config = torch_npu.profiler._ExperimentalConfig( 或者具备: -- ascend_pytorch.db +- analysis.db 以上csv、json文件与db文件只能存在一类,否则集群分析工具解析异常。 @@ -113,7 +113,7 @@ K列:Communication(Not Overlapped and Exclude Receive)指剔除recieve算 #### cluster_analysis.db -解析ascend_pytorch.db生成的交付件,当前解析通信类数据,主要包含下面数据: +解析analysis.db生成的交付件,当前解析通信类数据,主要包含下面数据: - ClusterCommAnalyzerTime:集群通信时间信息。 - ClusterCommAnalyzerBandwidth:集群通信带宽信息。 diff --git a/profiler/compare_tools/README.md b/profiler/compare_tools/README.md index 3793679b39..b36ac58097 100644 --- a/profiler/compare_tools/README.md +++ b/profiler/compare_tools/README.md @@ -162,17 +162,34 @@ activities配置仅采集NPU数据,不配置experimental_config参数以及其 ### 算子性能 -算子性能比对结果在performance_comparison_result_*.xlsl中OperatorCompare和OperatorCompare(TOP)的sheet页呈现。 +算子性能比对结果在performance_comparison_result_*.xlsl中OperatorCompare和OperatorCompareStatistic的sheet页呈现。 - 
OperatorCompare(TOP):算子为粒度的统计呈现,按照算子在device上的总耗时与基准算子的差距值(Diff Duration(ms)列)进行逆序。 - OperatorCompare:算子比对的明细展示,可以查看每一个算子对应的kernel详情。 - Diff Ratio:比较算子在device上执行总耗时 / 基准算子在device上执行总耗时,红色代表劣化。 +- Device Duration(us):该算子下发到device上执行的所有kernel耗时的总和。 -#### Device Duration(us) +### nn.Module性能 -``` -该算子下发到device上执行的所有kernel耗时的总和 -``` +nn.Module是所有神经网络模块的基类,使用PyTorch构建神经网络需要继承nn.Module类来实现,性能比对工具支持nn.Module模块级的比对,帮助优化模型结构。 + +当用户采集时开启with_stack开关,会上报python function事件,当比对的双方数据都存在python function的事件时,可进行模块级别的比对。 + +nn.Module性能比对结果在performance_comparison_result_*.xlsl中ModuleCompareStatistic的sheet页呈现。 + +- Module Class:Module名,如nn.Module: Linear。 +- Module Level:Module的层级。 +- Module Name:Module唯一标识名,如/ DynamicNet_0/ Linear_0。 +- Operator Name:框架侧算子名,如aten::add。字段为[ TOTAL ]代表该module的总体情况。 +- Kernel Detail:算子详细信息。 +- Device Self Time(ms):该模块调用的算子(排除子模块)在device侧执行的总耗时,单位ms。 +- Number:该Module或算子被调用的次数。 +- Device Total Time(ms):该模块调用的算子(包含子模块)在device侧执行的总耗时,单位ms。 +- Device Total Time Diff:GPU与NPU的Device Total Time(ms)差值。 +- Device Self Time Diff:GPU与NPU的Device Self Time(ms)差值。 +- Self Time Ratio:GPU与NPU的Device Self Time(ms)比值。 +- Base Call Stack:基准文件模块的调用栈。 +- Comparison Call Stack:比较文件模块的调用栈。 ### 通信性能 @@ -184,7 +201,7 @@ activities配置仅采集NPU数据,不配置experimental_config参数以及其 ### 算子内存 -算子内存比对结果在performance_comparison_result_*.xlsl中MemoryCompare和MemoryCompare(TOP)的sheet页呈现。 +算子内存比对结果在performance_comparison_result_*.xlsl中MemoryCompare和MemoryCompareStatistic的sheet页呈现。 - MemoryCompare(TOP):算子为粒度的统计呈现,按照算子占用的总内存与基准算子的差距值(Diff Memory(MB))进行逆序。 @@ -192,8 +209,4 @@ activities配置仅采集NPU数据,不配置experimental_config参数以及其 - Diff Ratio: 比较算子占用的总内存 / 基准算子占用的总内存,红色代表劣化。 -#### Size(KB) - -``` -该算子占用的device内存大小,单位KB -``` \ No newline at end of file +- Size(KB):该算子占用的device内存大小,单位KB。 -- Gitee From 0012b5d6947dc4792306a73eca607246f9f04c12 Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Thu, 7 Mar 2024 11:25:16 +0800 Subject: [PATCH 32/62] bug_fix_for_module_compare --- .../compare_backend/generator/detail_performance_generator.py | 2 ++ .../compare_backend/profiling_parser/gpu_profiling_parser.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py index 4ae6ebcec2..677b0d18c2 100644 --- a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py +++ b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py @@ -108,6 +108,8 @@ class DetailPerformanceGenerator(BaseGenerator): self._profiling_data_dict.get(Constant.COMPARISON_DATA)).build_module_tree() for index, base_node in enumerate(base_root_node): comparison_node = comparison_root_node[index] if index < len(comparison_root_node) else None + if not base_node or not comparison_node: + continue module_compare_result.extend(self._matching_all_modules(base_node, comparison_node)) return module_compare_result diff --git a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py index 77785d0261..923854bdf7 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py @@ -9,7 +9,7 @@ from compare_backend.utils.constant import Constant class GPUProfilingParser(BaseProfilingParser): CUBE_MARK = 'gemm' - FA_MARK_LIST = 
[['fmha', 'kernel'], ['flash', 'kernel']] + FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel'], ['attention', 'kernel']] SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)'] FLOW_CAT = ("async_gpu", "async_cpu_to_gpu", "ac2g", "async") TORCH_OP_CAT = ("cpu_op", "user_annotation", "cuda_runtime", "operator") @@ -138,7 +138,7 @@ class GPUProfilingParser(BaseProfilingParser): func_set.add(self._picking_kernel_event) if self._enable_operator_compare: func_set.add(self._picking_python_function_event) - func_set .add(self._picking_fwdbwd_flow_event) + func_set.add(self._picking_fwdbwd_flow_event) if self._enable_operator_compare or self._args.max_kernel_num: func_set.add(self._picking_kernel_event) func_set.add(self._picking_flow_event) -- Gitee From 848b648b9a799a508d306ea177de65882abe8e4f Mon Sep 17 00:00:00 2001 From: w00800385 Date: Wed, 6 Mar 2024 12:23:35 +0800 Subject: [PATCH 33/62] =?UTF-8?q?att=E6=94=AF=E6=8C=81DB=E6=A0=BC=E5=BC=8F?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E8=BE=93=E5=85=A5=E4=B8=8E=E5=A4=84=E7=90=86?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E6=AD=A3codec=20heck=E5=91=8A=E8=AD=A6,?= =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../analysis/analysis_facade.py | 7 +- .../analysis/base_analysis_json.py | 64 ++++++ .../analysis/communication/__init__.py | 0 .../communication/comm_analysis_generator.py | 17 ++ .../communication_analysis_db.py | 27 +++ .../communication_analysis_json.py} | 165 +------------- .../analysis/communication_matrix/__init__.py | 0 .../comm_matrix_analysis_db.py | 20 ++ .../comm_matrix_analysis_json.py | 90 ++++++++ .../comm_matrix_generator.py | 17 ++ .../analysis/step_trace_time_analysis.py | 45 +++- profiler/cluster_analyse/cluster_analysis.py | 31 ++- .../cluster_analyse/common_func/constant.py | 20 +- .../cluster_analyse/common_func/db_manager.py | 206 +++++++++++++++++ .../common_func/empty_class.py | 20 ++ .../common_func/file_manager.py | 10 + .../common_func/table_constant.py | 27 +++ .../common_func/tables_config.py | 63 ++++++ .../base_communication_group.py | 138 ++++++++++++ .../communication_db_group.py | 25 +++ .../communication_group_generator.py | 209 +----------------- .../communication_json_group.py | 132 +++++++++++ 22 files changed, 954 insertions(+), 379 deletions(-) create mode 100644 profiler/cluster_analyse/analysis/base_analysis_json.py create mode 100644 profiler/cluster_analyse/analysis/communication/__init__.py create mode 100644 profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py create mode 100644 profiler/cluster_analyse/analysis/communication/communication_analysis_db.py rename profiler/cluster_analyse/analysis/{communication_analysis.py => communication/communication_analysis_json.py} (37%) create mode 100644 profiler/cluster_analyse/analysis/communication_matrix/__init__.py create mode 100644 profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py create mode 100644 profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_json.py create mode 100644 profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py create mode 100644 profiler/cluster_analyse/common_func/db_manager.py create mode 100644 profiler/cluster_analyse/common_func/empty_class.py create mode 100644 profiler/cluster_analyse/common_func/table_constant.py create mode 100644 profiler/cluster_analyse/common_func/tables_config.py create mode 100644 
profiler/cluster_analyse/communication_group/base_communication_group.py create mode 100644 profiler/cluster_analyse/communication_group/communication_db_group.py create mode 100644 profiler/cluster_analyse/communication_group/communication_json_group.py diff --git a/profiler/cluster_analyse/analysis/analysis_facade.py b/profiler/cluster_analyse/analysis/analysis_facade.py index b383a704df..0b870bbaaf 100644 --- a/profiler/cluster_analyse/analysis/analysis_facade.py +++ b/profiler/cluster_analyse/analysis/analysis_facade.py @@ -14,14 +14,13 @@ # limitations under the License. from multiprocessing import Process -from common_func.constant import Constant -from analysis.communication_analysis import CommunicationAnalysis +from analysis.communication.comm_analysis_generator import CommunicationAnalysisGenerator +from analysis.communication_matrix.comm_matrix_generator import CommMatrixAnalysisGenerator from analysis.step_trace_time_analysis import StepTraceTimeAnalysis -from analysis.communication_analysis import CommMatrixAnalysis class AnalysisFacade: - analysis_module = {CommunicationAnalysis, StepTraceTimeAnalysis, CommMatrixAnalysis} + analysis_module = {CommunicationAnalysisGenerator, StepTraceTimeAnalysis, CommMatrixAnalysisGenerator} def __init__(self, params: dict): self.params = params diff --git a/profiler/cluster_analyse/analysis/base_analysis_json.py b/profiler/cluster_analyse/analysis/base_analysis_json.py new file mode 100644 index 0000000000..3df54b0ae2 --- /dev/null +++ b/profiler/cluster_analyse/analysis/base_analysis_json.py @@ -0,0 +1,64 @@ +from abc import abstractmethod +from common_func.constant import Constant +from common_func.file_manager import FileManager + + +class BaseAnalysisJson: + + def __init__(self, param: dict): + self.collection_path = param.get(Constant.COLLECTION_PATH) + self.data_map = param.get(Constant.DATA_MAP) + self.communication_ops = [] + self.collective_group_dict = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) + self.comm_ops_struct = {} + + @staticmethod + def compute_ratio(dividend: float, divisor: float): + if abs(divisor) < Constant.EPS: + return 0 + else: + return round(dividend / divisor, 4) + + @staticmethod + def check_add_op(op_name: str): + """ + 兼容2个版本,判断是否需要将此算子信息相加 + """ + stat_list = ["middle", "top", "bottom", "total"] + total = "total" + for stat_name in stat_list: + if stat_name in op_name: + if stat_name != total: + return False + return True + + @abstractmethod + def run(self): + pass + + def dump_data(self): + if not self.comm_ops_struct: + print("[WARNING] There is no final comm ops data generated") + return + output_comm_data = {} + for key in self.comm_ops_struct: + output_comm_data[str(key)] = self.comm_ops_struct.get(key) + FileManager.create_json_file(self.collection_path, output_comm_data, self.SAVED_JSON) + + def split_op_by_group(self): + for single_op in self.communication_ops: + if single_op.get(Constant.COMM_OP_TYPE) == Constant.P2P: + rank_tup = Constant.P2P + else: + rank_tup = tuple(self.collective_group_dict.get(single_op.get(Constant.GROUP_NAME), [])) + rank_id = single_op.get(Constant.RANK_ID, 'N/A') + step_id = single_op.get(Constant.STEP_ID, 'N/A') + op_name = single_op.get(Constant.COMM_OP_NAME, 'N/A') + op_info = single_op.get(Constant.COMM_OP_INFO) + self.comm_ops_struct.setdefault(rank_tup, {}).setdefault(step_id, {}).\ + setdefault(op_name, {}).setdefault(rank_id, op_info) + + def combine_ops_total_info(self): + for rank_tup, group_dict in self.comm_ops_struct.items(): + for 
step_id, communication_ops in group_dict.items(): + self.compute_total_info(communication_ops) diff --git a/profiler/cluster_analyse/analysis/communication/__init__.py b/profiler/cluster_analyse/analysis/communication/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py b/profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py new file mode 100644 index 0000000000..4b737b5da4 --- /dev/null +++ b/profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py @@ -0,0 +1,17 @@ +from analysis.communication.communication_analysis_db import CommunicationAnalysisDB +from analysis.communication.communication_analysis_json import CommunicationAnalysisJson +from common_func.constant import Constant + + +class CommunicationAnalysisGenerator: + + GROUP_MAP = { + Constant.DB: CommunicationAnalysisDB, + Constant.TEXT: CommunicationAnalysisJson + } + + def __init__(self, params: dict): + self.generator = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))(params) + + def run(self): + self.generator.run() diff --git a/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py b/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py new file mode 100644 index 0000000000..0f7085ad9c --- /dev/null +++ b/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py @@ -0,0 +1,27 @@ +import os + +from analysis.base_analysis_json import BaseAnalysisJson +from common_func.db_manager import DBManager +from common_func.constant import Constant +from common_func.table_constant import TableConstant + + +class CommunicationAnalysisDB: + COMMUNICATION_BANDWIDTH_TABLE = "ClusterCommAnalyzerBandwidth" + COMMUNICATION_TIME_TABLE = "ClusterCommAnalyzerTime" + TIME_EXTENSION = "time" + RANK_BAND_TYPE = "{}-{}" + + def __init__(self, params: any): + self.collection_path = params.get(Constant.COLLECTION_PATH) + self.communication_time_info = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.COMMUNICATION_TIME_INFO) + self.communication_bandwidth_info = params.get(Constant.COMM_DATA_DICT, {}).get( + Constant.COMMUNICATION_BANDWIDTH_INFO) + self.collective_group_dict = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) + self.comm_time_struct = {} + self.comm_bandwidth_struct = {} + self.res_comm_time = [] + self.res_comm_bandwidth = [] + + def run(self): + pass \ No newline at end of file diff --git a/profiler/cluster_analyse/analysis/communication_analysis.py b/profiler/cluster_analyse/analysis/communication/communication_analysis_json.py similarity index 37% rename from profiler/cluster_analyse/analysis/communication_analysis.py rename to profiler/cluster_analyse/analysis/communication/communication_analysis_json.py index 88ac073a9c..7fa680fe56 100644 --- a/profiler/cluster_analyse/analysis/communication_analysis.py +++ b/profiler/cluster_analyse/analysis/communication/communication_analysis_json.py @@ -1,74 +1,10 @@ -# Copyright (c) 2023, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
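For orientation on the refactor above: BaseAnalysisJson.split_op_by_group organizes per-op communication records into a nested mapping keyed by communication group, step, op name and rank. The snippet below is a standalone sketch of that structure on toy records, not part of the patch; the field names ("type", "group_name", "step", "op_name", "rank_id", "info") are simplified stand-ins rather than the tool's actual constants.

```
def split_op_by_group(ops, collective_groups):
    """Group op records into {rank_set: {step: {op_name: {rank_id: info}}}}."""
    result = {}
    for op in ops:
        if op["type"] == "p2p":
            rank_tup = "p2p"  # p2p ops are pooled under a single key
        else:
            rank_tup = tuple(collective_groups.get(op["group_name"], []))
        result.setdefault(rank_tup, {}) \
              .setdefault(op["step"], {}) \
              .setdefault(op["op_name"], {})[op["rank_id"]] = op["info"]
    return result


ops = [
    {"type": "collective", "group_name": "g1", "step": "step0",
     "op_name": "allreduce@g1", "rank_id": 0, "info": {"elapse_time_ms": 1.2}},
    {"type": "collective", "group_name": "g1", "step": "step0",
     "op_name": "allreduce@g1", "rank_id": 1, "info": {"elapse_time_ms": 1.5}},
]
print(split_op_by_group(ops, {"g1": [0, 1]}))
# {(0, 1): {'step0': {'allreduce@g1': {0: {...}, 1: {...}}}}}
```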
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - from collections import defaultdict -from abc import abstractmethod +from analysis.base_analysis_json import BaseAnalysisJson from common_func.constant import Constant -from common_func.file_manager import FileManager - -class BaseCommAnalysis: - def __init__(self, param: dict): - self.collection_path = param.get(Constant.COLLECTION_PATH) - self.data_map = param.get(Constant.DATA_MAP) - self.communication_ops = [] - self.collective_group_dict = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) - self.comm_ops_struct = {} - - @staticmethod - def compute_ratio(dividend: float, divisor: float): - if abs(divisor) < Constant.EPS: - return 0 - else: - return round(dividend / divisor, 4) - - @abstractmethod - def run(self): - pass - - def dump_data(self): - if not self.comm_ops_struct: - print("[WARNING] There is no final comm ops data generated") - return - output_comm_data = {} - for key in self.comm_ops_struct: - output_comm_data[str(key)] = self.comm_ops_struct.get(key) - FileManager.create_json_file(self.collection_path, output_comm_data, self.SAVED_JSON) - - def split_op_by_group(self): - for single_op in self.communication_ops: - if single_op.get(Constant.COMM_OP_TYPE) == Constant.P2P: - rank_tup = Constant.P2P - else: - rank_tup = tuple(self.collective_group_dict.get(single_op.get(Constant.GROUP_NAME), [])) - rank_id = single_op.get(Constant.RANK_ID, 'N/A') - step_id = single_op.get(Constant.STEP_ID, 'N/A') - op_name = single_op.get(Constant.COMM_OP_NAME, 'N/A') - op_info = single_op.get(Constant.COMM_OP_INFO) - self.comm_ops_struct.setdefault(rank_tup, {}).setdefault(step_id, {}).\ - setdefault(op_name, {}).setdefault(rank_id, op_info) - - def combine_ops_total_info(self): - for rank_tup, group_dict in self.comm_ops_struct.items(): - for step_id, communication_ops in group_dict.items(): - self.compute_total_info(communication_ops) - - -class CommunicationAnalysis(BaseCommAnalysis): +class CommunicationAnalysisJson(BaseAnalysisJson): SAVED_JSON = "cluster_communication.json" def __init__(self, param: dict): @@ -144,100 +80,3 @@ class CommunicationAnalysis(BaseCommAnalysis): bandwidth_dict[Constant.BANDWIDTH_GB_S] = \ self.compute_ratio(bandwidth_dict.get(Constant.TRANSIT_SIZE_MB, 0), bandwidth_dict.get(Constant.TRANSIT_TIME_MS, 0)) - - -class CommMatrixAnalysis(BaseCommAnalysis): - SAVED_JSON = "cluster_communication_matrix.json" - STAT_LIST = ['middle', 'top', 'bottom', 'total'] - TOTAL = 'total' - - def __init__(self, param: dict): - super().__init__(param) - self.communication_ops = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.MATRIX_OPS) - - @staticmethod - def combine_link(link_info_dict: dict, single_link_dict: dict): - link_info_dict[Constant.TRANSPORT_TYPE] = single_link_dict.get(Constant.TRANSPORT_TYPE) - link_info_dict[Constant.OP_NAME] = single_link_dict.get(Constant.OP_NAME, '') - link_info_dict[Constant.TRANSIT_TIME_MS] += single_link_dict.get(Constant.TRANSIT_TIME_MS, 0) - link_info_dict[Constant.TRANSIT_SIZE_MB] += single_link_dict.get(Constant.TRANSIT_SIZE_MB, 0) - - def run(self): - if not self.communication_ops: - return - 
self.split_op_by_group() - self.combine_ops_total_info() - self.dump_data() - - def compute_total_info(self, step_dict: dict): - self.merge_same_links(step_dict) - self.combine_link_info(step_dict) - - def merge_same_links(self, step_dict: dict): - def process_link_key(): - for link_key in rank_dict: - if '-' not in link_key: - print(f"[WARNING] {op_name} has an invalid link key {link_key}!") - break - src_rank = link_key.split('-')[0] - dst_rank = link_key.split('-')[1] - if src_rank == dst_rank: - if src_rank not in project_local_global_rank_map: - project_local_global_rank_map[src_rank] = rank_id - elif project_local_global_rank_map.get(src_rank) != rank_id: - print(f"[WARNING] In the same communication group, local ranks projecting to global ranks repeat!") - self.combine_link(link_info[link_key], rank_dict[link_key]) - - def convert_local_to_global_rank(): - tmp_link = {} - for link_key, link_dict in link_info.items(): - src_rank = link_key.split('-')[0] - dst_rank = link_key.split('-')[1] - src_rank = project_local_global_rank_map[src_rank] \ - if src_rank in project_local_global_rank_map else src_rank - dst_rank = project_local_global_rank_map[dst_rank] \ - if dst_rank in project_local_global_rank_map else dst_rank - link_dict[Constant.BANDWIDTH_GB_S] = \ - self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), - link_dict.get(Constant.TRANSIT_TIME_MS, 0)) - tmp_link[f"{src_rank}-{dst_rank}"] = link_dict - return tmp_link - - project_local_global_rank_map = dict() - for op_name, op_dict in step_dict.items(): - link_info = defaultdict(lambda: { - Constant.TRANSPORT_TYPE: '', - Constant.TRANSIT_TIME_MS: 0, - Constant.TRANSIT_SIZE_MB: 0, - Constant.OP_NAME: '' - }) - for rank_id, rank_dict in op_dict.items(): - process_link_key() - step_dict[op_name] = convert_local_to_global_rank() - - def combine_link_info(self, step_dict: dict): - total_op_info = defaultdict(lambda: { - Constant.TRANSPORT_TYPE: '', - Constant.TRANSIT_TIME_MS: 0, - Constant.TRANSIT_SIZE_MB: 0, - Constant.OP_NAME: '' - }) - for op_name, op_dict in step_dict.items(): - if self.check_add_op(op_name): - for link_key, link_dict in op_dict.items(): - self.combine_link(total_op_info[link_key], link_dict) - for link_key, link_dict in total_op_info.items(): - link_dict[Constant.BANDWIDTH_GB_S] = \ - self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), - link_dict.get(Constant.TRANSIT_TIME_MS, 0)) - step_dict[Constant.TOTAL_OP_INFO] = total_op_info - - def check_add_op(self: any, op_name: str): - """ - 兼容2个版本,判断是否需要将此算子信息相加 - """ - for stat_name in self.STAT_LIST: - if stat_name in op_name: - if stat_name != self.TOTAL: - return False - return True diff --git a/profiler/cluster_analyse/analysis/communication_matrix/__init__.py b/profiler/cluster_analyse/analysis/communication_matrix/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py new file mode 100644 index 0000000000..ea02c990bf --- /dev/null +++ b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py @@ -0,0 +1,20 @@ +import os + +from analysis.base_analysis_json import BaseAnalysisJson +from common_func.db_manager import DBManager +from common_func.constant import Constant +from common_func.table_constant import TableConstant + + +class CommMatrixAnalysisDB: + COMMUNICATION_MATRIX_TABLE = "ClusterCommAnalyzerMatrix" + + def __init__(self, 
params: any): + self.collection_path = params.get(Constant.COLLECTION_PATH) + self.matrix_info = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.MATRIX_OPS) + self.collective_group_dict = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) + self.comm_matrix_struct = {} + self.res_comm_matrix = [] + + def run(self): + pass \ No newline at end of file diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_json.py b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_json.py new file mode 100644 index 0000000000..7baca7e928 --- /dev/null +++ b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_json.py @@ -0,0 +1,90 @@ +from collections import defaultdict + +from analysis.base_analysis_json import BaseAnalysisJson +from common_func.constant import Constant + + +class CommMatrixAnalysisJson(BaseAnalysisJson): + SAVED_JSON = "cluster_communication_matrix.json" + + def __init__(self, param: dict): + super().__init__(param) + self.communication_ops = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.MATRIX_OPS) + + @staticmethod + def combine_link(link_info_dict: dict, single_link_dict: dict): + link_info_dict[Constant.TRANSPORT_TYPE] = single_link_dict.get(Constant.TRANSPORT_TYPE) + link_info_dict[Constant.OP_NAME] = single_link_dict.get(Constant.OP_NAME, '') + link_info_dict[Constant.TRANSIT_TIME_MS] += single_link_dict.get(Constant.TRANSIT_TIME_MS, 0) + link_info_dict[Constant.TRANSIT_SIZE_MB] += single_link_dict.get(Constant.TRANSIT_SIZE_MB, 0) + + def run(self): + if not self.communication_ops: + return + self.split_op_by_group() + self.combine_ops_total_info() + self.dump_data() + + def compute_total_info(self, step_dict: dict): + self.merge_same_links(step_dict) + self.combine_link_info(step_dict) + + def merge_same_links(self, step_dict: dict): + def process_link_key(): + for link_key in rank_dict: + if '-' not in link_key: + print(f"[WARNING] {op_name} has an invalid link key {link_key}!") + break + src_rank = link_key.split('-')[0] + dst_rank = link_key.split('-')[1] + if src_rank == dst_rank: + if src_rank not in project_local_global_rank_map: + project_local_global_rank_map[src_rank] = rank_id + elif project_local_global_rank_map.get(src_rank) != rank_id: + print(f"[WARNING] In the same communication group, local ranks projecting to global ranks " + f"repeat!") + self.combine_link(link_info[link_key], rank_dict[link_key]) + + def convert_local_to_global_rank(): + tmp_link = {} + for link_key, link_dict in link_info.items(): + src_rank = link_key.split('-')[0] + dst_rank = link_key.split('-')[1] + src_rank = project_local_global_rank_map[src_rank] \ + if src_rank in project_local_global_rank_map else src_rank + dst_rank = project_local_global_rank_map[dst_rank] \ + if dst_rank in project_local_global_rank_map else dst_rank + link_dict[Constant.BANDWIDTH_GB_S] = \ + self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), + link_dict.get(Constant.TRANSIT_TIME_MS, 0)) + tmp_link[f"{src_rank}-{dst_rank}"] = link_dict + return tmp_link + + project_local_global_rank_map = dict() + for op_name, op_dict in step_dict.items(): + link_info = defaultdict(lambda: { + Constant.TRANSPORT_TYPE: '', + Constant.TRANSIT_TIME_MS: 0, + Constant.TRANSIT_SIZE_MB: 0, + Constant.OP_NAME: '' + }) + for rank_id, rank_dict in op_dict.items(): + process_link_key() + step_dict[op_name] = convert_local_to_global_rank() + + def combine_link_info(self, step_dict: dict): + total_op_info = 
defaultdict(lambda: { + Constant.TRANSPORT_TYPE: '', + Constant.TRANSIT_TIME_MS: 0, + Constant.TRANSIT_SIZE_MB: 0, + Constant.OP_NAME: '' + }) + for op_name, op_dict in step_dict.items(): + if self.check_add_op(op_name): + for link_key, link_dict in op_dict.items(): + self.combine_link(total_op_info[link_key], link_dict) + for link_key, link_dict in total_op_info.items(): + link_dict[Constant.BANDWIDTH_GB_S] = \ + self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), + link_dict.get(Constant.TRANSIT_TIME_MS, 0)) + step_dict[Constant.TOTAL_OP_INFO] = total_op_info diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py new file mode 100644 index 0000000000..03a1826955 --- /dev/null +++ b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py @@ -0,0 +1,17 @@ +from analysis.communication_matrix.comm_matrix_analysis_db import CommMatrixAnalysisDB +from analysis.communication_matrix.comm_matrix_analysis_json import CommMatrixAnalysisJson +from common_func.constant import Constant + + +class CommMatrixAnalysisGenerator: + + GROUP_MAP = { + Constant.DB: CommMatrixAnalysisDB, + Constant.TEXT: CommMatrixAnalysisJson + } + + def __init__(self, params: dict): + self.generator = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))(params) + + def run(self): + self.generator.run() diff --git a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py index d24a7f1fe6..20a71df3c5 100644 --- a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py +++ b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py @@ -14,8 +14,8 @@ # limitations under the License. 
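In CommMatrixAnalysisJson above, merge_same_links infers a local-to-global rank projection from self-links (src == dst) reported by each global rank, then rewrites every link key through that projection before accumulating traffic. The following is a standalone sketch of that idea on toy data, with link values simplified to plain transit sizes rather than the tool's link dictionaries.

```
def project_links(op_dict):
    """op_dict: {global_rank_id: {"src-dst": size_mb}} where src/dst are local ranks."""
    local_to_global = {}
    # A self-link "k-k" reported by global rank r means local rank k maps to r.
    for rank_id, links in op_dict.items():
        for key in links:
            src, dst = key.split("-")
            if src == dst:
                local_to_global.setdefault(src, rank_id)
    merged = {}
    for links in op_dict.values():
        for key, size in links.items():
            src, dst = key.split("-")
            src = local_to_global.get(src, src)
            dst = local_to_global.get(dst, dst)
            new_key = f"{src}-{dst}"
            merged[new_key] = merged.get(new_key, 0) + size
    return merged


print(project_links({4: {"0-0": 16.0, "0-1": 8.0}, 5: {"1-1": 16.0, "1-0": 8.0}}))
# {'4-4': 16.0, '4-5': 8.0, '5-5': 16.0, '5-4': 8.0}
```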
import os -from collections import defaultdict +from common_func.db_manager import DBManager from common_func.constant import Constant from common_func.file_manager import FileManager from prof_bean.step_trace_time_bean import StepTraceTimeBean @@ -23,6 +23,7 @@ from prof_bean.step_trace_time_bean import StepTraceTimeBean class StepTraceTimeAnalysis: CLUSTER_TRACE_TIME_CSV = "cluster_step_trace_time.csv" + CLUSTER_TRACE_TIME_TABLE = "ClusterStepTraceTime" def __init__(self, param: dict): self.collection_path = param.get(Constant.COLLECTION_PATH) @@ -30,6 +31,7 @@ class StepTraceTimeAnalysis: self.communication_group = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COMMUNICATION_GROUP) self.step_time_dict = {} self.step_data_list = [] + self.data_type = param.get(Constant.DATA_TYPE) @staticmethod def get_max_data_row(data_group_list: list): @@ -51,21 +53,44 @@ class StepTraceTimeAnalysis: def dump_data(self): if not self.step_data_list: print("[WARNING] Can't get step time info!") - headers = self.get_headers() - FileManager.create_csv_file(self.collection_path, self.step_data_list, self.CLUSTER_TRACE_TIME_CSV, headers) + if self.data_type == Constant.TEXT: + headers = self.get_headers() + FileManager.create_csv_file(self.collection_path, self.step_data_list, self.CLUSTER_TRACE_TIME_CSV, headers) + else: + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.CLUSTER_TRACE_TIME_TABLE) + conn, cursor = DBManager.create_connect_db(result_db) + sql = "insert into {} values ({value})".format(self.CLUSTER_TRACE_TIME_TABLE, + value="?," * (len(self.step_data_list[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, self.step_data_list) + DBManager.destroy_db_connect(conn, cursor) def load_step_trace_time_data(self): for rank_id, profiling_dir_path in self.data_map.items(): - step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV) - if step_time_file: - self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean) + if self.data_type == Constant.TEXT: + step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV) + if step_time_file: + self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean) + else: + step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, + Constant.DB_COMMUNICATION_ANALYZER) + if step_time_file and DBManager.check_tables_in_db(step_time_file, Constant.TABLE_STEP_TRACE): + conn, cursor = DBManager.create_connect_db(step_time_file) + sql = "select * from {0}".format(Constant.TABLE_STEP_TRACE) + data = DBManager.fetch_all_data(cursor, sql, is_dict=False) + self.step_time_dict[rank_id] = data + DBManager.destroy_db_connect(conn, cursor) if not self.step_time_dict.get(rank_id): print(f"[WARNING] Rank {rank_id} does not have a valid step_trace_time.json.") def analyze_step_time(self): for rank_id, data_bean_list in self.step_time_dict.items(): for data_bean in data_bean_list: - self.step_data_list.append([data_bean.step, Constant.RANK, rank_id] + data_bean.row) + if self.data_type == Constant.TEXT: + self.step_data_list.append([data_bean.step, Constant.RANK, rank_id] + data_bean.row) + else: + self.step_data_list.append([data_bean[0], Constant.RANK, rank_id] + list(data_bean[1:])) stage_list = self.communication_group.get(Constant.P2P) if not stage_list: return @@ 
-80,7 +105,11 @@ class StepTraceTimeAnalysis: step_group_dict.setdefault(key, []).append(data_list[3:]) for key, data_group_list in step_group_dict.items(): - self.step_data_list.append([key[0], Constant.STAGE, key[1]] + self.get_max_data_row(data_group_list)) + if self.data_type == Constant.TEXT: + self.step_data_list.append([key[0], Constant.STAGE, key[1]] + self.get_max_data_row(data_group_list)) + else: + index = "(" + ",".join(str(i) for i in key[1]) + ")" + self.step_data_list.append([key[0], Constant.STAGE, index] + self.get_max_data_row(data_group_list)) def get_headers(self): if self.step_time_dict: diff --git a/profiler/cluster_analyse/cluster_analysis.py b/profiler/cluster_analyse/cluster_analysis.py index e07cac1703..68eae526fb 100644 --- a/profiler/cluster_analyse/cluster_analysis.py +++ b/profiler/cluster_analyse/cluster_analysis.py @@ -14,6 +14,7 @@ # limitations under the License. import argparse +import glob import os from cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor @@ -28,6 +29,8 @@ from analysis.analysis_facade import AnalysisFacade class Interface: ASCEND_PT = "ascend_pt" ASCEND_MS = "ascend_ms" + DB_RESULT_INFO = "*.db" + ALL_RESULT_INFO = "*.*" def __init__(self, params: dict): self.collection_path = PathManager.get_realpath(params.get(Constant.COLLECTION_PATH)) @@ -38,6 +41,25 @@ class Interface: self.communication_ops = [] self.matrix_ops = [] + def check_db_or_other_files(self, data_map: dict) -> tuple: + type_db_count = 0 + type_text_count = 0 + for _, folder_path in data_map.items(): + folder_path = os.path.join(folder_path, Constant.SINGLE_OUTPUT) + db_files = glob.glob(os.path.join(folder_path, self.DB_RESULT_INFO)) + all_files = glob.glob(os.path.join(folder_path, self.ALL_RESULT_INFO)) + if all_files and db_files and len(all_files) != len(db_files): + return False, None + if db_files: + type_db_count += 1 + else: + type_text_count += 1 + if type_db_count == len(data_map): + return True, Constant.DB + if type_text_count == len(data_map): + return True, Constant.TEXT + return False, None + def allocate_prof_data(self): ascend_pt_dirs = [] ascend_ms_dirs = [] @@ -51,7 +73,7 @@ class Interface: ms_data_map = MindsporeDataPreprocessor(ascend_ms_dirs).get_data_map() if pt_data_map and ms_data_map: print("[ERROR] Can not analyze pytorch and mindspore meantime.") - return[] + return [] return pt_data_map if pt_data_map else ms_data_map def run(self): @@ -62,10 +84,15 @@ class Interface: if not data_map: print("[WARNING] Can not get rank info or profiling data.") return + is_valid, data_type = self.check_db_or_other_files(data_map) + if not is_valid: + print("[WARNING] The current folder contains both DB and other files. 
Please check.") + return params = { Constant.COLLECTION_PATH: self.collection_path, Constant.DATA_MAP: data_map, - Constant.ANALYSIS_MODE: self.analysis_mode + Constant.ANALYSIS_MODE: self.analysis_mode, + Constant.DATA_TYPE: data_type } comm_data_dict = CommunicationGroupGenerator(params).generate() params[Constant.COMM_DATA_DICT] = comm_data_dict diff --git a/profiler/cluster_analyse/common_func/constant.py b/profiler/cluster_analyse/common_func/constant.py index e426a9d225..71caee40db 100644 --- a/profiler/cluster_analyse/common_func/constant.py +++ b/profiler/cluster_analyse/common_func/constant.py @@ -30,6 +30,7 @@ class Constant(object): MAX_JSON_SIZE = 1024 * 1024 * 1024 * 10 MAX_CSV_SIZE = 1024 * 1024 * 1024 * 5 MAX_PATH_LENGTH = 4096 + MAX_READ_DB_FILE_BYTES = 1024 * 1024 * 1024 * 8 # communication P2P = "p2p" @@ -66,11 +67,12 @@ class Constant(object): COMMUNICATION_GROUP = "communication_group" TRANSPORT_TYPE = "Transport Type" COMM_DATA_DICT = "comm_data_dict" + DATA_TYPE = "data_type" ANALYSIS_MODE = "analysis_mode" # step time - RANK = 'rank' - STAGE = 'stage' + RANK = "rank" + STAGE = "stage" # epsilon EPS = 1e-15 @@ -78,3 +80,17 @@ class Constant(object): # file suffix JSON_SUFFIX = ".json" CSV_SUFFIX = ".csv" + + # result files type + TEXT = "text" + DB = "db" + + # db name + DB_COMMUNICATION_ANALYZER = "analysis.db" + DB_CLUSTER_COMMUNICATION_ANALYZER = "cluster_analysis.db" + + # db tables + TABLE_COMM_ANALYZER_BANDWIDTH = "CommAnalyzerBandwidth" + TABLE_COMM_ANALYZER_TIME = "CommAnalyzerTime" + TABLE_COMM_ANALYZER_MATRIX = "CommAnalyzerMatrix" + TABLE_STEP_TRACE = "StepTraceTime" diff --git a/profiler/cluster_analyse/common_func/db_manager.py b/profiler/cluster_analyse/common_func/db_manager.py new file mode 100644 index 0000000000..f19bc15dc8 --- /dev/null +++ b/profiler/cluster_analyse/common_func/db_manager.py @@ -0,0 +1,206 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
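The DBManager introduced below is a thin wrapper over Python's built-in sqlite3 module: create the table if it does not exist, bulk-insert with executemany, and page query results with fetchmany. The standalone sketch that follows shows that underlying pattern against an in-memory database with a hypothetical demo_step_trace table; it is not the tool's own schema or API.

```
import sqlite3

conn = sqlite3.connect(":memory:")  # the tool opens cluster_analysis.db instead
curs = conn.cursor()
# "index" is an SQL keyword, which is why sql_generate_table quotes that column name
curs.execute('CREATE TABLE IF NOT EXISTS demo_step_trace '
             '(step TEXT, type TEXT, "index" TEXT, computing NUMERIC)')
rows = [("0", "rank", "0", 12.5), ("0", "rank", "1", 13.1)]
conn.executemany("INSERT INTO demo_step_trace VALUES (?,?,?,?)", rows)
conn.commit()

# fetch_all_data pages with fetchmany() so large tables are not read in one shot
curs.execute("SELECT * FROM demo_step_trace")
while True:
    batch = curs.fetchmany(2)
    if not batch:
        break
    print(batch)

curs.close()
conn.close()
```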
+ +import os +import sqlite3 + +from common_func.constant import Constant +from common_func.empty_class import EmptyClass +from common_func.file_manager import check_db_path_valid +from common_func.tables_config import TablesConfig + + +class DBManager: + """ + class to manage DB operation + """ + FETCH_SIZE = 10000 + INSERT_SIZE = 10000 + MAX_ROW_COUNT = 100000000 + + @staticmethod + def create_connect_db(db_path: str) -> tuple: + """ + create and connect database + """ + if check_db_path_valid(db_path): + try: + conn = sqlite3.connect(db_path) + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return EmptyClass("empty conn"), EmptyClass("empty curs") + try: + if isinstance(conn, sqlite3.Connection): + curs = conn.cursor() + os.chmod(db_path, Constant.FILE_AUTHORITY) + return conn, curs + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return EmptyClass("empty conn"), EmptyClass("empty curs") + return EmptyClass("empty conn"), EmptyClass("empty curs") + + @staticmethod + def destroy_db_connect(conn: any, curs: any) -> None: + """ + destroy db connection + """ + try: + if isinstance(curs, sqlite3.Cursor): + curs.close() + except sqlite3.Error as err: + print(f"[ERROR] {err}") + try: + if isinstance(conn, sqlite3.Connection): + conn.close() + except sqlite3.Error as err: + print(f"[ERROR] {err}") + + @staticmethod + def judge_table_exists(curs: any, table_name: str) -> any: + """ + judge table exists + """ + if not isinstance(curs, sqlite3.Cursor): + return False + try: + curs.execute("select count(*) from sqlite_master where type='table' and name=?", (table_name,)) + return curs.fetchone()[0] + except sqlite3.Error as err: + print("[ERROR] {}".format(err)) + return False + + @staticmethod + def sql_generate_table(table_map: str): + header_with_type_begin = "(" + header_with_type_end = ")" + header_with_type_list = [] + if table_map in TablesConfig.DATA: + items = TablesConfig.DATA[table_map] + for item in items: + if item[0] == "index": + header_with_type_list.append('"' + item[0] + '" ' + item[1].split(",")[0]) + else: + header_with_type_list.append(item[0] + ' ' + item[1].split(",")[0]) + header_with_type_begin += ",".join(header_with_type_list) + header_with_type_begin += header_with_type_end + return header_with_type_begin + return "" + + @classmethod + def check_tables_in_db(cls, db_path: any, *tables: any) -> bool: + if check_db_path_valid(db_path, True): + conn, curs = cls.create_connect_db(db_path) + if not (conn and curs): + return False + res = True + for table in tables: + if not cls.judge_table_exists(curs, table): + res = False + break + cls.destroy_db_connect(conn, curs) + return res + return False + + @classmethod + def create_tables(cls, db_path: any, *tables: any) -> bool: + conn, curs = cls.create_connect_db(db_path) + for table_name in tables: + if not cls.judge_table_exists(curs, table_name): + table_map = "{0}Map".format(table_name) + header_with_type = cls.sql_generate_table(table_map) + sql = "CREATE TABLE IF NOT EXISTS " + table_name + header_with_type + cls.execute_sql(conn, sql) + + @staticmethod + def execute_sql(conn: any, sql: str, params: any = None) -> bool: + """ + execute sql + """ + try: + if isinstance(conn, sqlite3.Connection): + if params: + conn.cursor().execute(sql, params) + else: + conn.cursor().execute(sql) + conn.commit() + return True + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return False + print("[ERROR] conn is invalid param") + return False + + @staticmethod + def executemany_sql(conn: any, sql: str, params: 
any) -> bool: + """ + execute many sql once + """ + try: + if isinstance(conn, sqlite3.Connection): + conn.cursor().executemany(sql, params) + conn.commit() + return True + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return False + print("[ERROR] conn is invalid param") + return False + + @classmethod + def fetch_all_data(cls: any, curs: any, sql: str, param: tuple = None, is_dict: bool = True) -> list: + """ + fetch 10000 num of data from db each time to get all data + """ + if not isinstance(curs, sqlite3.Cursor): + return [] + data = [] + try: + if param: + res = curs.execute(sql, param) + else: + res = curs.execute(sql) + except sqlite3.Error as err: + print(f"[ERROR] {err}") + curs.row_factory = None + return [] + try: + description = res.description + while True: + res = curs.fetchmany(cls.FETCH_SIZE) + if is_dict: + data += CustomizedDictFactory.generate_dict_from_db(res, description) + else: + data += res + if len(data) > cls.MAX_ROW_COUNT: + print("[WARRING] The records count in the table exceeds the limit!") + if len(res) < cls.FETCH_SIZE: + break + return data + except sqlite3.Error as err: + print(f"[ERROR] {err}") + return [] + finally: + curs.row_factory = None + + +class CustomizedDictFactory: + @staticmethod + def generate_dict_from_db(data_result: any, description: any) -> any: + description_set = [i[0] for i in description] + res = [] + for data in data_result: + data_dict = dict(zip(description_set, data)) + res.append(data_dict) + return res diff --git a/profiler/cluster_analyse/common_func/empty_class.py b/profiler/cluster_analyse/common_func/empty_class.py new file mode 100644 index 0000000000..df100d156f --- /dev/null +++ b/profiler/cluster_analyse/common_func/empty_class.py @@ -0,0 +1,20 @@ +class EmptyClass: + + def __init__(self: any, info: str = "") -> None: + self._info = info + + @classmethod + def __bool__(cls: any) -> bool: + return False + + @classmethod + def __str__(cls: any) -> str: + return "" + + @property + def info(self: any) -> str: + return self._info + + @staticmethod + def is_empty() -> bool: + return True diff --git a/profiler/cluster_analyse/common_func/file_manager.py b/profiler/cluster_analyse/common_func/file_manager.py index 3853c806f9..28ecbeaaf1 100644 --- a/profiler/cluster_analyse/common_func/file_manager.py +++ b/profiler/cluster_analyse/common_func/file_manager.py @@ -115,3 +115,13 @@ class FileManager: file_size = os.path.getsize(file_path) if file_size > limit_size: raise RuntimeError(f"The file({base_name}) size exceeds the preset max value.") + + +def check_db_path_valid(path: str, is_create: bool = False, max_size: int = Constant.MAX_READ_DB_FILE_BYTES) -> bool: + if os.path.islink(path): + print(f'[ERROR] The db file path: {path} is link. Please check the path') + return False + if not is_create and os.path.exists(path) and os.path.getsize(path) > max_size: + print(f'[ERROR] The db file: {path} is too large to read. 
Please check the file') + return False + return True diff --git a/profiler/cluster_analyse/common_func/table_constant.py b/profiler/cluster_analyse/common_func/table_constant.py new file mode 100644 index 0000000000..de6d47e97e --- /dev/null +++ b/profiler/cluster_analyse/common_func/table_constant.py @@ -0,0 +1,27 @@ +class TableConstant: + + RANK_SET = "rank_set" + STEP = "step" + RANK_ID = "rank_id" + TYPE = "type" + HCCL_OP_NAME = "hccl_op_name" + GROUP_NAME = "group_name" + START_TIMESTAMP = "start_timestamp" + ELAPSED_TIME = "elapse_time" + TRANSIT_TIME = "transit_time" + WAIT_TIME = "wait_time" + SYNCHRONIZATION_TIME = "synchronization_time" + IDLE_TIME = "idle_time" + SYNCHRONIZATION_TIME_RATIO = "synchronization_time_ratio" + WAIT_TIME_RATIO = "wait_time_ratio" + BAND_TYPE = "band_type" + TRANSIT_SIZE = "transit_size" + BANDWIDTH = "bandwidth" + LARGE_PACKET_RATIO = "large_packet_ratio" + PACKAGE_SIZE = "package_size" + COUNT = "count" + TOTAL_DURATION = "total_duration" + SRC_RANK = "src_rank" + DST_RANK = "dst_rank" + TRANSPORT_TYPE = "transport_type" + OPNAME = "op_name" diff --git a/profiler/cluster_analyse/common_func/tables_config.py b/profiler/cluster_analyse/common_func/tables_config.py new file mode 100644 index 0000000000..75ee41c058 --- /dev/null +++ b/profiler/cluster_analyse/common_func/tables_config.py @@ -0,0 +1,63 @@ +class TablesConfig: + DATA = { + "ClusterCommAnalyzerTimeMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("rank_id", "INTEGER, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("start_timestamp", "NUMERIC, null"), + ("elapsed_time", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("wait_time", "NUMERIC, null"), + ("synchronization_time", "NUMERIC, null"), + ("idle_time", "NUMERIC, null"), + ("synchronization_time_ratio", "NUMERIC, null"), + ("wait_time_ratio", "NUMERIC, null") + ], + "CommunicationGroupMap": [ + ("type", "TEXT, null"), + ("rank_set", "TEXT, null") + ], + "ClusterCommAnalyzerBandwidthMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("rank_id", "INTEGER, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("band_type", "TEXT, null"), + ("transit_size", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("bandwidth", "NUMERIC, null"), + ("large_packet_ratio", "NUMERIC, null"), + ("package_size", "NUMERIC, null"), + ("count", "NUMERIC, null"), + ("total_duration", "NUMERIC, null") + ], + "ClusterCommAnalyzerMatrixMap": [ + ("rank_set", "TEXT, null"), + ("step", "TEXT, null"), + ("hccl_op_name", "TEXT, null"), + ("group_name", "TEXT, null"), + ("src_rank", "TEXT, null"), + ("dst_rank", "TEXT, null"), + ("transit_size", "NUMERIC, null"), + ("transit_time", "NUMERIC, null"), + ("bandwidth", "NUMERIC, null"), + ("transport_type", "TEXT, null"), + ("op_name", "TEXT, null") + ], + "ClusterStepTraceTimeMap": [ + ("step", "TEXT, null"), + ("type", "TEXT, null"), + ("index", "TEXT, null"), + ("computing", "NUMERIC, null"), + ("communication_not_overlapped", "NUMERIC, null"), + ("overlapped", "NUMERIC, null"), + ("communication", "NUMERIC, null"), + ("free", "NUMERIC, null"), + ("stage", "NUMERIC, null"), + ("bubble", "NUMERIC, null"), + ("communication_not_overlapped_and_exclude_receive", "NUMERIC, null") + ] + } diff --git a/profiler/cluster_analyse/communication_group/base_communication_group.py b/profiler/cluster_analyse/communication_group/base_communication_group.py new file mode 100644 index 0000000000..515c77c93a --- /dev/null +++ 
b/profiler/cluster_analyse/communication_group/base_communication_group.py @@ -0,0 +1,138 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from abc import abstractmethod +from collections import defaultdict +from copy import deepcopy +from multiprocessing import Pool + +from common_func.constant import Constant + + +class BaseCommunicationGroup: + def __init__(self, params: dict): + self.collection_path = params.get(Constant.COLLECTION_PATH) + self.data_map = params.get(Constant.DATA_MAP) + self.data_type = params.get(Constant.DATA_TYPE) + self.analysis_mode = params.get(Constant.ANALYSIS_MODE) + self.rank_comm_dir_dict = {} + self.p2p_link = [] + self.collective_group_dict = defaultdict(set) + self.p2p_comm_group = [] + self.communication_group = {} + + def load_communication_data(self): + comm_op_dirs = [] + for rank_id, profiling_dir_path in self.data_map.items(): + if self.data_type == Constant.TEXT: + comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_JSON) + matrix_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_MATRIX_JSON) + else: + comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.DB_COMMUNICATION_ANALYZER) + matrix_dir = comm_dir + if comm_dir and matrix_dir: + comm_op_dirs.append((rank_id, comm_dir, matrix_dir)) + else: + print( + f"[WARNING] Rank {rank_id} does not have a valid communication.json or communication_matrix.json.") + with Pool() as p: + self.rank_comm_dir_dict = p.map(self.read_communication_func, comm_op_dirs) + + def set_p2p_groups(self): + self.p2p_link = sorted(self.p2p_link, key=lambda x: min(x)) + while self.p2p_link: + union_set = deepcopy(self.p2p_link[0]) + rm_list = [self.p2p_link[0]] + for idx, link_rank_set_x in enumerate(self.p2p_link[1:]): + if UnionFind.is_connected(link_rank_set_x, union_set): + union_set = union_set.union(link_rank_set_x) + rm_list.append(link_rank_set_x) + self.p2p_comm_group.append(union_set) + self.p2p_link = [element for element in self.p2p_link if element not in rm_list] + + def generate_collective_communication_group(self): + self.communication_group[Constant.COLLECTIVE] = \ + [list(group) for group_name, group in self.collective_group_dict.items()] + + def generate_p2p_communication_group(self): + stage_group = {} + for group_name, rank_set in self.collective_group_dict.items(): + if not self.whether_valid_comm_group(rank_set): + continue + unioned_set = set() + remove_key = [] + for first_rank, stage in stage_group.items(): + if UnionFind.is_connected(rank_set, stage): + unioned_set = UnionFind.union(rank_set, stage, unioned_set) + remove_key.append(first_rank) + if unioned_set: + for key in remove_key: + del stage_group[key] + stage_group[min(unioned_set)] = unioned_set + else: + stage_group[min(rank_set)] = rank_set + first_rank_sort_list = sorted([first_rank for first_rank in stage_group]) + self.communication_group[Constant.P2P] = \ + 
[list(stage_group.get(first_rank, {})) for first_rank in first_rank_sort_list] + + def whether_valid_comm_group(self, rank_set: set): + """ + while distinguish which communication group should be used to infer stage info, these group should be ignored: + 1. group can not include more than 1 rank in every single p2p group + """ + for p2p_rank_set in self.p2p_comm_group: + if len(rank_set.intersection(p2p_rank_set)) > 1: + return False + return True + + @abstractmethod + def read_communication_func(self, params: tuple): + pass + + @abstractmethod + def analyze_communication_data(self): + pass + + @abstractmethod + def dump_data(self): + pass + + def generate(self): + self.load_communication_data() + self.analyze_communication_data() + self.set_p2p_groups() + self.generate_collective_communication_group() + self.generate_p2p_communication_group() + return self.dump_data() + + +class UnionFind(object): + """Disjoint Set Union""" + + @classmethod + def union(cls, first_set: set, second_set: set, third_set: set): + """make p and q the same set""" + return first_set | second_set | third_set + + @classmethod + def is_connected(cls, first_set: set, second_set: set): + """ + check whether set p and set q are connected + """ + if first_set & second_set: + return True + else: + return False diff --git a/profiler/cluster_analyse/communication_group/communication_db_group.py b/profiler/cluster_analyse/communication_group/communication_db_group.py new file mode 100644 index 0000000000..0122a63f16 --- /dev/null +++ b/profiler/cluster_analyse/communication_group/communication_db_group.py @@ -0,0 +1,25 @@ +import os + +from common_func.db_manager import DBManager +from common_func.constant import Constant +from common_func.table_constant import TableConstant +from communication_group.base_communication_group import BaseCommunicationGroup + + +class CommunicationDBGroup(BaseCommunicationGroup): + COMMUNICATION_GROUP_TABLE = "CommunicationGroup" + + def __init__(self, params: dict): + super().__init__(params) + self.communication_bandwidth_info = [] + self.communication_time_info = [] + self.matrix_info = [] + + def read_communication_func(self, params: tuple): + pass + + def dump_data(self): + pass + + def analyze_communication_data(self): + pass diff --git a/profiler/cluster_analyse/communication_group/communication_group_generator.py b/profiler/cluster_analyse/communication_group/communication_group_generator.py index 4963bf9539..3dca90454b 100644 --- a/profiler/cluster_analyse/communication_group/communication_group_generator.py +++ b/profiler/cluster_analyse/communication_group/communication_group_generator.py @@ -13,211 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
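set_p2p_groups above turns pairwise p2p links into connected groups by repeatedly unioning overlapping rank sets (via UnionFind.is_connected and UnionFind.union). The same idea as a standalone sketch on toy rank sets, simplified from the class method:

```
from copy import deepcopy


def merge_p2p_links(p2p_links):
    """Merge overlapping rank sets, e.g. [{0,1}, {1,2}, {4,5}] -> [{0,1,2}, {4,5}]."""
    links = sorted(p2p_links, key=lambda s: min(s))
    groups = []
    while links:
        union_set = deepcopy(links[0])
        merged = [links[0]]
        for rank_set in links[1:]:
            if rank_set & union_set:   # UnionFind.is_connected
                union_set |= rank_set  # UnionFind.union
                merged.append(rank_set)
        groups.append(union_set)
        links = [s for s in links if s not in merged]
    return groups


print(merge_p2p_links([{0, 1}, {1, 2}, {4, 5}, {2, 3}]))
# [{0, 1, 2, 3}, {4, 5}]
```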
-import os -from copy import deepcopy -from multiprocessing import Pool -from collections import defaultdict from common_func.constant import Constant -from common_func.file_manager import FileManager +from communication_group.communication_db_group import CommunicationDBGroup +from communication_group.communication_json_group import CommunicationJsonGroup class CommunicationGroupGenerator: - COMMUNICATION_GROUP_JSON = "communication_group.json" + + GROUP_MAP = { + Constant.DB: CommunicationDBGroup, + Constant.TEXT: CommunicationJsonGroup + } def __init__(self, params: dict): - self.collection_path = params.get(Constant.COLLECTION_PATH) - self.data_map = params.get(Constant.DATA_MAP) - self.analysis_mode = params.get(Constant.ANALYSIS_MODE) - self.communication_group = {} - self.collective_group_dict = defaultdict(set) - self.p2p_group_dict = defaultdict(list) - self.rank_comm_dir_dict = {} - self.communication_ops = [] - self.p2p_comm_group = [] - self.p2p_link = [] - self.matrix_ops = [] + self.processor = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))(params) def generate(self): - self.load_communication_json() - self.analyze_communication_ops() - self.set_p2p_groups() - self.generate_collective_communication_group() - self.generate_p2p_communication_group() - FileManager.create_json_file(self.collection_path, self.communication_group, self.COMMUNICATION_GROUP_JSON) - comm_data_dict = { - Constant.COLLECTIVE_GROUP: self.collective_group_dict, - Constant.COMMUNICATION_OPS: self.communication_ops, - Constant.MATRIX_OPS: self.matrix_ops, - Constant.COMMUNICATION_GROUP: self.communication_group - } - return comm_data_dict - - def analyze_communication_ops(self): - for rank_id, rank_id_comm_dict, rank_id_matrix_dict in self.rank_comm_dir_dict: - for step_id, step_id_dict in rank_id_comm_dict.items(): - if not isinstance(step_id_dict, dict): - print(f"[WARNING] rank{rank_id}'s communication.json has a wrong data struct.") - continue - self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) - for comm_op_type, comm_op_dict in step_id_dict.items(): - self.add_communication_ops(rank_id, step_id, comm_op_type, comm_op_dict) - - for step_id, step_id_dict in rank_id_matrix_dict.items(): - if not isinstance(step_id_dict, dict): - print(f"[WARNING] rank{rank_id}'s communication_matrix.json has a wrong data struct.") - continue - self.set_p2p_link(rank_id, step_id, rank_id_matrix_dict) - self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) - - def read_comm_json_func(self: any, params: tuple): - if len(params) < 3: - return -1, {}, {} - rank_id = params[0] - comm_json_path = params[1] - matrix_json_path = params[2] - comm_data = {} - matrix_data = {} - if os.path.exists(comm_json_path) and self.analysis_mode in ['all', 'communication_time']: - comm_data = FileManager.read_json_file(comm_json_path) - if os.path.exists(matrix_json_path) and self.analysis_mode in ['all', 'communication_matrix']: - matrix_data = FileManager.read_json_file(matrix_json_path) - return rank_id, comm_data, matrix_data - - def load_communication_json(self): - comm_op_dirs = [] - for rank_id, profiling_dir_path in self.data_map.items(): - comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_JSON) - matrix_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.COMM_MATRIX_JSON) - if comm_dir and matrix_dir: - comm_op_dirs.append((rank_id, comm_dir, matrix_dir)) - else: - print(f"[WARNING] Rank {rank_id} does not have a valid 
communication.json or communication_matrix.json.") - with Pool() as p: - self.rank_comm_dir_dict = p.map(self.read_comm_json_func, comm_op_dirs) - - def generate_collective_communication_group(self): - self.communication_group[Constant.COLLECTIVE] = \ - [list(group) for group_name, group in self.collective_group_dict.items()] - - def whether_valid_comm_group(self, rank_set: set): - """ - while distinguish which communication group should be used to infer stage info, these group should be ignored: - 1. group can not include more than 1 rank in every single p2p group - """ - for p2p_rank_set in self.p2p_comm_group: - if len(rank_set.intersection(p2p_rank_set)) > 1: - return False - return True - - def generate_p2p_communication_group(self): - stage_group = {} - for group_name, rank_set in self.collective_group_dict.items(): - if not self.whether_valid_comm_group(rank_set): - continue - unioned_set = set() - remove_key = [] - for first_rank, stage in stage_group.items(): - if UnionFind.is_connected(rank_set, stage): - unioned_set = UnionFind.union(rank_set, stage, unioned_set) - remove_key.append(first_rank) - if unioned_set: - for key in remove_key: - del stage_group[key] - stage_group[min(unioned_set)] = unioned_set - else: - stage_group[min(rank_set)] = rank_set - first_rank_sort_list = sorted([first_rank for first_rank in stage_group]) - self.communication_group[Constant.P2P] = \ - [list(stage_group.get(first_rank, {})) for first_rank in first_rank_sort_list] - - def set_p2p_groups(self): - self.p2p_link = sorted(self.p2p_link, key=lambda x: min(x)) - while self.p2p_link: - union_set = deepcopy(self.p2p_link[0]) - rm_list = [self.p2p_link[0]] - for idx, link_rank_set_x in enumerate(self.p2p_link[1:]): - if UnionFind.is_connected(link_rank_set_x, union_set): - union_set = union_set.union(link_rank_set_x) - rm_list.append(link_rank_set_x) - self.p2p_comm_group.append(union_set) - self.p2p_link = [element for element in self.p2p_link if element not in rm_list] - - def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict): - ops = rank_id_matrix_dict.get(step_id, {}) - self.add_matrix_ops(rank_id, step_id, ops) - if not ops: - print(f"[WARNING] rank{rank_id} {step_id} do not have communication matrix ops data.") - return - p2p_ops = ops.get(Constant.P2P, {}) - for op_name, link_dict in p2p_ops.items(): - self.append_p2p_link(op_name, link_dict) - - def append_p2p_link(self, op_name, link_dict): - for link in link_dict: - if '-' not in link: - print(f"[WARNING] {op_name} has an invalid link key {link}!") - break - src_rank = int(link.split('-')[0]) - dst_rank = int(link.split('-')[1]) - if src_rank != dst_rank: - rank_set = set([src_rank, dst_rank]) - if rank_set in self.p2p_link: - continue - self.p2p_link.append(rank_set) - - def get_collective_ops_name(self, rank_id: int, comm_op_dict: dict): - for comm_op in comm_op_dict: - if comm_op.startswith('Total'): - continue - group_name = comm_op.split('@')[-1] - self.collective_group_dict[group_name].add(rank_id) - - def add_communication_ops(self, rank_id: str, step_id: str, comm_op_type: str, comm_op_dict: dict): - for comm_op in comm_op_dict: - if comm_op.startswith('Total'): - continue - group_name = comm_op.split('@')[-1] - self.communication_ops.append({ - Constant.RANK_ID: rank_id, - Constant.STEP_ID: step_id, - Constant.COMM_OP_TYPE: comm_op_type, - Constant.COMM_OP_NAME: comm_op, - Constant.GROUP_NAME: group_name, - Constant.COMM_OP_INFO: comm_op_dict.get(comm_op) - }) - - def add_matrix_ops(self, rank_id: int, 
step_id: str, step_id_dict: dict): - for comm_op_type, comm_dict in step_id_dict.items(): - if comm_op_type != Constant.COLLECTIVE and comm_op_type != Constant.P2P: - print(f"[WARNING] Unknown communication operators type!") - continue - for op_name, op_link_info in comm_dict.items(): - if op_name.startswith('Total'): - continue - group_name = op_name.split('@')[-1] - self.matrix_ops.append({ - Constant.RANK_ID: rank_id, - Constant.STEP_ID: step_id, - Constant.COMM_OP_TYPE: comm_op_type, - Constant.COMM_OP_NAME: op_name, - Constant.GROUP_NAME: group_name, - Constant.COMM_OP_INFO: op_link_info - }) - - -class UnionFind(object): - """Disjoint Set Union""" - @classmethod - def union(cls, p: set, q: set, o: set): - """make p and q the same set""" - return p | q | o - - @classmethod - def is_connected(cls, p: set, q: set): - """ - check whether set p and set q are connected - """ - if p & q: - return True - else: - return False + return self.processor.generate() diff --git a/profiler/cluster_analyse/communication_group/communication_json_group.py b/profiler/cluster_analyse/communication_group/communication_json_group.py new file mode 100644 index 0000000000..da6e6c1fe4 --- /dev/null +++ b/profiler/cluster_analyse/communication_group/communication_json_group.py @@ -0,0 +1,132 @@ +# Copyright (c) 2023, Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
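With this change `CommunicationGroupGenerator` becomes a thin dispatcher: `GROUP_MAP` maps the input data type to a processor class and `generate()` simply delegates to whichever processor was instantiated. The sketch below reproduces the pattern with stand-in classes and literal "db"/"text" keys, since the real `Constant.DB`/`Constant.TEXT` values and the `CommunicationDBGroup`/`CommunicationJsonGroup` implementations live in the modules shown elsewhere in this patch:

```python
# Dispatch-by-data-type sketch; DbGroup/TextGroup and the "db"/"text" keys
# stand in for CommunicationDBGroup/CommunicationJsonGroup and
# Constant.DB/Constant.TEXT.
class DbGroup:
    def __init__(self, params: dict):
        self._params = params

    def generate(self):
        return {"source": "db", "ranks": sorted(self._params.get("ranks", []))}


class TextGroup:
    def __init__(self, params: dict):
        self._params = params

    def generate(self):
        return {"source": "text", "ranks": sorted(self._params.get("ranks", []))}


class GroupGenerator:
    GROUP_MAP = {"db": DbGroup, "text": TextGroup}

    def __init__(self, params: dict):
        processor_cls = self.GROUP_MAP.get(params.get("data_type"))
        if processor_cls is None:
            raise ValueError(f"unsupported data type: {params.get('data_type')}")
        self.processor = processor_cls(params)

    def generate(self):
        return self.processor.generate()


if __name__ == "__main__":
    print(GroupGenerator({"data_type": "text", "ranks": [1, 0]}).generate())
    # {'source': 'text', 'ranks': [0, 1]}
```

Note that the patched code above calls `self.GROUP_MAP.get(...)(params)` directly, so an unrecognized data type would surface as a `TypeError` on `None(...)`; the explicit check in the sketch only makes that failure mode readable.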
+ +import os + +from common_func.constant import Constant +from common_func.file_manager import FileManager +from communication_group.base_communication_group import BaseCommunicationGroup + + +class CommunicationJsonGroup(BaseCommunicationGroup): + COMMUNICATION_GROUP_JSON = "communication_group.json" + + def __init__(self, params: dict): + super().__init__(params) + self.communication_ops = [] + self.matrix_ops = [] + + def dump_data(self): + FileManager.create_json_file(self.collection_path, self.communication_group, self.COMMUNICATION_GROUP_JSON) + comm_data_dict = { + Constant.COLLECTIVE_GROUP: self.collective_group_dict, + Constant.COMMUNICATION_OPS: self.communication_ops, + Constant.MATRIX_OPS: self.matrix_ops, + Constant.COMMUNICATION_GROUP: self.communication_group + } + return comm_data_dict + + def analyze_communication_data(self): + for rank_id, rank_id_comm_dict, rank_id_matrix_dict in self.rank_comm_dir_dict: + for step_id, step_id_dict in rank_id_comm_dict.items(): + if not isinstance(step_id_dict, dict): + print(f"[WARNING] rank{rank_id}'s communication.json has a wrong data struct.") + continue + self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) + for comm_op_type, comm_op_dict in step_id_dict.items(): + self.add_communication_ops(rank_id, step_id, comm_op_type, comm_op_dict) + + for step_id, step_id_dict in rank_id_matrix_dict.items(): + if not isinstance(step_id_dict, dict): + print(f"[WARNING] rank{rank_id}'s communication_matrix.json has a wrong data struct.") + continue + self.set_p2p_link(rank_id, step_id, rank_id_matrix_dict) + self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) + + def read_communication_func(self: any, params: tuple): + if len(params) < 3: + return -1, {}, {} + rank_id = params[0] + comm_json_path = params[1] + matrix_json_path = params[2] + comm_data = {} + matrix_data = {} + if os.path.exists(comm_json_path) and self.analysis_mode in ["all", "communication_time"]: + comm_data = FileManager.read_json_file(comm_json_path) + if os.path.exists(matrix_json_path) and self.analysis_mode in ["all", "communication_matrix"]: + matrix_data = FileManager.read_json_file(matrix_json_path) + return rank_id, comm_data, matrix_data + + def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict): + ops = rank_id_matrix_dict.get(step_id, {}) + self.add_matrix_ops(rank_id, step_id, ops) + if not ops: + print(f"[WARNING] rank{rank_id} {step_id} do not have communication matrix ops data.") + return + p2p_ops = ops.get(Constant.P2P, {}) + for op_name, link_dict in p2p_ops.items(): + self.append_p2p_link(op_name, link_dict) + + def append_p2p_link(self, op_name, link_dict): + for link in link_dict: + if '-' not in link: + print(f"[WARNING] {op_name} has an invalid link key {link}!") + break + src_rank = int(link.split('-')[0]) + dst_rank = int(link.split('-')[1]) + if src_rank != dst_rank: + rank_set = set([src_rank, dst_rank]) + if rank_set in self.p2p_link: + continue + self.p2p_link.append(rank_set) + + def get_collective_ops_name(self, rank_id: int, comm_op_dict: dict): + for comm_op in comm_op_dict: + if comm_op.startswith('Total'): + continue + group_name = comm_op.split('@')[-1] + self.collective_group_dict[group_name].add(rank_id) + + def add_communication_ops(self, rank_id: str, step_id: str, comm_op_type: str, comm_op_dict: dict): + for comm_op in comm_op_dict: + if comm_op.startswith('Total'): + continue + group_name = comm_op.split('@')[-1] + self.communication_ops.append({ + 
Constant.RANK_ID: rank_id, + Constant.STEP_ID: step_id, + Constant.COMM_OP_TYPE: comm_op_type, + Constant.COMM_OP_NAME: comm_op, + Constant.GROUP_NAME: group_name, + Constant.COMM_OP_INFO: comm_op_dict.get(comm_op) + }) + + def add_matrix_ops(self, rank_id: int, step_id: str, step_id_dict: dict): + for comm_op_type, comm_dict in step_id_dict.items(): + if comm_op_type != Constant.COLLECTIVE and comm_op_type != Constant.P2P: + print(f"[WARNING] Unknown communication operators type!") + continue + for op_name, op_link_info in comm_dict.items(): + if op_name.startswith('Total'): + continue + group_name = op_name.split('@')[-1] + self.matrix_ops.append({ + Constant.RANK_ID: rank_id, + Constant.STEP_ID: step_id, + Constant.COMM_OP_TYPE: comm_op_type, + Constant.COMM_OP_NAME: op_name, + Constant.GROUP_NAME: group_name, + Constant.COMM_OP_INFO: op_link_info + }) -- Gitee From acfba6dffdf5c0d57f7ad1335dc921ca90fbc2b0 Mon Sep 17 00:00:00 2001 From: user_10012209 <734267852@qq.com> Date: Thu, 7 Mar 2024 19:47:56 +0800 Subject: [PATCH 34/62] [att_profiler] --- profiler/compare_tools/README.md | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/profiler/compare_tools/README.md b/profiler/compare_tools/README.md index b36ac58097..80b7930157 100644 --- a/profiler/compare_tools/README.md +++ b/profiler/compare_tools/README.md @@ -97,7 +97,7 @@ python performance_compare.py [基准性能数据文件] [比对性能数据文 - 比对性能数据文件(必选):可以指定以“ascend_pt”结尾的目录、ASCEND_PROFILER_OUTPUT目录或trace_view.json文件,指定trace_view.json无法显示算子的内存占用。 - --output_path(可选):性能比对结果存放的路径,默认保存在当前目录。 -工具将总体性能拆解为训练耗时和内存占用,其中训练耗时可拆分为算子、通信、调度三个维度,以打屏的形式输出总体指标,帮助用户定界劣化的方向。与此同时,工具还会生成performance_comparison_result_*.xlsl,展示每个算子在执行耗时、通信耗时、内存占用的优劣,可通过DIFF列大于0筛选出劣化算子。详细介绍请参见“**比对结果说明**”。 +工具将总体性能拆解为训练耗时和内存占用,其中训练耗时可拆分为算子(包括算子和nn.Module)、通信、调度三个维度,以打屏的形式输出总体指标,帮助用户定界劣化的方向。与此同时,工具还会生成performance_comparison_result_*.xlsl,展示每个算子在执行耗时、通信耗时、内存占用的优劣,可通过DIFF列大于0筛选出劣化算子。详细介绍请参见“**比对结果说明**”。 #### 通用参数说明 @@ -120,10 +120,10 @@ python performance_compare.py [基准性能数据文件] [比对性能数据文 | 参数名 | 说明 | 是否必选 | | ----------------- | ------------------------------------------------------------ | -------- | -| --gpu_flow_cat | 配置GPU trace中cpu侧算子与device kernel的连线标识,当GPU的kernel均为空时设置。根据timeline的json文件在chrome://tracing上的Flow events的选项配置。使用示例:--gpu_flow_cat=async_gpu | 否 | +| --gpu_flow_cat | 配置GPU trace中CPU侧算子与device kernel的连线标识,当GPU的Device Duration(us)均为0时设置。使用chrome://tracing打开GPU的json,右上角Flow events找到连线标识,将标识配置进该参数。使用示例:--gpu_flow_cat=async_gpu | 否 | | --use_input_shape | 开启算子精准匹配,默认关闭。使用示例:--use_input_shape | 否 | -| --max_kernel_num | 设置CPU侧算子下发的最大kernel数量,当超过设定值时工具会自动往下找子算子,直至满足条件,默认仅比对最上层算子。使用示例:--max_kernel_num=10 | 否 | -| --op_name_map | 设置GPU与NPU等价的算子名称的映射关系,以字典形式存入。使用示例:--op_name_map='{"Optimizer.step#SGD.step":"Optimizer.step#NpuFusedSGD.step"}' | 否 | +| --max_kernel_num | 设置CPU侧算子下发的最大kernel数量,当超过设定值时工具会自动往下找子算子,直至满足条件。默认仅比对最上层算子,粒度较粗;若想要更细粒度的算子比对,可设置该参数,参数值不得小于4,参数值设置越小,比对粒度越细。使用示例:--max_kernel_num=10 | 否 | +| --op_name_map | 设置GPU与NPU等价的算子名称的映射关系,以字典形式存入。使用示例:--op_name_map={'Optimizer.step#SGD.step':'Optimizer.step#NpuFusedSGD.step'} | 否 | ## 比对结果说明 @@ -139,11 +139,12 @@ python performance_compare.py [基准性能数据文件] [比对性能数据文 | Flash Attention Time(Forward) | Flash Attention算子前向耗时。 | | Flash Attention Time(Backward) | Flash Attention算子反向耗时。 | | Computing Time | 计算流耗时,计算流所有event耗时总和。如果有多条并发计算,计算流耗时对重叠部分只会计算一次。 | -| Mem Usage | 内存使用。gpu上的内存使用可以使用nvidia-smi查看,npu上的内存使用可以使用npu-smi查看,Profiling信息采集时打开profile_memory=True开关,mem 
usage显示的是memory_record里面的最大resevered值,一般来说是进程级内存。 | +| Mem Usage | 内存使用。GPU上的内存使用可以使用nvidia-smi查看,NPU上的内存使用可以使用npu-smi查看,Profiling信息采集时打开profile_memory=True开关,mem usage显示的是memory_record里面的最大resevered值,一般来说是进程级内存。 | | Uncovered Communication Time(Wait Time) | 通信未掩盖耗时,包含Wait Time为同步时间。 | | SDMA Time(Num) | 拷贝类任务耗时,Num表示计算的次数。 | | Free Time | 调度耗时 = E2E耗时 - 算子耗时 - 通信不可掩盖耗时。Free的定义为Device侧既不在通信又不在计算的时间,因此包含拷贝时间(SDMA Time)。 | | E2E Time(Not minimal profiling) | E2E总耗时,计算流端到端耗时。当存在Not minimal profiling时,表示该时间存在性能膨胀,会影响通信和调度耗时。 | +| Other Time | AI CPU、DSA、TensorMove等其他算子耗时。 | 可以采取最简性能数据采集的方式来减少E2E耗时的性能膨胀,示例代码如下: @@ -160,15 +161,22 @@ with torch_npu.profiler.profile( activities配置仅采集NPU数据,不配置experimental_config参数以及其他可选开关。 +- 当Computing Time耗时增大,分析**算子性能**。 +- 当Uncovered Communication Time耗时增大,分析**通信性能**,若通信性能分析没有劣化的通信算子,代表通信与计算的并行度较差,继续进行NPU的集群性能分析。 +- 当Mem Usage增大,分析**算子内存**,若没有明显占用较大的算子,则代表算子内存申请量没有差异,问题在于内存的释放(持有时间过久),可以使用tensorboard或ascend insight继续进行NPU内存的分析。 + ### 算子性能 算子性能比对结果在performance_comparison_result_*.xlsl中OperatorCompare和OperatorCompareStatistic的sheet页呈现。 -- OperatorCompare(TOP):算子为粒度的统计呈现,按照算子在device上的总耗时与基准算子的差距值(Diff Duration(ms)列)进行逆序。 +- OperatorCompareStatistic:算子为粒度的统计呈现,按照算子在device上的总耗时与基准算子的差距值(Diff Duration(ms)列)进行逆序。 - OperatorCompare:算子比对的明细展示,可以查看每一个算子对应的kernel详情。 - Diff Ratio:比较算子在device上执行总耗时 / 基准算子在device上执行总耗时,红色代表劣化。 - Device Duration(us):该算子下发到device上执行的所有kernel耗时的总和。 +步骤1:查看OperatorCompareStatistic页,找出耗时差距TOP的算子。 +步骤2:查看OperatorCompare页,搜索耗时差距TOP的算子,查看具体执行的kernel耗时,寻找可优化点。 + ### nn.Module性能 nn.Module是所有神经网络模块的基类,使用PyTorch构建神经网络需要继承nn.Module类来实现,性能比对工具支持nn.Module模块级的比对,帮助优化模型结构。 @@ -187,7 +195,7 @@ nn.Module性能比对结果在performance_comparison_result_*.xlsl中ModuleCompa - Device Total Time(ms):该模块调用的算子(包含子模块)在device侧执行的总耗时,单位ms。 - Device Total Time Diff:GPU与NPU的Device Total Time(ms)差值。 - Device Self Time Diff:GPU与NPU的Device Self Time(ms)差值。 -- Self Time Ratio:GPU与NPU的Device Self Time(ms)比值。 +- Total Time Ratio:GPU与NPU的Device Total Time(ms)比值。 - Base Call Stack:基准文件模块的调用栈。 - Comparison Call Stack:比较文件模块的调用栈。 @@ -195,7 +203,7 @@ nn.Module性能比对结果在performance_comparison_result_*.xlsl中ModuleCompa 通信性能比对结果在performance_comparison_result_*.xlsl中CommunicationCompare的sheet页呈现。 -- 淡蓝色背景的记录行:通信算子的summary信息,包括通信算子名称、调用总次数、通信算子总耗时(单位:us)、通信算子平均耗时(单位:us)、通信算子最大耗时(单位:us)、通信算子最小耗时(单位:us)。 +- 第二行表头:通信算子的summary信息,包括通信算子名称、调用总次数、通信算子总耗时(单位:us)、通信算子平均耗时(单位:us)、通信算子最大耗时(单位:us)、通信算子最小耗时(单位:us)。 - 无背景色的记录行:通信算子的detail信息,仅支持NPU,包含了该通信算子下的所有Task信息,包括Task名称、Task调用次数、Task总耗时(单位:us)、Task平均耗时(单位:us)、Task最大耗时(单位:us)、Task最小耗时(单位:us)。 - Diff Ratio: 比较通信算子的总耗时 / 基准通信算子的总耗时,红色代表劣化。 @@ -203,10 +211,13 @@ nn.Module性能比对结果在performance_comparison_result_*.xlsl中ModuleCompa 算子内存比对结果在performance_comparison_result_*.xlsl中MemoryCompare和MemoryCompareStatistic的sheet页呈现。 -- MemoryCompare(TOP):算子为粒度的统计呈现,按照算子占用的总内存与基准算子的差距值(Diff Memory(MB))进行逆序。 +- MemoryCompareStatistic:算子为粒度的统计呈现,按照算子占用的总内存与基准算子的差距值(Diff Memory(MB))进行逆序。 - MemoryCompare:算子内存比对的明细展示,可以查看每一个算子申请内存的详情。 - Diff Ratio: 比较算子占用的总内存 / 基准算子占用的总内存,红色代表劣化。 - Size(KB):该算子占用的device内存大小,单位KB。 + +步骤1:查看MemoryCompareStatistic页,找出内存占用差距TOP的算子。 +步骤2:查看MemoryCompare页,搜索内存占用差距TOP的算子,查看具体占用的子算子。 -- Gitee From 28236864f7613ac803946c2c8ca894bc6a3eb0d6 Mon Sep 17 00:00:00 2001 From: w00800385 Date: Thu, 7 Mar 2024 20:59:34 +0800 Subject: [PATCH 35/62] =?UTF-8?q?att=E6=94=AF=E6=8C=81DB=E6=A0=BC=E5=BC=8F?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E8=BE=93=E5=85=A5=E4=B8=8E=E5=A4=84=E7=90=86?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E6=AD=A3codec=20heck=E5=91=8A=E8=AD=A6,?= 
=?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../communication_analysis_db.py | 140 +++++++++++++++++- .../comm_matrix_analysis_db.py | 115 +++++++++++++- .../communication_db_group.py | 87 ++++++++++- 3 files changed, 337 insertions(+), 5 deletions(-) diff --git a/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py b/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py index 0f7085ad9c..3559a9a28f 100644 --- a/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py +++ b/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py @@ -24,4 +24,142 @@ class CommunicationAnalysisDB: self.res_comm_bandwidth = [] def run(self): - pass \ No newline at end of file + if not self.communication_time_info and not self.communication_bandwidth_info: + return + self.split_and_add_rank_set(self.communication_time_info, self.comm_time_struct) + self.split_and_add_rank_set(self.communication_bandwidth_info, self.comm_bandwidth_struct) + self.compute_total_info() + self.dump_data() + + def dump_data(self): + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.COMMUNICATION_TIME_TABLE, self.COMMUNICATION_BANDWIDTH_TABLE) + res_time, res_bandwidth = [], [] + conn, cursor = DBManager.create_connect_db(result_db) + for data in self.res_comm_time: + res_time.append([data[TableConstant.RANK_SET], data[TableConstant.STEP], data[TableConstant.RANK_ID], + data[TableConstant.HCCL_OP_NAME], data[TableConstant.GROUP_NAME], + data[TableConstant.START_TIMESTAMP], data[TableConstant.ELAPSED_TIME], + data[TableConstant.TRANSIT_TIME], data[TableConstant.WAIT_TIME], + data[TableConstant.SYNCHRONIZATION_TIME], data[TableConstant.IDLE_TIME], + data[TableConstant.SYNCHRONIZATION_TIME_RATIO], data[TableConstant.WAIT_TIME_RATIO]]) + if res_time: + sql = "insert into {} values ({value})".format(self.COMMUNICATION_TIME_TABLE, + value="?," * (len(res_time[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res_time) + for data in self.res_comm_bandwidth: + res_bandwidth.append([data[TableConstant.RANK_SET], data[TableConstant.STEP], data[TableConstant.RANK_ID], + data[TableConstant.HCCL_OP_NAME], data[TableConstant.GROUP_NAME], + data[TableConstant.TRANSPORT_TYPE], data[TableConstant.TRANSIT_SIZE], + data[TableConstant.TRANSIT_TIME], data[TableConstant.BANDWIDTH], + data[TableConstant.LARGE_PACKET_RATIO], data[TableConstant.PACKAGE_SIZE], + data[TableConstant.COUNT], data[TableConstant.TOTAL_DURATION]]) + if res_bandwidth: + sql = "insert into {} values ({value})".format(self.COMMUNICATION_BANDWIDTH_TABLE, + value="?," * (len(res_bandwidth[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res_bandwidth) + DBManager.destroy_db_connect(conn, cursor) + + def split_and_add_rank_set(self, data_list, res_dict): + for data in data_list: + if data[TableConstant.TYPE] == Constant.P2P: + rank_tuple = Constant.P2P + else: + rank_tuple = tuple(self.collective_group_dict.get(data[TableConstant.GROUP_NAME])) + res_dict.setdefault(rank_tuple, {}).setdefault(data[TableConstant.STEP], []).append(data) + + def compute_total_info(self): + for rank_tuple, op_dict in self.comm_time_struct.items(): + if rank_tuple != Constant.P2P: + for step, data_list in op_dict.items(): + 
self.compute_rank_set_total_time_info(data_list, rank_tuple) + else: + rank_set = set() + for step, data_list in op_dict.items(): + rank_set.add(data[TableConstant.RANK_ID] for data in data_list) + for step, data_list in op_dict.items(): + self.compute_rank_set_total_time_info(data_list, rank_set, True) + for rank_tuple, op_dict in self.comm_bandwidth_struct.items(): + for step, data_list in op_dict.items(): + if rank_tuple != Constant.P2P: + self.compute_rank_set_total_bandwidth_info(data_list, rank_tuple) + else: + self.compute_rank_set_total_bandwidth_info(data_list, rank_tuple, True) + + def compute_rank_set_total_bandwidth_info(self, data_list, rank_tuple, is_p2p=False): + if not data_list: + return + data_dict = {} + rank_tuple = "(" + ",".join(str(i) for i in rank_tuple) + ")" if not is_p2p else Constant.P2P + for data in data_list: + data[TableConstant.RANK_SET] = rank_tuple + rank_band_type = self.RANK_BAND_TYPE.format(data[TableConstant.RANK_ID], + data[TableConstant.TRANSPORT_TYPE]) + data_dict.setdefault(rank_band_type, []).append(data) + self.res_comm_bandwidth.append(data) + for rank_band_type, bandwidth_list in data_dict.items(): + package_set = set() + for data in bandwidth_list: + package_set.add(data[TableConstant.PACKAGE_SIZE]) + for package in package_set: + total_comm_bandwidth_info = dict() + for data in bandwidth_list: + self.compute_bandwidth(total_comm_bandwidth_info, data, package) + bandwidth = BaseAnalysisJson.compute_ratio(total_comm_bandwidth_info.get(TableConstant.TRANSIT_SIZE), + total_comm_bandwidth_info.get(TableConstant.TRANSIT_TIME)) + total_comm_bandwidth_info[TableConstant.BANDWIDTH] = bandwidth + total_comm_bandwidth_info[TableConstant.PACKAGE_SIZE] = package + total_comm_bandwidth_info[TableConstant.HCCL_OP_NAME] = Constant.TOTAL_OP_INFO + total_comm_bandwidth_info[TableConstant.GROUP_NAME] = "" + total_comm_bandwidth_info[TableConstant.LARGE_PACKET_RATIO] = 0.0 + self.res_comm_bandwidth.append(total_comm_bandwidth_info) + + def compute_bandwidth(self, res_dict, data_dict, package): + for key in data_dict.keys(): + if key in [TableConstant.TRANSIT_TIME, TableConstant.TRANSIT_SIZE]: + if key not in res_dict.keys(): + res_dict[key] = 0.0 + res_dict[key] += data_dict[key] + elif key in [TableConstant.COUNT, TableConstant.TOTAL_DURATION]: + if data_dict[TableConstant.PACKAGE_SIZE] == package: + if key not in res_dict.keys(): + res_dict[key] = 0.0 + res_dict[key] += data_dict[key] + else: + res_dict[key] = data_dict[key] + + def compute_time(self, res_dict, data_dict, dict_key): + if dict_key.endswith(self.TIME_EXTENSION): + if dict_key not in res_dict.keys(): + res_dict[dict_key] = 0.0 + res_dict[dict_key] += data_dict[dict_key] + else: + res_dict[dict_key] = data_dict[dict_key] + + def compute_rank_set_total_time_info(self, data_list: list, rank_tuple: any, is_p2p: bool = False): + if not data_list: + return + rank_set = "(" + ",".join(str(i) for i in rank_tuple) + ")" if not is_p2p else Constant.P2P + for rank_id in rank_tuple: + total_comm_time_info = dict() + for data in data_list: + if data[TableConstant.RANK_ID] == rank_id: + data[TableConstant.RANK_SET] = rank_set + data[TableConstant.SYNCHRONIZATION_TIME_RATIO] = 0.0 + data[TableConstant.WAIT_TIME_RATIO] = 0.0 + for key, value in data.items(): + self.compute_time(total_comm_time_info, data, key) + syn_ratio = BaseAnalysisJson.compute_ratio(total_comm_time_info.get(TableConstant.SYNCHRONIZATION_TIME), + total_comm_time_info.get(TableConstant.SYNCHRONIZATION_TIME) + + 
total_comm_time_info.get(TableConstant.TRANSIT_TIME)) + wait_time_ratio = BaseAnalysisJson.compute_ratio(total_comm_time_info.get(TableConstant.WAIT_TIME), + total_comm_time_info.get(TableConstant.WAIT_TIME) + + total_comm_time_info.get(TableConstant.TRANSIT_TIME)) + total_comm_time_info[TableConstant.HCCL_OP_NAME] = Constant.TOTAL_OP_INFO + total_comm_time_info[TableConstant.GROUP_NAME] = "" + total_comm_time_info[TableConstant.START_TIMESTAMP] = 0.0 + total_comm_time_info[TableConstant.WAIT_TIME_RATIO] = wait_time_ratio + total_comm_time_info[TableConstant.SYNCHRONIZATION_TIME_RATIO] = syn_ratio + self.res_comm_time.append(total_comm_time_info) + self.res_comm_time.extend(data_list) diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py index ea02c990bf..df58fcecff 100644 --- a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py +++ b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py @@ -17,4 +17,117 @@ class CommMatrixAnalysisDB: self.res_comm_matrix = [] def run(self): - pass \ No newline at end of file + if not self.matrix_info: + return + self.set_rank_tuple() + self.combine_total_matrix_info() + self.dump_data() + + def dump_data(self): + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.COMMUNICATION_MATRIX_TABLE) + conn, cursor = DBManager.create_connect_db(result_db) + res = [] + for data in self.res_comm_matrix: + op_name = data.get(TableConstant.OPNAME) if data.get(TableConstant.OPNAME) is not None else "" + res.append([data[TableConstant.RANK_SET], data[TableConstant.STEP], data[TableConstant.HCCL_OP_NAME], + data[TableConstant.GROUP_NAME], data[TableConstant.SRC_RANK], data[TableConstant.DST_RANK], + data[TableConstant.TRANSIT_SIZE], data[TableConstant.TRANSIT_TIME], + data[TableConstant.BANDWIDTH], data[TableConstant.TRANSPORT_TYPE], op_name]) + if res: + sql = "insert into {} values ({value})".format(self.COMMUNICATION_MATRIX_TABLE, + value="?," * (len(res[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res) + DBManager.destroy_db_connect(conn, cursor) + + def combine_total_matrix_info(self): + for rank_tuple, group_dict in self.comm_matrix_struct.items(): + if rank_tuple != Constant.P2P: + rank_tuple = "(" + ",".join(str(i) for i in rank_tuple) + ")" + for step, step_dict in group_dict.items(): + self.merge_same_info(step_dict, rank_tuple) + self.combine_total_info(step_dict) + + def combine_total_info(self, step_dict: dict): + link_key_set = set() + for op_name, matrix_dict in step_dict.items(): + self.res_comm_matrix.extend(matrix_dict.values()) + if BaseAnalysisJson.check_add_op(op_name): + for key in matrix_dict.keys(): + link_key_set.add(key) + for link_key in link_key_set: + total_matrix_info = dict() + total_matrix_info[TableConstant.TRANSIT_SIZE] = 0.0 + total_matrix_info[TableConstant.TRANSIT_TIME] = 0.0 + for op_name, matrix_dict in step_dict.items(): + if link_key in matrix_dict.keys() and BaseAnalysisJson.check_add_op(op_name): + total_matrix_info[TableConstant.RANK_SET] = matrix_dict[link_key][TableConstant.RANK_SET] + self.combine_link_info(total_matrix_info, matrix_dict[link_key]) + bandwidth = BaseAnalysisJson.compute_ratio(total_matrix_info[TableConstant.TRANSIT_SIZE], + 
total_matrix_info[TableConstant.TRANSIT_TIME]) + total_matrix_info[TableConstant.HCCL_OP_NAME] = Constant.TOTAL_OP_INFO + total_matrix_info[TableConstant.GROUP_NAME] = "" + total_matrix_info[TableConstant.BANDWIDTH] = bandwidth + self.res_comm_matrix.append(total_matrix_info) + + def combine_link_info(self, link_info, data: dict): + for col in data.keys(): + if col in [TableConstant.TRANSIT_TIME, TableConstant.TRANSIT_SIZE]: + link_info[col] += data[col] + else: + link_info[col] = data[col] + + def merge_same_info(self, step_dict: dict, rank_tuple): + def process_matrix(): + for data in op_list: + if data[TableConstant.SRC_RANK] == data[TableConstant.DST_RANK]: + if data[TableConstant.SRC_RANK] not in local_global_rank_map: + local_global_rank_map[data[TableConstant.SRC_RANK]] = data[TableConstant.RANK_ID] + elif local_global_rank_map[data[TableConstant.SRC_RANK]] != data[TableConstant.RANK_ID]: + print(f"[WARNING] In the same communication group, local ranks projecting to global ranks " + f"repeat!") + if (link_key.split('-')[0] == data[TableConstant.SRC_RANK] and + link_key.split('-')[1] == data[TableConstant.DST_RANK]): + self.combine_link_info(matrix_info, data) + new_matrix_list[link_key] = matrix_info + + def convert_local_to_global_rank(): + res_dict = dict() + for key, new_matrix in new_matrix_list.items(): + src_rank = new_matrix[TableConstant.SRC_RANK] + dst_rank = new_matrix[TableConstant.DST_RANK] + src_rank = local_global_rank_map[src_rank] if src_rank in local_global_rank_map else src_rank + dst_rank = local_global_rank_map[dst_rank] if dst_rank in local_global_rank_map else dst_rank + bandwidth = BaseAnalysisJson.compute_ratio(new_matrix[TableConstant.TRANSIT_SIZE], + new_matrix[TableConstant.TRANSIT_TIME]) + key = f"{src_rank}-{dst_rank}" + new_matrix[TableConstant.SRC_RANK] = src_rank + new_matrix[TableConstant.DST_RANK] = dst_rank + new_matrix[TableConstant.BANDWIDTH] = bandwidth + res_dict[key] = new_matrix + return res_dict + + local_global_rank_map = dict() + for op_name, op_list in step_dict.items(): + new_matrix_list = {} + link_key_set = set() + for op_data in op_list: + link_key_set.add(op_data[TableConstant.SRC_RANK] + "-" + op_data[TableConstant.DST_RANK]) + for link_key in link_key_set: + matrix_info = dict() + matrix_info[TableConstant.RANK_SET] = rank_tuple + matrix_info[TableConstant.TRANSIT_SIZE] = 0.0 + matrix_info[TableConstant.TRANSIT_TIME] = 0.0 + process_matrix() + step_dict[op_name] = convert_local_to_global_rank() + + def set_rank_tuple(self): + for data in self.matrix_info: + op_name = data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME] + if data[TableConstant.STEP] == Constant.P2P: + rank_tuple = Constant.P2P + else: + rank_tuple = tuple(self.collective_group_dict.get(data[TableConstant.GROUP_NAME])) + self.comm_matrix_struct.setdefault(rank_tuple, {}).setdefault(data[TableConstant.STEP], {}). 
\ + setdefault(op_name, []).append(data) diff --git a/profiler/cluster_analyse/communication_group/communication_db_group.py b/profiler/cluster_analyse/communication_group/communication_db_group.py index 0122a63f16..e0cd7215e5 100644 --- a/profiler/cluster_analyse/communication_group/communication_db_group.py +++ b/profiler/cluster_analyse/communication_group/communication_db_group.py @@ -16,10 +16,91 @@ class CommunicationDBGroup(BaseCommunicationGroup): self.matrix_info = [] def read_communication_func(self, params: tuple): - pass + if len(params) < 3: + return -1, ({}, {}, {}) + rank_id = params[0] + db_path = params[1] + time_data = {} + bandwidth_data = {} + matrix_data = {} + if DBManager.check_tables_in_db(db_path, Constant.TABLE_COMM_ANALYZER_TIME, + Constant.TABLE_COMM_ANALYZER_BANDWIDTH, + Constant.TABLE_COMM_ANALYZER_MATRIX): + conn, cursor = DBManager.create_connect_db(db_path) + time_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_TIME) + bandwidth_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_BANDWIDTH) + matrix_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_MATRIX) + if self.analysis_mode in ["all", "communication_time"]: + time_data = DBManager.fetch_all_data(cursor, time_info_sql) + bandwidth_data = DBManager.fetch_all_data(cursor, bandwidth_info_sql) + if self.analysis_mode in ["all", "communication_matrix"]: + matrix_data = DBManager.fetch_all_data(cursor, matrix_info_sql) + DBManager.destroy_db_connect(conn, cursor) + return rank_id, (self.data_group_by_step(time_data), self.data_group_by_step(bandwidth_data), + self.data_group_by_step(matrix_data)) + + @staticmethod + def data_group_by_step(data: any) -> any: + res = {} + for item in data: + res.setdefault(item[TableConstant.STEP], []).append(item) + return res def dump_data(self): - pass + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.COMMUNICATION_GROUP_TABLE) + res = [] + conn, cursor = DBManager.create_connect_db(result_db) + for data_type, data_list in self.communication_group.items(): + for data in data_list: + rank_set = "(" + ",".join(str(i) for i in data) + ")" + data = [data_type, rank_set] + res.append(data) + if res: + sql = "insert into {} values ({value})".format(self.COMMUNICATION_GROUP_TABLE, + value="?," * (len(res[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res) + DBManager.destroy_db_connect(conn, cursor) + comm_data_dict = { + Constant.COLLECTIVE_GROUP: self.collective_group_dict, + Constant.COMMUNICATION_TIME_INFO: self.communication_time_info, + Constant.COMMUNICATION_BANDWIDTH_INFO: self.communication_bandwidth_info, + Constant.MATRIX_OPS: self.matrix_info, + Constant.COMMUNICATION_GROUP: self.communication_group + } + return comm_data_dict def analyze_communication_data(self): - pass + for rank_id, data_tuple in self.rank_comm_dir_dict: + time_data, bandwidth_data, matrix_data = data_tuple[0], data_tuple[1], data_tuple[2] + for step, data_list in time_data.items(): + for data in data_list: + self.compute_collective_group(data, rank_id, self.communication_time_info) + for data in bandwidth_data[step]: + self.compute_collective_group(data, rank_id, self.communication_bandwidth_info) + for step, data_list in matrix_data.items(): + self.add_p2p_and_rank(rank_id, step, matrix_data) + for data in data_list: + self.compute_collective_group(data, rank_id, self.matrix_info) 
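The `dump_data` implementations in this series all build their INSERT statements the same way: only the table name (a class constant) is formatted into the SQL text, while the row values are bound through `?` placeholders and written in a single `executemany` call. A self-contained sqlite3 sketch of that pattern follows; the table and column names are invented for illustration and are not the tool's real schema:

```python
import sqlite3

# Invented demo rows: (rank_set, step, rank_id, op_name, elapsed_time)
rows = [
    ("(0,1)", "step1", 0, "hcom_allReduce_1", 12.5),
    ("(0,1)", "step1", 1, "hcom_allReduce_1", 13.1),
]

conn = sqlite3.connect(":memory:")
cursor = conn.cursor()
cursor.execute(
    "CREATE TABLE CommTimeDemo (rank_set TEXT, step TEXT, rank_id INTEGER, "
    "op_name TEXT, elapsed_time REAL)"
)

# Same placeholder construction as dump_data: "?," repeated (n - 1) times plus "?".
placeholders = "?," * (len(rows[0]) - 1) + "?"
sql = "insert into {} values ({value})".format("CommTimeDemo", value=placeholders)
cursor.executemany(sql, rows)        # values are bound, never string-formatted
conn.commit()

print(cursor.execute("select count(*) from CommTimeDemo").fetchone())   # (2,)
conn.close()
```

Binding the values through placeholders avoids quoting and injection issues; only the table name, which never comes from input data, is spliced in via str.format.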
+ + def compute_collective_group(self, data, rank_id, res_list): + if data[TableConstant.TYPE] == Constant.COLLECTIVE: + self.collective_group_dict[data[TableConstant.GROUP_NAME]].add(rank_id) + data[TableConstant.RANK_ID] = rank_id + res_list.append(data) + + def add_p2p_and_rank(self, rank_id: int, step: str, data_dict: dict): + data_list = data_dict[step] + if not data_list: + print(f"[WARNING] rank {rank_id} {step} don't have communication matrix ops data") + return + for data in data_list: + if data[TableConstant.TYPE] != Constant.COLLECTIVE and data[TableConstant.TYPE] != Constant.P2P: + print(f"[WARNING] Unknown communication operators type!") + continue + if data[TableConstant.TYPE] == Constant.P2P: + if data[TableConstant.SRC_RANK] != data[TableConstant.DST_RANK]: + rank_set = {data[TableConstant.SRC_RANK], data[TableConstant.DST_RANK]} + if rank_set not in self.p2p_link: + self.p2p_link.append(rank_set) -- Gitee From 62f81ff291f9066cd3f1104dd776cb0e1ade0bad Mon Sep 17 00:00:00 2001 From: c30056266 Date: Fri, 8 Mar 2024 10:52:55 +0800 Subject: [PATCH 36/62] spelling_fix --- .../compare_backend/profiling_parser/npu_profiling_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py index 5759b0719a..1de55d693f 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py @@ -111,7 +111,7 @@ class NPUProfilingParser(BaseProfilingParser): def _picking_notify_wait_event_and_not_overlap_event(self): self.notify_event_cache = [] self._not_overlaped_commu_event = [] - for event in self._commu_task_list: + for event in self._comm_task_list: if event.name == 'Notify_Wait' and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_CHECK' \ and event.args.get('rdma_type', 0) != 'RDMA_PAYLOAD_ACK': self.notify_event_cache.append(event) -- Gitee From f685b1c14c223b57e213675a6020c5929aef82a4 Mon Sep 17 00:00:00 2001 From: hid86196561 Date: Mon, 11 Mar 2024 01:43:39 +0000 Subject: [PATCH 37/62] update profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py. 
max_emptylist_bugfix Signed-off-by: hid86196561 --- .../compare_backend/profiling_parser/npu_profiling_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py index 1de55d693f..a767d402c0 100644 --- a/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py +++ b/profiler/compare_tools/compare_backend/profiling_parser/npu_profiling_parser.py @@ -129,7 +129,7 @@ class NPUProfilingParser(BaseProfilingParser): notify_wait_event_dict[notify_event.tid] = [notify_event] total_time = 0 for commu_event in self._not_overlaped_commu_event: - wait_time_list = [] + wait_time_list = [0] commu_event_start_time = float(commu_event.start_time) commu_event_end_time = float(commu_event.start_time) + commu_event.dur -- Gitee From a8d4a8852fb2c9a872c9ac576d9a4c8ea0fa343e Mon Sep 17 00:00:00 2001 From: wangqihui01 Date: Mon, 11 Mar 2024 09:59:15 +0800 Subject: [PATCH 38/62] =?UTF-8?q?=E5=BC=82=E5=B8=B8=E6=80=A7=E8=83=BD?= =?UTF-8?q?=E7=AE=97=E5=AD=90=E6=A3=80=E6=B5=8B=EF=BC=8C=E7=A7=BB=E9=99=A4?= =?UTF-8?q?MIX=5FAIC=E5=92=8CMIX=5FAIV=E7=9A=84=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../compute_advice/npu_fused/op_perf.py | 4 +-- profiler/advisor/compute_perf_analysis.ipynb | 20 +++++------ .../compute_advice/test_npu_slow_advice.py | 35 +++++++++++++------ 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py b/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py index a194ce5e40..7bcbed5a75 100644 --- a/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py +++ b/profiler/advisor/advisor_backend/compute_advice/npu_fused/op_perf.py @@ -23,9 +23,9 @@ from common_func_advisor.constant import PerfColor class OpPerfFactory: @classmethod def build(cls, op_row: Dict): - if op_row.get(Constant.TITLE.TASK_TYPE) in (CoreType.AIV, CoreType.MIX_AIV): + if op_row.get(Constant.TITLE.TASK_TYPE) == CoreType.AIV: return VecOpPerf(op_row) - elif op_row.get(Constant.TITLE.TASK_TYPE) in (CoreType.AIC, CoreType.MIX_AIC): + elif op_row.get(Constant.TITLE.TASK_TYPE) == CoreType.AIC: return CubeOpPerf(op_row) else: return OpPerf(op_row) diff --git a/profiler/advisor/compute_perf_analysis.ipynb b/profiler/advisor/compute_perf_analysis.ipynb index 86a1fb5b7e..e7a663130c 100644 --- a/profiler/advisor/compute_perf_analysis.ipynb +++ b/profiler/advisor/compute_perf_analysis.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2024-02-21T09:19:13.937531900Z", @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2024-02-22T08:41:17.455567500Z", @@ -136,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -274,14 +274,14 @@ " 0.1\n", " 0.1\n", " 0.7\n", - " 1.77\n", - " 29508.0\n", + " 0.00\n", " 0.0\n", " 0.0\n", - " 0.0062\n", + " 0.0\n", + " 0.0000\n", + " 0.0\n", " 0.0\n", " 0.0\n", - " 5856.0\n", " 0.046921\n", " 14.592698\n", " RED\n", @@ -293,7 +293,7 @@ "text/plain": [ " Step Id Model ID Task ID Stream ID Name Type Accelerator Core Start Time(us) Duration(us) Wait Time(us) Block Dim Mix Block Dim Input 
Shapes Input Data Types Input Formats Output Shapes Output Data Types Output Formats Context ID aicore_time(us) aic_total_cycles aic_mac_ratio aic_mac_int8_ratio aic_cube_fops aic_vector_fops aiv_time(us) aiv_total_cycles aiv_vec_fp32_ratio aiv_vec_fp16_ratio aiv_vec_int32_ratio aiv_vec_misc_ratio aiv_cube_fops aiv_vector_fops size(MB) throughput(GB/s) color\n", "0 1 4294967295 1265 16 Slice1 Slice AI_VECTOR_CORE 1699529623106750 21.20 261.56 9 0 4,1025 INT64 FORMAT_ND 4,1025 INT32 FORMAT_ND NaN 0.0 0.0 0.0 0.0 0.0 0.0 1.77 29508.0 0.0 0.0 0.0062 0.0 0.0 5856.0 0.046921 2.161371 RED\n", - "4 1 4294967295 1265 16 Add1 Add AI_CORE 1699529623106754 3.14 261.56 9 0 4,1025 INT64 FORMAT_ND 4,1025 INT32 FORMAT_ND NaN 2.3 28888.0 0.2 0.1 0.1 0.7 1.77 29508.0 0.0 0.0 0.0062 0.0 0.0 5856.0 0.046921 14.592698 RED" + "4 1 4294967295 1265 16 Add1 Add AI_CORE 1699529623106754 3.14 261.56 9 0 4,1025 INT64 FORMAT_ND 4,1025 INT32 FORMAT_ND NaN 2.3 28888.0 0.2 0.1 0.1 0.7 0.00 0.0 0.0 0.0 0.0000 0.0 0.0 0.0 0.046921 14.592698 RED" ] }, "metadata": {}, @@ -312,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -321,7 +321,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "metadata": {}, "outputs": [ { diff --git a/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py b/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py index 41da4b3c27..8830d49599 100644 --- a/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py +++ b/profiler/test/ut/advisor/advisor_backend/compute_advice/test_npu_slow_advice.py @@ -7,7 +7,6 @@ import unittest from advisor_backend.interface import Interface from advisor_backend.compute_advice.npu_slow_advice import NpuSlowAdvice -from advisor_backend.common_func_advisor.trace_view_json import TraceViewJson class TestNpuSlowAdvice(unittest.TestCase): @@ -109,17 +108,17 @@ class TestNpuSlowAdvice(unittest.TestCase): 'aiv_time(us)', 'aiv_total_cycles', 'aiv_vec_fp32_ratio', 'aiv_vec_fp16_ratio', 'aiv_vec_int32_ratio', 'aiv_vec_misc_ratio', 'aiv_cube_fops', 'aiv_vector_fops'] - # red: size=0.0492 MB, throughput=2.32 GB/s, task_duration=21.2us + # RED: size=0.0492 MB, throughput=2.32 GB/s, task_duration=21.2us csv_row1 = [1, 4294967295, 1265, 16, 'Slice1', 'Slice', 'AI_VECTOR_CORE', "1699529623106750\t", 21.2, 261.56, 9, 0, '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', 0, 0, 0, 0, 0, 0, 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] # YELLOW: size=0.0492 MB, throughput=984 GB/s, task_duration=0.05us - csv_row2 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'MIX_AIV', "1699529623106751\t", 0.05, 261.56, 9, + csv_row2 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'AI_VECTOR_CORE', "1699529623106751\t", 0.05, 261.56, 9, 0, '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', - 2.3, 28888, 0.4, 0.1, 0.1, 0.7, + 0, 0, 0, 0, 0, 0, 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] # WHITE: AI_CPU csv_row3 = [1, 4294967295, 1265, 16, 'Swish1', 'Swish', 'AI_CPU', "1699529623106752\t", 3.14, 261.56, 9, @@ -136,16 +135,28 @@ class TestNpuSlowAdvice(unittest.TestCase): csv_row5 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'AI_CORE', "1699529623106754\t", 3.14, 261.56, 9, 0, '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', 2.3, 28888, 0.2, 0.1, 0.1, 0.7, - 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + 0, 0, 0, 0, 0, 0, 0, 0] # GREEN: aic_mac_ratio=0.85 - csv_row6 = [1, 
4294967295, 1265, 16, 'Add1', 'Add', 'MIX_AIC', "1699529623106754\t", 3.14, 261.56, 9, 0, + csv_row6 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'AI_CORE', "1699529623106754\t", 3.14, 261.56, 9, 0, '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', 2.3, 38888, 0.85, 0.1, 0.1, 0.7, - 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + 0, 0, 0, 0, 0, 0, 0, 0] # YELLOW: aic_mac_ratio=0.64 - csv_row7 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'MIX_AIC', "1699529623106754\t", 3.14, 261.56, 9, 0, + csv_row7 = [1, 4294967295, 1265, 16, 'Add1', 'Add', 'AI_CORE', "1699529623106754\t", 3.14, 261.56, 9, 0, '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', 2.3, 48888, 0.64, 0.1, 0.1, 0.7, + 0, 0, 0, 0, 0, 0, 0, 0] + # WHITE: MIX_AIC + csv_row8 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'MIX_AIC', "1699529623106751\t", 0.05, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 28888, 0.4, 0.1, 0.1, 0.7, + 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] + # WHITE: MIX_AIV + csv_row9 = [1, 4294967295, 1265, 16, 'Slice2', 'Slice', 'MIX_AIV', "1699529623106751\t", 0.05, 261.56, 9, + 0, + '4,1025', 'INT64', 'FORMAT_ND', '4,1025', 'INT32', 'FORMAT_ND', 'N/A', + 2.3, 28888, 0.4, 0.1, 0.1, 0.7, 1.77, 29508, 0, 0, 0.0062, 0, 0, 5856] with os.fdopen(os.open(f"{TestNpuSlowAdvice.OUTPUT_DIR}/kernel_details.csv", os.O_WRONLY | os.O_CREAT, stat.S_IWUSR | stat.S_IRUSR), 'w') as fp: @@ -158,6 +169,8 @@ class TestNpuSlowAdvice(unittest.TestCase): csv_writer.writerow(csv_row5) csv_writer.writerow(csv_row6) csv_writer.writerow(csv_row7) + csv_writer.writerow(csv_row8) + csv_writer.writerow(csv_row9) def test_run_should_return_empty_when_ascend_pt_path_not_exist(self): interface = Interface("") @@ -174,7 +187,7 @@ class TestNpuSlowAdvice(unittest.TestCase): interface = Interface(self.ASCEND_PT_DIR) data = interface.get_data('compute', 'npu_slow') call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") - self.assertEqual(7, len(data)) + self.assertEqual(9, len(data)) self.assertEqual("", call_stack) def test_run_should_return_7_data_with_call_stack_when_new_trace_view_exists(self): @@ -186,7 +199,7 @@ class TestNpuSlowAdvice(unittest.TestCase): slow_op_data = data[data["color"] == "RED"] NpuSlowAdvice.save_to_excel(data, file_path=os.path.join(self.ASCEND_PT_DIR, "slow_op.xlsx")) call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") - self.assertEqual(7, len(data)) + self.assertEqual(9, len(data)) self.assertEqual(2, len(slow_op_data)) print(call_stack) call_stack_res = "/root/torch/module.py\n" \ @@ -202,7 +215,7 @@ class TestNpuSlowAdvice(unittest.TestCase): slow_op_data = data[data["color"] == "RED"] NpuSlowAdvice.save_to_excel(data, file_path=os.path.join(self.ASCEND_PT_DIR, "slow_op.xlsx")) call_stack = NpuSlowAdvice(self.ASCEND_PT_DIR).get_call_stack(data, index_id=0, ts_col="Start Time(us)") - self.assertEqual(7, len(data)) + self.assertEqual(9, len(data)) self.assertEqual(2, len(slow_op_data)) print(call_stack) call_stack_res = "/root/test/slice.py(116)\n\r\n" \ -- Gitee From 218be9ef7753c776b87f87222803d2a10e38f120 Mon Sep 17 00:00:00 2001 From: user_10012209 <734267852@qq.com> Date: Mon, 11 Mar 2024 11:45:55 +0800 Subject: [PATCH 39/62] =?UTF-8?q?[att\profiler\compare=5Ftools]=E6=A3=80?= =?UTF-8?q?=E8=A7=86=E6=84=8F=E8=A7=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
profiler/compare_tools/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/profiler/compare_tools/README.md b/profiler/compare_tools/README.md index 80b7930157..2807e21767 100644 --- a/profiler/compare_tools/README.md +++ b/profiler/compare_tools/README.md @@ -140,7 +140,7 @@ python performance_compare.py [基准性能数据文件] [比对性能数据文 | Flash Attention Time(Backward) | Flash Attention算子反向耗时。 | | Computing Time | 计算流耗时,计算流所有event耗时总和。如果有多条并发计算,计算流耗时对重叠部分只会计算一次。 | | Mem Usage | 内存使用。GPU上的内存使用可以使用nvidia-smi查看,NPU上的内存使用可以使用npu-smi查看,Profiling信息采集时打开profile_memory=True开关,mem usage显示的是memory_record里面的最大resevered值,一般来说是进程级内存。 | -| Uncovered Communication Time(Wait Time) | 通信未掩盖耗时,包含Wait Time为同步时间。 | +| Uncovered Communication Time(Wait Time) | 通信未掩盖耗时,包含Wait Time(只有采集性能数据的Level等级为L1以上并且采集NPU数据时才会存在)为同步时间。 | | SDMA Time(Num) | 拷贝类任务耗时,Num表示计算的次数。 | | Free Time | 调度耗时 = E2E耗时 - 算子耗时 - 通信不可掩盖耗时。Free的定义为Device侧既不在通信又不在计算的时间,因此包含拷贝时间(SDMA Time)。 | | E2E Time(Not minimal profiling) | E2E总耗时,计算流端到端耗时。当存在Not minimal profiling时,表示该时间存在性能膨胀,会影响通信和调度耗时。 | -- Gitee From db1b9bc707b93b8196212dbfb6620fd56363324a Mon Sep 17 00:00:00 2001 From: w00800385 Date: Tue, 12 Mar 2024 17:05:22 +0800 Subject: [PATCH 40/62] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dcommunication=E4=B8=AD?= =?UTF-8?q?=E5=8F=AA=E6=9C=89time=5Finfo=E6=B2=A1=E6=9C=89bandwidth?= =?UTF-8?q?=E4=BF=A1=E6=81=AF=E6=97=B6=E5=AD=97=E5=85=B8=E8=AE=BF=E9=97=AE?= =?UTF-8?q?=E8=B6=8A=E7=95=8C=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../communication_group/communication_db_group.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/profiler/cluster_analyse/communication_group/communication_db_group.py b/profiler/cluster_analyse/communication_group/communication_db_group.py index e0cd7215e5..7dcc8f9c23 100644 --- a/profiler/cluster_analyse/communication_group/communication_db_group.py +++ b/profiler/cluster_analyse/communication_group/communication_db_group.py @@ -77,7 +77,8 @@ class CommunicationDBGroup(BaseCommunicationGroup): for step, data_list in time_data.items(): for data in data_list: self.compute_collective_group(data, rank_id, self.communication_time_info) - for data in bandwidth_data[step]: + for step, data_list in bandwidth_data.items(): + for data in data_list: self.compute_collective_group(data, rank_id, self.communication_bandwidth_info) for step, data_list in matrix_data.items(): self.add_p2p_and_rank(rank_id, step, matrix_data) -- Gitee From 57721fff6189a0221e46ee33b9ee9e41a1a62900 Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Tue, 12 Mar 2024 14:24:00 +0800 Subject: [PATCH 41/62] place_module_in_front_of_operator_compare --- .../compare_tools/compare_backend/generator/base_generator.py | 3 ++- .../compare_backend/generator/detail_performance_generator.py | 4 ++-- profiler/compare_tools/compare_backend/utils/excel_config.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/profiler/compare_tools/compare_backend/generator/base_generator.py b/profiler/compare_tools/compare_backend/generator/base_generator.py index c472bc9922..e77071b599 100644 --- a/profiler/compare_tools/compare_backend/generator/base_generator.py +++ b/profiler/compare_tools/compare_backend/generator/base_generator.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from collections import OrderedDict from multiprocessing import Process @@ -7,7 +8,7 @@ class BaseGenerator(Process, ABC): super(BaseGenerator, self).__init__() 
self._profiling_data_dict = profiling_data_dict self._args = args - self._result_data = {} + self._result_data = OrderedDict() def run(self): self.compare() diff --git a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py index 677b0d18c2..c7d04d6085 100644 --- a/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py +++ b/profiler/compare_tools/compare_backend/generator/detail_performance_generator.py @@ -59,10 +59,10 @@ class DetailPerformanceGenerator(BaseGenerator): comparator_list.append(CommunicationComparator(communication_data, CommunicationBean)) if self._args.enable_operator_compare: - comparator_list.append(OperatorComparator(op_compare_result, OperatorCompareBean)) - comparator_list.append(OperatorStatisticComparator(op_compare_result, OperatorStatisticBean)) if module_compare_result: comparator_list.append(ModuleStatisticComparator(module_compare_result, ModuleStatisticBean)) + comparator_list.append(OperatorComparator(op_compare_result, OperatorCompareBean)) + comparator_list.append(OperatorStatisticComparator(op_compare_result, OperatorStatisticBean)) if self._args.enable_memory_compare: comparator_list.append(OperatorComparator(op_compare_result, MemoryCompareBean)) comparator_list.append(OperatorStatisticComparator(op_compare_result, MemoryStatisticBean)) diff --git a/profiler/compare_tools/compare_backend/utils/excel_config.py b/profiler/compare_tools/compare_backend/utils/excel_config.py index 965c229ede..f58c414bde 100644 --- a/profiler/compare_tools/compare_backend/utils/excel_config.py +++ b/profiler/compare_tools/compare_backend/utils/excel_config.py @@ -56,9 +56,9 @@ class ExcelConfig(object): HOST_TOTAL_TIME = "Host Total Time(ms)" DEVICE_SELF_TIME = "Device Self Time(ms)" DEVICE_TOTAL_TIME = "Device Total Time(ms)" - DIFF_SELF_TIME = "Device Self Time Diff" + DIFF_SELF_TIME = "Device Self Time Diff(ms)" DIFF_TOTAL_RATIO = "Total Diff Ratio" - DIFF_TOTAL_TIME = "Device Total Time Diff" + DIFF_TOTAL_TIME = "Device Total Time Diff(ms)" NUMBER = "Number" MODULE_LEVEL = "Module Level" BASE_CALL_STACK = "Base Call Stack" -- Gitee From 4ff0f290dccd660175a17a4540ded9aebdaffc80 Mon Sep 17 00:00:00 2001 From: w00800385 Date: Wed, 13 Mar 2024 11:19:11 +0800 Subject: [PATCH 42/62] =?UTF-8?q?=E5=BD=93=E8=BE=93=E5=85=A5DB=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E4=B8=BA=E7=A9=BA=E6=88=96=E8=80=85=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E7=BB=93=E6=9E=9C=E4=B8=BA=E7=A9=BA=E6=97=B6=EF=BC=8C=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E6=8F=90=E7=A4=BA=E6=97=A5=E5=BF=97=E6=89=93=E5=8D=B0?= =?UTF-8?q?=EF=BC=8C=E4=B8=8D=E7=94=9F=E6=88=90DB=E6=88=96=E8=80=85?= =?UTF-8?q?=E5=AF=B9=E5=BA=94=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../communication_analysis_db.py | 5 ++++- .../comm_matrix_analysis_db.py | 5 ++++- .../analysis/step_trace_time_analysis.py | 8 +++++--- .../cluster_analyse/common_func/db_manager.py | 6 +++--- .../base_communication_group.py | 4 ++-- .../communication_db_group.py | 19 +++++++++++-------- 6 files changed, 29 insertions(+), 18 deletions(-) diff --git a/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py b/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py index 3559a9a28f..ff371cf7a8 100644 --- a/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py +++ 
b/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py @@ -32,6 +32,9 @@ class CommunicationAnalysisDB: self.dump_data() def dump_data(self): + if not self.res_comm_time and not self.res_comm_bandwidth: + print("[WARNING] There is no final communication data generated") + return output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) DBManager.create_tables(result_db, self.COMMUNICATION_TIME_TABLE, self.COMMUNICATION_BANDWIDTH_TABLE) @@ -66,7 +69,7 @@ class CommunicationAnalysisDB: if data[TableConstant.TYPE] == Constant.P2P: rank_tuple = Constant.P2P else: - rank_tuple = tuple(self.collective_group_dict.get(data[TableConstant.GROUP_NAME])) + rank_tuple = tuple(self.collective_group_dict.get(data[TableConstant.GROUP_NAME], [])) res_dict.setdefault(rank_tuple, {}).setdefault(data[TableConstant.STEP], []).append(data) def compute_total_info(self): diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py index df58fcecff..dbee80debd 100644 --- a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py +++ b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py @@ -24,6 +24,9 @@ class CommMatrixAnalysisDB: self.dump_data() def dump_data(self): + if not self.res_comm_matrix: + print("[WARNING] There is no final communication_matrix data generated") + return output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) DBManager.create_tables(result_db, self.COMMUNICATION_MATRIX_TABLE) @@ -128,6 +131,6 @@ class CommMatrixAnalysisDB: if data[TableConstant.STEP] == Constant.P2P: rank_tuple = Constant.P2P else: - rank_tuple = tuple(self.collective_group_dict.get(data[TableConstant.GROUP_NAME])) + rank_tuple = tuple(self.collective_group_dict.get(data[TableConstant.GROUP_NAME], [])) self.comm_matrix_struct.setdefault(rank_tuple, {}).setdefault(data[TableConstant.STEP], {}). 
\ setdefault(op_name, []).append(data) diff --git a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py index 20a71df3c5..f570deee1c 100644 --- a/profiler/cluster_analyse/analysis/step_trace_time_analysis.py +++ b/profiler/cluster_analyse/analysis/step_trace_time_analysis.py @@ -53,6 +53,7 @@ class StepTraceTimeAnalysis: def dump_data(self): if not self.step_data_list: print("[WARNING] Can't get step time info!") + return if self.data_type == Constant.TEXT: headers = self.get_headers() FileManager.create_csv_file(self.collection_path, self.step_data_list, self.CLUSTER_TRACE_TIME_CSV, headers) @@ -70,19 +71,20 @@ class StepTraceTimeAnalysis: for rank_id, profiling_dir_path in self.data_map.items(): if self.data_type == Constant.TEXT: step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.STEP_TIME_CSV) - if step_time_file: + if os.path.exists(step_time_file): self.step_time_dict[rank_id] = FileManager.read_csv_file(step_time_file, StepTraceTimeBean) else: step_time_file = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.DB_COMMUNICATION_ANALYZER) - if step_time_file and DBManager.check_tables_in_db(step_time_file, Constant.TABLE_STEP_TRACE): + if (os.path.exists(step_time_file) and + DBManager.check_tables_in_db(step_time_file, Constant.TABLE_STEP_TRACE)): conn, cursor = DBManager.create_connect_db(step_time_file) sql = "select * from {0}".format(Constant.TABLE_STEP_TRACE) data = DBManager.fetch_all_data(cursor, sql, is_dict=False) self.step_time_dict[rank_id] = data DBManager.destroy_db_connect(conn, cursor) if not self.step_time_dict.get(rank_id): - print(f"[WARNING] Rank {rank_id} does not have a valid step_trace_time.json.") + print(f"[WARNING] Rank {rank_id} does not have a valid step_trace_time data in {self.data_type} file.") def analyze_step_time(self): for rank_id, data_bean_list in self.step_time_dict.items(): diff --git a/profiler/cluster_analyse/common_func/db_manager.py b/profiler/cluster_analyse/common_func/db_manager.py index f19bc15dc8..bdee49be60 100644 --- a/profiler/cluster_analyse/common_func/db_manager.py +++ b/profiler/cluster_analyse/common_func/db_manager.py @@ -35,7 +35,7 @@ class DBManager: """ create and connect database """ - if check_db_path_valid(db_path): + if check_db_path_valid(db_path, is_create=True): try: conn = sqlite3.connect(db_path) except sqlite3.Error as err: @@ -100,7 +100,7 @@ class DBManager: @classmethod def check_tables_in_db(cls, db_path: any, *tables: any) -> bool: - if check_db_path_valid(db_path, True): + if check_db_path_valid(db_path): conn, curs = cls.create_connect_db(db_path) if not (conn and curs): return False @@ -114,7 +114,7 @@ class DBManager: return False @classmethod - def create_tables(cls, db_path: any, *tables: any) -> bool: + def create_tables(cls, db_path: any, *tables: any): conn, curs = cls.create_connect_db(db_path) for table_name in tables: if not cls.judge_table_exists(curs, table_name): diff --git a/profiler/cluster_analyse/communication_group/base_communication_group.py b/profiler/cluster_analyse/communication_group/base_communication_group.py index 515c77c93a..a275fefe75 100644 --- a/profiler/cluster_analyse/communication_group/base_communication_group.py +++ b/profiler/cluster_analyse/communication_group/base_communication_group.py @@ -43,11 +43,11 @@ class BaseCommunicationGroup: else: comm_dir = os.path.join(profiling_dir_path, Constant.SINGLE_OUTPUT, Constant.DB_COMMUNICATION_ANALYZER) 
matrix_dir = comm_dir - if comm_dir and matrix_dir: + if os.path.exists(comm_dir) or os.path.exists(matrix_dir): comm_op_dirs.append((rank_id, comm_dir, matrix_dir)) else: print( - f"[WARNING] Rank {rank_id} does not have a valid communication.json or communication_matrix.json.") + f"[WARNING] Rank {rank_id} does not have valid communication data and communication_matrix data.") with Pool() as p: self.rank_comm_dir_dict = p.map(self.read_communication_func, comm_op_dirs) diff --git a/profiler/cluster_analyse/communication_group/communication_db_group.py b/profiler/cluster_analyse/communication_group/communication_db_group.py index 7dcc8f9c23..c61411edab 100644 --- a/profiler/cluster_analyse/communication_group/communication_db_group.py +++ b/profiler/cluster_analyse/communication_group/communication_db_group.py @@ -23,17 +23,18 @@ class CommunicationDBGroup(BaseCommunicationGroup): time_data = {} bandwidth_data = {} matrix_data = {} - if DBManager.check_tables_in_db(db_path, Constant.TABLE_COMM_ANALYZER_TIME, - Constant.TABLE_COMM_ANALYZER_BANDWIDTH, - Constant.TABLE_COMM_ANALYZER_MATRIX): + if os.path.exists(db_path): conn, cursor = DBManager.create_connect_db(db_path) time_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_TIME) bandwidth_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_BANDWIDTH) matrix_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_MATRIX) - if self.analysis_mode in ["all", "communication_time"]: + if (DBManager.check_tables_in_db(db_path, Constant.TABLE_COMM_ANALYZER_TIME, + Constant.TABLE_COMM_ANALYZER_BANDWIDTH) + and self.analysis_mode in ["all", "communication_time"]): time_data = DBManager.fetch_all_data(cursor, time_info_sql) bandwidth_data = DBManager.fetch_all_data(cursor, bandwidth_info_sql) - if self.analysis_mode in ["all", "communication_matrix"]: + if (DBManager.check_tables_in_db(db_path, Constant.TABLE_COMM_ANALYZER_MATRIX) + and self.analysis_mode in ["all", "communication_matrix"]): matrix_data = DBManager.fetch_all_data(cursor, matrix_info_sql) DBManager.destroy_db_connect(conn, cursor) return rank_id, (self.data_group_by_step(time_data), self.data_group_by_step(bandwidth_data), @@ -49,19 +50,21 @@ class CommunicationDBGroup(BaseCommunicationGroup): def dump_data(self): output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) - DBManager.create_tables(result_db, self.COMMUNICATION_GROUP_TABLE) res = [] - conn, cursor = DBManager.create_connect_db(result_db) for data_type, data_list in self.communication_group.items(): for data in data_list: rank_set = "(" + ",".join(str(i) for i in data) + ")" data = [data_type, rank_set] res.append(data) if res: + DBManager.create_tables(result_db, self.COMMUNICATION_GROUP_TABLE) + conn, cursor = DBManager.create_connect_db(result_db) sql = "insert into {} values ({value})".format(self.COMMUNICATION_GROUP_TABLE, value="?," * (len(res[0]) - 1) + "?") DBManager.executemany_sql(conn, sql, res) - DBManager.destroy_db_connect(conn, cursor) + DBManager.destroy_db_connect(conn, cursor) + else: + print("[WARNING] The CommunicationGroup table won't be created because no data has been calculated.") comm_data_dict = { Constant.COLLECTIVE_GROUP: self.collective_group_dict, Constant.COMMUNICATION_TIME_INFO: self.communication_time_info, -- Gitee From bbb5346c58f1b41877502f23168ca4c7120c3efd Mon Sep 17 00:00:00 2001 From: user_10012209 <734267852@qq.com> Date: Thu, 14 
Mar 2024 16:05:03 +0800 Subject: [PATCH 43/62] =?UTF-8?q?[att\profiler\compare=5Ftools]=E6=8F=8F?= =?UTF-8?q?=E8=BF=B0=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/compare_tools/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/profiler/compare_tools/README.md b/profiler/compare_tools/README.md index 2807e21767..106f53e11a 100644 --- a/profiler/compare_tools/README.md +++ b/profiler/compare_tools/README.md @@ -179,7 +179,7 @@ activities配置仅采集NPU数据,不配置experimental_config参数以及其 ### nn.Module性能 -nn.Module是所有神经网络模块的基类,使用PyTorch构建神经网络需要继承nn.Module类来实现,性能比对工具支持nn.Module模块级的比对,帮助优化模型结构。 +nn.Module是所有神经网络模块的基类,使用PyTorch构建神经网络需要继承nn.Module类来实现,性能比对工具支持模块级的比对(包含优化器和nn.Module),帮助优化模型结构。 当用户采集时开启with_stack开关,会上报python function事件,当比对的双方数据都存在python function的事件时,可进行模块级别的比对。 @@ -193,8 +193,8 @@ nn.Module性能比对结果在performance_comparison_result_*.xlsl中ModuleCompa - Device Self Time(ms):该模块调用的算子(排除子模块)在device侧执行的总耗时,单位ms。 - Number:该Module或算子被调用的次数。 - Device Total Time(ms):该模块调用的算子(包含子模块)在device侧执行的总耗时,单位ms。 -- Device Total Time Diff:GPU与NPU的Device Total Time(ms)差值。 -- Device Self Time Diff:GPU与NPU的Device Self Time(ms)差值。 +- Device Total Time Diff(ms):GPU与NPU的Device Total Time(ms)差值。 +- Device Self Time Diff(ms):GPU与NPU的Device Self Time(ms)差值。 - Total Time Ratio:GPU与NPU的Device Total Time(ms)比值。 - Base Call Stack:基准文件模块的调用栈。 - Comparison Call Stack:比较文件模块的调用栈。 -- Gitee From eafceb1ff93e3d12c475027c4af308b864a296ef Mon Sep 17 00:00:00 2001 From: w00800385 Date: Fri, 15 Mar 2024 13:23:39 +0800 Subject: [PATCH 44/62] =?UTF-8?q?=E8=B0=83=E6=95=B4=E8=BE=93=E5=85=A5?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E6=A0=A1=E9=AA=8C=EF=BC=8C=E9=98=B2=E6=AD=A2?= =?UTF-8?q?ascend=20insight=E6=B7=BB=E5=8A=A0DB=E5=AF=B9=E6=A0=A1=E9=AA=8C?= =?UTF-8?q?=E9=80=A0=E6=88=90=E5=B9=B2=E6=89=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/cluster_analyse/cluster_analysis.py | 35 ++++--------------- .../pytorch_data_preprocessor.py | 35 ++++++++++++++++++- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/profiler/cluster_analyse/cluster_analysis.py b/profiler/cluster_analyse/cluster_analysis.py index 68eae526fb..90b82acd48 100644 --- a/profiler/cluster_analyse/cluster_analysis.py +++ b/profiler/cluster_analyse/cluster_analysis.py @@ -14,7 +14,6 @@ # limitations under the License. 
import argparse -import glob import os from cluster_data_preprocess.pytorch_data_preprocessor import PytorchDataPreprocessor @@ -29,8 +28,6 @@ from analysis.analysis_facade import AnalysisFacade class Interface: ASCEND_PT = "ascend_pt" ASCEND_MS = "ascend_ms" - DB_RESULT_INFO = "*.db" - ALL_RESULT_INFO = "*.*" def __init__(self, params: dict): self.collection_path = PathManager.get_realpath(params.get(Constant.COLLECTION_PATH)) @@ -41,25 +38,6 @@ class Interface: self.communication_ops = [] self.matrix_ops = [] - def check_db_or_other_files(self, data_map: dict) -> tuple: - type_db_count = 0 - type_text_count = 0 - for _, folder_path in data_map.items(): - folder_path = os.path.join(folder_path, Constant.SINGLE_OUTPUT) - db_files = glob.glob(os.path.join(folder_path, self.DB_RESULT_INFO)) - all_files = glob.glob(os.path.join(folder_path, self.ALL_RESULT_INFO)) - if all_files and db_files and len(all_files) != len(db_files): - return False, None - if db_files: - type_db_count += 1 - else: - type_text_count += 1 - if type_db_count == len(data_map): - return True, Constant.DB - if type_text_count == len(data_map): - return True, Constant.TEXT - return False, None - def allocate_prof_data(self): ascend_pt_dirs = [] ascend_ms_dirs = [] @@ -69,24 +47,25 @@ class Interface: ascend_pt_dirs.append(os.path.join(root, dir_name)) if dir_name.endswith(self.ASCEND_MS): ascend_ms_dirs.append(os.path.join(root, dir_name)) - pt_data_map = PytorchDataPreprocessor(ascend_pt_dirs).get_data_map() + pytorch_processor = PytorchDataPreprocessor(ascend_pt_dirs) + pt_data_map = pytorch_processor.get_data_map() + data_type = pytorch_processor.get_data_type() ms_data_map = MindsporeDataPreprocessor(ascend_ms_dirs).get_data_map() if pt_data_map and ms_data_map: print("[ERROR] Can not analyze pytorch and mindspore meantime.") return [] - return pt_data_map if pt_data_map else ms_data_map + return (pt_data_map, data_type) if pt_data_map else (ms_data_map, Constant.TEXT) def run(self): PathManager.check_input_directory_path(self.collection_path) PathManager.check_path_owner_consistent(self.collection_path) FileManager.create_output_dir(self.collection_path) - data_map = self.allocate_prof_data() + data_map, data_type = self.allocate_prof_data() if not data_map: print("[WARNING] Can not get rank info or profiling data.") return - is_valid, data_type = self.check_db_or_other_files(data_map) - if not is_valid: - print("[WARNING] The current folder contains both DB and other files. Please check.") + if not data_type: + print("[ERROR] The current folder contains both DB and other files. Please check.") return params = { Constant.COLLECTION_PATH: self.collection_path, diff --git a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py index f1e4c062a7..c5e0861965 100644 --- a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py +++ b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py @@ -12,9 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import glob from collections import defaultdict import os + +from common_func.constant import Constant from common_func.file_manager import FileManager from common_func.path_manager import PathManager @@ -22,9 +24,14 @@ from common_func.path_manager import PathManager class PytorchDataPreprocessor: PROFILER_INFO_HEAD = 'profiler_info_' PROFILER_INFO_EXTENSION = '.json' + JSON_RESULT_INFO = "*.json" + CSV_RESULT_INFO = "*.csv" def __init__(self, path_list: str): self.path_list = path_list + self.db_count = 0 + self.text_count = 0 + self.valid_data_flag = True def get_data_map(self) -> dict: rank_id_map = defaultdict(list) @@ -33,6 +40,21 @@ class PytorchDataPreprocessor: if rank_id < 0: print('[Error]fail to get rankid or rankid invalid.') continue + folder_path = os.path.join(dir_name, Constant.SINGLE_OUTPUT) + db_files = glob.glob(os.path.join(folder_path, Constant.DB_COMMUNICATION_ANALYZER)) + text_files = (glob.glob(os.path.join(folder_path, self.JSON_RESULT_INFO)) + + glob.glob(os.path.join(folder_path, self.CSV_RESULT_INFO))) + if text_files and db_files: + self.valid_data_flag = False + print(f"[ERROR] Rank {rank_id} has both db and text files") + continue + if db_files: + self.db_count += 1 + elif text_files: + self.text_count += 1 + else: + print(f"[WARNING] Rank {rank_id} has no valid files") + continue rank_id_map[rank_id].append(dir_name) ret_dict = dict() @@ -55,3 +77,14 @@ class PytorchDataPreprocessor: rank_id = -1 return rank_id return -1 + + def get_data_type(self): + if self.valid_data_flag: + if self.db_count != 0 and self.text_count != 0: + return None + if self.db_count != 0: + return Constant.DB + if self.text_count != 0: + return True, Constant.TEXT + else: + return None -- Gitee From 0335314abc48353b68d12936bc150d7c8446a573 Mon Sep 17 00:00:00 2001 From: w00800385 Date: Fri, 15 Mar 2024 16:25:32 +0800 Subject: [PATCH 45/62] =?UTF-8?q?=E8=B0=83=E6=95=B4=E8=BE=93=E5=85=A5?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E6=A0=A1=E9=AA=8C=EF=BC=8C=E9=98=B2=E6=AD=A2?= =?UTF-8?q?ascend=20insight=E6=B7=BB=E5=8A=A0DB=E5=AF=B9=E6=A0=A1=E9=AA=8C?= =?UTF-8?q?=E9=80=A0=E6=88=90=E5=B9=B2=E6=89=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/cluster_analyse/cluster_analysis.py | 2 +- .../pytorch_data_preprocessor.py | 21 ++++++++----------- .../cluster_analyse/common_func/constant.py | 1 + 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/profiler/cluster_analyse/cluster_analysis.py b/profiler/cluster_analyse/cluster_analysis.py index 90b82acd48..2445462211 100644 --- a/profiler/cluster_analyse/cluster_analysis.py +++ b/profiler/cluster_analyse/cluster_analysis.py @@ -64,7 +64,7 @@ class Interface: if not data_map: print("[WARNING] Can not get rank info or profiling data.") return - if not data_type: + if data_type == Constant.INVALID: print("[ERROR] The current folder contains both DB and other files. 
Please check.") return params = { diff --git a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py index c5e0861965..7b55612845 100644 --- a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py +++ b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py @@ -31,7 +31,6 @@ class PytorchDataPreprocessor: self.path_list = path_list self.db_count = 0 self.text_count = 0 - self.valid_data_flag = True def get_data_map(self) -> dict: rank_id_map = defaultdict(list) @@ -45,9 +44,9 @@ class PytorchDataPreprocessor: text_files = (glob.glob(os.path.join(folder_path, self.JSON_RESULT_INFO)) + glob.glob(os.path.join(folder_path, self.CSV_RESULT_INFO))) if text_files and db_files: - self.valid_data_flag = False print(f"[ERROR] Rank {rank_id} has both db and text files") - continue + self.db_count, self.text_count = 1, 1 + break if db_files: self.db_count += 1 elif text_files: @@ -79,12 +78,10 @@ class PytorchDataPreprocessor: return -1 def get_data_type(self): - if self.valid_data_flag: - if self.db_count != 0 and self.text_count != 0: - return None - if self.db_count != 0: - return Constant.DB - if self.text_count != 0: - return True, Constant.TEXT - else: - return None + if self.db_count != 0 and self.text_count != 0: + return Constant.INVALID + if self.db_count != 0: + return Constant.DB + if self.text_count != 0: + return Constant.TEXT + return Constant.INVALID diff --git a/profiler/cluster_analyse/common_func/constant.py b/profiler/cluster_analyse/common_func/constant.py index 71caee40db..200244aff4 100644 --- a/profiler/cluster_analyse/common_func/constant.py +++ b/profiler/cluster_analyse/common_func/constant.py @@ -84,6 +84,7 @@ class Constant(object): # result files type TEXT = "text" DB = "db" + INVALID = "invalid" # db name DB_COMMUNICATION_ANALYZER = "analysis.db" -- Gitee From c02329f9f6039af26e2f112b2d9e4c6e5955741f Mon Sep 17 00:00:00 2001 From: w00800385 Date: Sat, 16 Mar 2024 17:44:10 +0800 Subject: [PATCH 46/62] =?UTF-8?q?=E5=90=88=E5=B9=B6DB=E4=B8=8ETEXT?= =?UTF-8?q?=E8=A7=A3=E6=9E=90=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../data_preprocessor.py | 23 +++++- .../mindspore_data_preprocessor.py | 30 ++----- .../pytorch_data_preprocessor.py | 28 ++----- .../cluster_analyse/common_func/constant.py | 3 + .../common_func/data_transfer_adapter.py | 76 +++++++++++++++++ .../base_communication_group.py | 81 ++++++++++++++++++- .../communication_db_group.py | 62 +++----------- .../communication_json_group.py | 81 ------------------- 8 files changed, 199 insertions(+), 185 deletions(-) create mode 100644 profiler/cluster_analyse/common_func/data_transfer_adapter.py diff --git a/profiler/cluster_analyse/cluster_data_preprocess/data_preprocessor.py b/profiler/cluster_analyse/cluster_data_preprocess/data_preprocessor.py index ebc9647c20..72d65ae657 100644 --- a/profiler/cluster_analyse/cluster_data_preprocess/data_preprocessor.py +++ b/profiler/cluster_analyse/cluster_data_preprocess/data_preprocessor.py @@ -12,15 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import os from abc import abstractmethod class DataPreprocessor: - def __init__(self, collection_path: str): - self.collection_path = collection_path + PROFILER_INFO_HEAD = 'profiler_info_' + PROFILER_INFO_EXTENSION = '.json' + + def __init__(self, path_list: list): + self.path_list = path_list self.data_map = {} @abstractmethod - def input_data(self): + def get_data_map(self): pass + + def get_rank_id(self, dir_name: str) -> int: + files = os.listdir(dir_name) + for file_name in files: + if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): + rank_id_str = file_name[len(self.PROFILER_INFO_HEAD): -1 * len(self.PROFILER_INFO_EXTENSION)] + try: + rank_id = int(rank_id_str) + except ValueError: + rank_id = -1 + return rank_id + return -1 diff --git a/profiler/cluster_analyse/cluster_data_preprocess/mindspore_data_preprocessor.py b/profiler/cluster_analyse/cluster_data_preprocess/mindspore_data_preprocessor.py index 85debdd31b..a3e09983dd 100644 --- a/profiler/cluster_analyse/cluster_data_preprocess/mindspore_data_preprocessor.py +++ b/profiler/cluster_analyse/cluster_data_preprocess/mindspore_data_preprocessor.py @@ -14,17 +14,14 @@ # limitations under the License. from collections import defaultdict -import os -from common_func.file_manager import FileManager -from common_func.path_manager import PathManager +from cluster_data_preprocess.data_preprocessor import DataPreprocessor -class MindsporeDataPreprocessor: - PROFILER_INFO_HEAD = 'profiler_info_' - PROFILER_INFO_EXTENSION = '.json' - def __init__(self, path_list: str): - self.path_list = path_list +class MindsporeDataPreprocessor(DataPreprocessor): + + def __init__(self, path_list: list): + super().__init__(path_list) def get_data_map(self) -> dict: rank_id_map = defaultdict(list) @@ -35,23 +32,10 @@ class MindsporeDataPreprocessor: continue rank_id_map[rank_id].append(dir_name) - ret_dict = dict() try: for (rank_id, dir_list) in rank_id_map.items(): dir_list.sort(key=lambda x: x.split('_')[-3]) - ret_dict[rank_id] = dir_list[0] + self.data_map[rank_id] = dir_list[0] except Exception as e: raise RuntimeError("Found invalid directory name!") from e - return ret_dict - - def get_rank_id(self, dir_name: str) -> int: - files = os.listdir(dir_name) - for file_name in files: - if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): - rank_id_str = file_name[len(self.PROFILER_INFO_HEAD): -1 * len(self.PROFILER_INFO_EXTENSION)] - try: - rank_id = int(rank_id_str) - except ValueError: - rank_id = -1 - return rank_id - return -1 + return self.data_map diff --git a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py index 7b55612845..943ed60f56 100644 --- a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py +++ b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py @@ -16,19 +16,16 @@ import glob from collections import defaultdict import os +from cluster_data_preprocess.data_preprocessor import DataPreprocessor from common_func.constant import Constant -from common_func.file_manager import FileManager -from common_func.path_manager import PathManager -class PytorchDataPreprocessor: - PROFILER_INFO_HEAD = 'profiler_info_' - PROFILER_INFO_EXTENSION = '.json' +class PytorchDataPreprocessor(DataPreprocessor): JSON_RESULT_INFO = "*.json" CSV_RESULT_INFO = "*.csv" - def __init__(self, path_list: 
str): - self.path_list = path_list + def __init__(self, path_list: list): + super().__init__(path_list) self.db_count = 0 self.text_count = 0 @@ -56,26 +53,13 @@ class PytorchDataPreprocessor: continue rank_id_map[rank_id].append(dir_name) - ret_dict = dict() try: for (rank_id, dir_list) in rank_id_map.items(): dir_list.sort(key=lambda x: x.split('_')[-3]) - ret_dict[rank_id] = dir_list[0] + self.data_map[rank_id] = dir_list[0] except Exception as e: raise RuntimeError("Found invalid directory name!") from e - return ret_dict - - def get_rank_id(self, dir_name: str) -> int: - files = os.listdir(dir_name) - for file_name in files: - if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): - rank_id_str = file_name[len(self.PROFILER_INFO_HEAD): -1 * len(self.PROFILER_INFO_EXTENSION)] - try: - rank_id = int(rank_id_str) - except ValueError: - rank_id = -1 - return rank_id - return -1 + return self.data_map def get_data_type(self): if self.db_count != 0 and self.text_count != 0: diff --git a/profiler/cluster_analyse/common_func/constant.py b/profiler/cluster_analyse/common_func/constant.py index 200244aff4..e75bbebe75 100644 --- a/profiler/cluster_analyse/common_func/constant.py +++ b/profiler/cluster_analyse/common_func/constant.py @@ -57,6 +57,9 @@ class Constant(object): OP_NAME = "Op Name" BANDWIDTH_GB_S = "Bandwidth(GB/s)" COMMUNICATION = "communication.json" + ELAPSE_TIME_MS = "Elapse Time(ms)" + IDLE_TIME_MS = "Idle Time(ms)" + LARGE_PACKET_RATIO = "Large Packet Ratio" # params DATA_MAP = "data_map" diff --git a/profiler/cluster_analyse/common_func/data_transfer_adapter.py b/profiler/cluster_analyse/common_func/data_transfer_adapter.py new file mode 100644 index 0000000000..fb0663e2f5 --- /dev/null +++ b/profiler/cluster_analyse/common_func/data_transfer_adapter.py @@ -0,0 +1,76 @@ +from common_func.constant import Constant +from common_func.table_constant import TableConstant + + +class DataTransferAdapter: + + def __init__(self): + pass + + @staticmethod + def transfer_comm_from_db_to_json(time_info: list, bandwidth_info: list): + result = {} + if not time_info and not bandwidth_info: + return result + for time_data in time_info: + comm_time = dict() + hccl_name = time_data[TableConstant.HCCL_OP_NAME] + "@" + time_data[TableConstant.GROUP_NAME] + comm_time[Constant.ELAPSE_TIME_MS] = time_data[TableConstant.ELAPSED_TIME] + comm_time[Constant.IDLE_TIME_MS] = time_data[TableConstant.IDLE_TIME] + comm_time[Constant.START_TIMESTAMP] = time_data[TableConstant.START_TIMESTAMP] + comm_time[Constant.SYNCHRONIZATION_TIME_MS] = time_data[TableConstant.SYNCHRONIZATION_TIME] + comm_time[Constant.TRANSIT_TIME_MS] = time_data[TableConstant.TRANSIT_TIME] + comm_time[Constant.WAIT_TIME_MS] = time_data[TableConstant.WAIT_TIME] + result.setdefault(time_data[TableConstant.STEP], {}).setdefault(time_data[TableConstant.TYPE], {}). 
\ + setdefault(hccl_name, {})[Constant.COMMUNICATION_TIME_INFO] = comm_time + hccl_set = set() + for bd_data in bandwidth_info: + hccl_name = bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME] + hccl_set.add(hccl_name) + for hccl in hccl_set: + comm_bd = dict() + for bd_data in bandwidth_info: + if hccl == (bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME]): + comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], {})[Constant.BANDWIDTH_GB_S] = \ + bd_data[TableConstant.BANDWIDTH] + comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], {})[Constant.TRANSIT_TIME_MS] = \ + bd_data[TableConstant.TRANSIT_TIME] + comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], {})[Constant.TRANSIT_SIZE_MB] = \ + bd_data[TableConstant.TRANSIT_SIZE] + comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], {})[Constant.LARGE_PACKET_RATIO] = \ + bd_data[TableConstant.LARGE_PACKET_RATIO] + comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], {}).setdefault( + Constant.SIZE_DISTRIBUTION, {})[bd_data[TableConstant.PACKAGE_SIZE]] = \ + [bd_data[TableConstant.COUNT], bd_data[TableConstant.TOTAL_DURATION]] + result.setdefault(bd_data[TableConstant.STEP], {}).setdefault(bd_data[TableConstant.TYPE], {}). \ + setdefault(hccl, {})[Constant.COMMUNICATION_BANDWIDTH_INFO] = comm_bd + return result + + def transfer_comm_from_json_to_db(self): + pass + + @staticmethod + def transfer_matrix_from_db_to_json(matrix_data: list): + result = {} + if not matrix_data: + return result + hccl_set = set() + for data in matrix_data: + hccl = data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME] + hccl_set.add(hccl) + for hccl in hccl_set: + matrix_dict = dict() + for data in matrix_data: + if hccl == (data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME]): + key = data[TableConstant.SRC_RANK] + '-' + data[TableConstant.DST_RANK] + matrix_dict.setdefault(key, {})[Constant.BANDWIDTH_GB_S] = data[TableConstant.BANDWIDTH] + matrix_dict.setdefault(key, {})[Constant.TRANSIT_TIME_MS] = data[TableConstant.TRANSIT_TIME] + matrix_dict.setdefault(key, {})[Constant.TRANSIT_SIZE_MB] = data[TableConstant.TRANSIT_SIZE] + matrix_dict.setdefault(key, {})[Constant.TRANSPORT_TYPE] = data[TableConstant.TRANSPORT_TYPE] + matrix_dict.setdefault(key, {})[Constant.OP_NAME] = data[TableConstant.OPNAME] + result.setdefault(data[TableConstant.STEP], {}).setdefault(data[TableConstant.TYPE], {})[hccl] =\ + matrix_dict + return result + + def transfer_matrix_from_json_to_db(self): + pass diff --git a/profiler/cluster_analyse/communication_group/base_communication_group.py b/profiler/cluster_analyse/communication_group/base_communication_group.py index a275fefe75..bc34d3439f 100644 --- a/profiler/cluster_analyse/communication_group/base_communication_group.py +++ b/profiler/cluster_analyse/communication_group/base_communication_group.py @@ -33,6 +33,8 @@ class BaseCommunicationGroup: self.collective_group_dict = defaultdict(set) self.p2p_comm_group = [] self.communication_group = {} + self.communication_ops = [] + self.matrix_ops = [] def load_communication_data(self): comm_op_dirs = [] @@ -102,9 +104,22 @@ class BaseCommunicationGroup: def read_communication_func(self, params: tuple): pass - @abstractmethod def analyze_communication_data(self): - pass + for rank_id, rank_id_comm_dict, rank_id_matrix_dict in self.rank_comm_dir_dict: + for step_id, step_id_dict in rank_id_comm_dict.items(): + if not isinstance(step_id_dict, dict): + print(f"[WARNING] 
rank{rank_id}'s communication.json has a wrong data struct.") + continue + self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) + for comm_op_type, comm_op_dict in step_id_dict.items(): + self.add_communication_ops(rank_id, step_id, comm_op_type, comm_op_dict) + + for step_id, step_id_dict in rank_id_matrix_dict.items(): + if not isinstance(step_id_dict, dict): + print(f"[WARNING] rank{rank_id}'s communication_matrix.json has a wrong data struct.") + continue + self.set_p2p_link(rank_id, step_id, rank_id_matrix_dict) + self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) @abstractmethod def dump_data(self): @@ -118,6 +133,68 @@ class BaseCommunicationGroup: self.generate_p2p_communication_group() return self.dump_data() + def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict): + ops = rank_id_matrix_dict.get(step_id, {}) + self.add_matrix_ops(rank_id, step_id, ops) + if not ops: + print(f"[WARNING] rank{rank_id} {step_id} do not have communication matrix ops data.") + return + p2p_ops = ops.get(Constant.P2P, {}) + for op_name, link_dict in p2p_ops.items(): + self.append_p2p_link(op_name, link_dict) + + def append_p2p_link(self, op_name, link_dict): + for link in link_dict: + if '-' not in link: + print(f"[WARNING] {op_name} has an invalid link key {link}!") + break + src_rank = int(link.split('-')[0]) + dst_rank = int(link.split('-')[1]) + if src_rank != dst_rank: + rank_set = {src_rank, dst_rank} + if rank_set in self.p2p_link: + continue + self.p2p_link.append(rank_set) + + def get_collective_ops_name(self, rank_id: int, comm_op_dict: dict): + for comm_op in comm_op_dict: + if comm_op.startswith('Total'): + continue + group_name = comm_op.split('@')[-1] + self.collective_group_dict[group_name].add(rank_id) + + def add_communication_ops(self, rank_id: str, step_id: str, comm_op_type: str, comm_op_dict: dict): + for comm_op in comm_op_dict: + if comm_op.startswith('Total'): + continue + group_name = comm_op.split('@')[-1] + self.communication_ops.append({ + Constant.RANK_ID: rank_id, + Constant.STEP_ID: step_id, + Constant.COMM_OP_TYPE: comm_op_type, + Constant.COMM_OP_NAME: comm_op, + Constant.GROUP_NAME: group_name, + Constant.COMM_OP_INFO: comm_op_dict.get(comm_op) + }) + + def add_matrix_ops(self, rank_id: int, step_id: str, step_id_dict: dict): + for comm_op_type, comm_dict in step_id_dict.items(): + if comm_op_type != Constant.COLLECTIVE and comm_op_type != Constant.P2P: + print(f"[WARNING] Unknown communication operators type!") + continue + for op_name, op_link_info in comm_dict.items(): + if op_name.startswith('Total'): + continue + group_name = op_name.split('@')[-1] + self.matrix_ops.append({ + Constant.RANK_ID: rank_id, + Constant.STEP_ID: step_id, + Constant.COMM_OP_TYPE: comm_op_type, + Constant.COMM_OP_NAME: op_name, + Constant.GROUP_NAME: group_name, + Constant.COMM_OP_INFO: op_link_info + }) + class UnionFind(object): """Disjoint Set Union""" diff --git a/profiler/cluster_analyse/communication_group/communication_db_group.py b/profiler/cluster_analyse/communication_group/communication_db_group.py index c61411edab..7e498fcde1 100644 --- a/profiler/cluster_analyse/communication_group/communication_db_group.py +++ b/profiler/cluster_analyse/communication_group/communication_db_group.py @@ -1,8 +1,8 @@ import os +from common_func.data_transfer_adapter import DataTransferAdapter from common_func.db_manager import DBManager from common_func.constant import Constant -from common_func.table_constant import 
TableConstant from communication_group.base_communication_group import BaseCommunicationGroup @@ -11,18 +11,15 @@ class CommunicationDBGroup(BaseCommunicationGroup): def __init__(self, params: dict): super().__init__(params) - self.communication_bandwidth_info = [] - self.communication_time_info = [] - self.matrix_info = [] def read_communication_func(self, params: tuple): if len(params) < 3: return -1, ({}, {}, {}) rank_id = params[0] db_path = params[1] - time_data = {} - bandwidth_data = {} - matrix_data = {} + time_data = [] + bandwidth_data = [] + matrix_data = [] if os.path.exists(db_path): conn, cursor = DBManager.create_connect_db(db_path) time_info_sql = "select * from {0}".format(Constant.TABLE_COMM_ANALYZER_TIME) @@ -37,15 +34,9 @@ class CommunicationDBGroup(BaseCommunicationGroup): and self.analysis_mode in ["all", "communication_matrix"]): matrix_data = DBManager.fetch_all_data(cursor, matrix_info_sql) DBManager.destroy_db_connect(conn, cursor) - return rank_id, (self.data_group_by_step(time_data), self.data_group_by_step(bandwidth_data), - self.data_group_by_step(matrix_data)) - - @staticmethod - def data_group_by_step(data: any) -> any: - res = {} - for item in data: - res.setdefault(item[TableConstant.STEP], []).append(item) - return res + comm_data = DataTransferAdapter.transfer_comm_from_db_to_json(time_data, bandwidth_data) + comm_matrix_data = DataTransferAdapter.transfer_matrix_from_db_to_json(matrix_data) + return rank_id, comm_data, comm_matrix_data def dump_data(self): output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) @@ -67,44 +58,9 @@ class CommunicationDBGroup(BaseCommunicationGroup): print("[WARNING] The CommunicationGroup table won't be created because no data has been calculated.") comm_data_dict = { Constant.COLLECTIVE_GROUP: self.collective_group_dict, - Constant.COMMUNICATION_TIME_INFO: self.communication_time_info, - Constant.COMMUNICATION_BANDWIDTH_INFO: self.communication_bandwidth_info, - Constant.MATRIX_OPS: self.matrix_info, + Constant.COMMUNICATION_OPS: self.communication_ops, + Constant.MATRIX_OPS: self.matrix_ops, Constant.COMMUNICATION_GROUP: self.communication_group } return comm_data_dict - def analyze_communication_data(self): - for rank_id, data_tuple in self.rank_comm_dir_dict: - time_data, bandwidth_data, matrix_data = data_tuple[0], data_tuple[1], data_tuple[2] - for step, data_list in time_data.items(): - for data in data_list: - self.compute_collective_group(data, rank_id, self.communication_time_info) - for step, data_list in bandwidth_data.items(): - for data in data_list: - self.compute_collective_group(data, rank_id, self.communication_bandwidth_info) - for step, data_list in matrix_data.items(): - self.add_p2p_and_rank(rank_id, step, matrix_data) - for data in data_list: - self.compute_collective_group(data, rank_id, self.matrix_info) - - def compute_collective_group(self, data, rank_id, res_list): - if data[TableConstant.TYPE] == Constant.COLLECTIVE: - self.collective_group_dict[data[TableConstant.GROUP_NAME]].add(rank_id) - data[TableConstant.RANK_ID] = rank_id - res_list.append(data) - - def add_p2p_and_rank(self, rank_id: int, step: str, data_dict: dict): - data_list = data_dict[step] - if not data_list: - print(f"[WARNING] rank {rank_id} {step} don't have communication matrix ops data") - return - for data in data_list: - if data[TableConstant.TYPE] != Constant.COLLECTIVE and data[TableConstant.TYPE] != Constant.P2P: - print(f"[WARNING] Unknown communication operators type!") - continue - if 
data[TableConstant.TYPE] == Constant.P2P: - if data[TableConstant.SRC_RANK] != data[TableConstant.DST_RANK]: - rank_set = {data[TableConstant.SRC_RANK], data[TableConstant.DST_RANK]} - if rank_set not in self.p2p_link: - self.p2p_link.append(rank_set) diff --git a/profiler/cluster_analyse/communication_group/communication_json_group.py b/profiler/cluster_analyse/communication_group/communication_json_group.py index da6e6c1fe4..5d6d51e92e 100644 --- a/profiler/cluster_analyse/communication_group/communication_json_group.py +++ b/profiler/cluster_analyse/communication_group/communication_json_group.py @@ -25,8 +25,6 @@ class CommunicationJsonGroup(BaseCommunicationGroup): def __init__(self, params: dict): super().__init__(params) - self.communication_ops = [] - self.matrix_ops = [] def dump_data(self): FileManager.create_json_file(self.collection_path, self.communication_group, self.COMMUNICATION_GROUP_JSON) @@ -38,23 +36,6 @@ class CommunicationJsonGroup(BaseCommunicationGroup): } return comm_data_dict - def analyze_communication_data(self): - for rank_id, rank_id_comm_dict, rank_id_matrix_dict in self.rank_comm_dir_dict: - for step_id, step_id_dict in rank_id_comm_dict.items(): - if not isinstance(step_id_dict, dict): - print(f"[WARNING] rank{rank_id}'s communication.json has a wrong data struct.") - continue - self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) - for comm_op_type, comm_op_dict in step_id_dict.items(): - self.add_communication_ops(rank_id, step_id, comm_op_type, comm_op_dict) - - for step_id, step_id_dict in rank_id_matrix_dict.items(): - if not isinstance(step_id_dict, dict): - print(f"[WARNING] rank{rank_id}'s communication_matrix.json has a wrong data struct.") - continue - self.set_p2p_link(rank_id, step_id, rank_id_matrix_dict) - self.get_collective_ops_name(rank_id, step_id_dict.get(Constant.COLLECTIVE)) - def read_communication_func(self: any, params: tuple): if len(params) < 3: return -1, {}, {} @@ -68,65 +49,3 @@ class CommunicationJsonGroup(BaseCommunicationGroup): if os.path.exists(matrix_json_path) and self.analysis_mode in ["all", "communication_matrix"]: matrix_data = FileManager.read_json_file(matrix_json_path) return rank_id, comm_data, matrix_data - - def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict): - ops = rank_id_matrix_dict.get(step_id, {}) - self.add_matrix_ops(rank_id, step_id, ops) - if not ops: - print(f"[WARNING] rank{rank_id} {step_id} do not have communication matrix ops data.") - return - p2p_ops = ops.get(Constant.P2P, {}) - for op_name, link_dict in p2p_ops.items(): - self.append_p2p_link(op_name, link_dict) - - def append_p2p_link(self, op_name, link_dict): - for link in link_dict: - if '-' not in link: - print(f"[WARNING] {op_name} has an invalid link key {link}!") - break - src_rank = int(link.split('-')[0]) - dst_rank = int(link.split('-')[1]) - if src_rank != dst_rank: - rank_set = set([src_rank, dst_rank]) - if rank_set in self.p2p_link: - continue - self.p2p_link.append(rank_set) - - def get_collective_ops_name(self, rank_id: int, comm_op_dict: dict): - for comm_op in comm_op_dict: - if comm_op.startswith('Total'): - continue - group_name = comm_op.split('@')[-1] - self.collective_group_dict[group_name].add(rank_id) - - def add_communication_ops(self, rank_id: str, step_id: str, comm_op_type: str, comm_op_dict: dict): - for comm_op in comm_op_dict: - if comm_op.startswith('Total'): - continue - group_name = comm_op.split('@')[-1] - self.communication_ops.append({ - Constant.RANK_ID: 
rank_id, - Constant.STEP_ID: step_id, - Constant.COMM_OP_TYPE: comm_op_type, - Constant.COMM_OP_NAME: comm_op, - Constant.GROUP_NAME: group_name, - Constant.COMM_OP_INFO: comm_op_dict.get(comm_op) - }) - - def add_matrix_ops(self, rank_id: int, step_id: str, step_id_dict: dict): - for comm_op_type, comm_dict in step_id_dict.items(): - if comm_op_type != Constant.COLLECTIVE and comm_op_type != Constant.P2P: - print(f"[WARNING] Unknown communication operators type!") - continue - for op_name, op_link_info in comm_dict.items(): - if op_name.startswith('Total'): - continue - group_name = op_name.split('@')[-1] - self.matrix_ops.append({ - Constant.RANK_ID: rank_id, - Constant.STEP_ID: step_id, - Constant.COMM_OP_TYPE: comm_op_type, - Constant.COMM_OP_NAME: op_name, - Constant.GROUP_NAME: group_name, - Constant.COMM_OP_INFO: op_link_info - }) -- Gitee From 258d7f06ec6b4a6aa9b19ebd61ed535f63a23980 Mon Sep 17 00:00:00 2001 From: cai-weiwei1989 <734267852@qq.com> Date: Mon, 18 Mar 2024 10:22:28 +0800 Subject: [PATCH 47/62] =?UTF-8?q?[att]=E9=A6=96=E9=A1=B5readme=E8=A1=A5?= =?UTF-8?q?=E5=85=85=E3=80=90Powered=20by=20MindStudio=E3=80=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cb203544c7..63accdcb60 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # ATT -Ascend Training Tools,昇腾训练工具链。针对训练&大模型场景,提供端到端命令行&可视化调试调优工具,帮助用户快速提高模型开发效率。 +Ascend Training Tools,昇腾训练工具链。【Powered by MindStudio】 + +针对训练&大模型场景,提供端到端命令行&可视化调试调优工具,帮助用户快速提高模型开发效率。 ## 模型训练迁移全流程 ![输入图片说明](debug/resources/model_training_migration_process.png) -- Gitee From a0422098655f184f1e7f71f8ac71903939b74b88 Mon Sep 17 00:00:00 2001 From: w00800385 Date: Sun, 17 Mar 2024 18:54:55 +0800 Subject: [PATCH 48/62] =?UTF-8?q?=E5=90=88=E5=B9=B6DB=E4=B8=8ETEXT?= =?UTF-8?q?=E7=9A=84=E8=A7=A3=E6=9E=90=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../analysis/analysis_facade.py | 7 +- ...base_analysis_json.py => base_analysis.py} | 15 +- ...alysis_json.py => comm_matrix_analysis.py} | 20 ++- .../communication/comm_analysis_generator.py | 17 -- .../communication_analysis_db.py | 168 ------------------ ...ysis_json.py => communication_analysis.py} | 25 ++- .../analysis/communication_matrix/__init__.py | 0 .../comm_matrix_analysis_db.py | 136 -------------- .../comm_matrix_generator.py | 17 -- .../pytorch_data_preprocessor.py | 35 ++-- .../cluster_analyse/common_func/constant.py | 5 + .../common_func/data_transfer_adapter.py | 76 -------- .../cluster_analyse/common_func/db_manager.py | 12 +- .../base_communication_group.py | 14 +- .../communication_db_group.py | 13 +- .../communication_json_group.py | 7 - .../communication => utils}/__init__.py | 0 .../utils/data_transfer_adapter.py | 142 +++++++++++++++ 18 files changed, 238 insertions(+), 471 deletions(-) rename profiler/cluster_analyse/analysis/{base_analysis_json.py => base_analysis.py} (86%) rename profiler/cluster_analyse/analysis/{communication_matrix/comm_matrix_analysis_json.py => comm_matrix_analysis.py} (79%) delete mode 100644 profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py delete mode 100644 profiler/cluster_analyse/analysis/communication/communication_analysis_db.py rename profiler/cluster_analyse/analysis/{communication/communication_analysis_json.py => communication_analysis.py} (76%) delete mode 100644 
profiler/cluster_analyse/analysis/communication_matrix/__init__.py delete mode 100644 profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py delete mode 100644 profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py delete mode 100644 profiler/cluster_analyse/common_func/data_transfer_adapter.py rename profiler/cluster_analyse/{analysis/communication => utils}/__init__.py (100%) create mode 100644 profiler/cluster_analyse/utils/data_transfer_adapter.py diff --git a/profiler/cluster_analyse/analysis/analysis_facade.py b/profiler/cluster_analyse/analysis/analysis_facade.py index 0b870bbaaf..06be6002e1 100644 --- a/profiler/cluster_analyse/analysis/analysis_facade.py +++ b/profiler/cluster_analyse/analysis/analysis_facade.py @@ -14,13 +14,14 @@ # limitations under the License. from multiprocessing import Process -from analysis.communication.comm_analysis_generator import CommunicationAnalysisGenerator -from analysis.communication_matrix.comm_matrix_generator import CommMatrixAnalysisGenerator + +from analysis.communication_analysis import CommunicationAnalysis +from analysis.comm_matrix_analysis import CommMatrixAnalysis from analysis.step_trace_time_analysis import StepTraceTimeAnalysis class AnalysisFacade: - analysis_module = {CommunicationAnalysisGenerator, StepTraceTimeAnalysis, CommMatrixAnalysisGenerator} + analysis_module = {CommunicationAnalysis, StepTraceTimeAnalysis, CommMatrixAnalysis} def __init__(self, params: dict): self.params = params diff --git a/profiler/cluster_analyse/analysis/base_analysis_json.py b/profiler/cluster_analyse/analysis/base_analysis.py similarity index 86% rename from profiler/cluster_analyse/analysis/base_analysis_json.py rename to profiler/cluster_analyse/analysis/base_analysis.py index 3df54b0ae2..cc803813dd 100644 --- a/profiler/cluster_analyse/analysis/base_analysis_json.py +++ b/profiler/cluster_analyse/analysis/base_analysis.py @@ -1,16 +1,19 @@ from abc import abstractmethod from common_func.constant import Constant +from utils.data_transfer_adapter import DataTransferAdapter from common_func.file_manager import FileManager -class BaseAnalysisJson: +class BaseAnalysis: def __init__(self, param: dict): self.collection_path = param.get(Constant.COLLECTION_PATH) self.data_map = param.get(Constant.DATA_MAP) + self.data_type = param.get(Constant.DATA_TYPE) self.communication_ops = [] self.collective_group_dict = param.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) self.comm_ops_struct = {} + self.adapter = DataTransferAdapter() @staticmethod def compute_ratio(dividend: float, divisor: float): @@ -40,6 +43,16 @@ class BaseAnalysisJson: if not self.comm_ops_struct: print("[WARNING] There is no final comm ops data generated") return + if self.data_type == Constant.TEXT: + self.dump_json() + else: + self.dump_db() + + @abstractmethod + def dump_db(self): + pass + + def dump_json(self): output_comm_data = {} for key in self.comm_ops_struct: output_comm_data[str(key)] = self.comm_ops_struct.get(key) diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_json.py b/profiler/cluster_analyse/analysis/comm_matrix_analysis.py similarity index 79% rename from profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_json.py rename to profiler/cluster_analyse/analysis/comm_matrix_analysis.py index 7baca7e928..8dc04471fe 100644 --- a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_json.py +++ 
b/profiler/cluster_analyse/analysis/comm_matrix_analysis.py @@ -1,11 +1,14 @@ +import os from collections import defaultdict -from analysis.base_analysis_json import BaseAnalysisJson +from analysis.base_analysis import BaseAnalysis from common_func.constant import Constant +from common_func.db_manager import DBManager -class CommMatrixAnalysisJson(BaseAnalysisJson): +class CommMatrixAnalysis(BaseAnalysis): SAVED_JSON = "cluster_communication_matrix.json" + COMMUNICATION_MATRIX_TABLE = "ClusterCommAnalyzerMatrix" def __init__(self, param: dict): super().__init__(param) @@ -25,6 +28,19 @@ class CommMatrixAnalysisJson(BaseAnalysisJson): self.combine_ops_total_info() self.dump_data() + def dump_db(self): + res_comm_matrix = self.adapter.transfer_matrix_from_json_to_db(self.comm_ops_struct) + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.COMMUNICATION_MATRIX_TABLE) + conn, cursor = DBManager.create_connect_db(result_db) + if res_comm_matrix: + res_matrix_value = [list(data.values()) for data in res_comm_matrix] + sql = "insert into {} values ({value})".format(self.COMMUNICATION_MATRIX_TABLE, + value="?," * (len(res_matrix_value[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res_matrix_value) + DBManager.destroy_db_connect(conn, cursor) + def compute_total_info(self, step_dict: dict): self.merge_same_links(step_dict) self.combine_link_info(step_dict) diff --git a/profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py b/profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py deleted file mode 100644 index 4b737b5da4..0000000000 --- a/profiler/cluster_analyse/analysis/communication/comm_analysis_generator.py +++ /dev/null @@ -1,17 +0,0 @@ -from analysis.communication.communication_analysis_db import CommunicationAnalysisDB -from analysis.communication.communication_analysis_json import CommunicationAnalysisJson -from common_func.constant import Constant - - -class CommunicationAnalysisGenerator: - - GROUP_MAP = { - Constant.DB: CommunicationAnalysisDB, - Constant.TEXT: CommunicationAnalysisJson - } - - def __init__(self, params: dict): - self.generator = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))(params) - - def run(self): - self.generator.run() diff --git a/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py b/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py deleted file mode 100644 index ff371cf7a8..0000000000 --- a/profiler/cluster_analyse/analysis/communication/communication_analysis_db.py +++ /dev/null @@ -1,168 +0,0 @@ -import os - -from analysis.base_analysis_json import BaseAnalysisJson -from common_func.db_manager import DBManager -from common_func.constant import Constant -from common_func.table_constant import TableConstant - - -class CommunicationAnalysisDB: - COMMUNICATION_BANDWIDTH_TABLE = "ClusterCommAnalyzerBandwidth" - COMMUNICATION_TIME_TABLE = "ClusterCommAnalyzerTime" - TIME_EXTENSION = "time" - RANK_BAND_TYPE = "{}-{}" - - def __init__(self, params: any): - self.collection_path = params.get(Constant.COLLECTION_PATH) - self.communication_time_info = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.COMMUNICATION_TIME_INFO) - self.communication_bandwidth_info = params.get(Constant.COMM_DATA_DICT, {}).get( - Constant.COMMUNICATION_BANDWIDTH_INFO) - self.collective_group_dict = 
params.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) - self.comm_time_struct = {} - self.comm_bandwidth_struct = {} - self.res_comm_time = [] - self.res_comm_bandwidth = [] - - def run(self): - if not self.communication_time_info and not self.communication_bandwidth_info: - return - self.split_and_add_rank_set(self.communication_time_info, self.comm_time_struct) - self.split_and_add_rank_set(self.communication_bandwidth_info, self.comm_bandwidth_struct) - self.compute_total_info() - self.dump_data() - - def dump_data(self): - if not self.res_comm_time and not self.res_comm_bandwidth: - print("[WARNING] There is no final communication data generated") - return - output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) - result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) - DBManager.create_tables(result_db, self.COMMUNICATION_TIME_TABLE, self.COMMUNICATION_BANDWIDTH_TABLE) - res_time, res_bandwidth = [], [] - conn, cursor = DBManager.create_connect_db(result_db) - for data in self.res_comm_time: - res_time.append([data[TableConstant.RANK_SET], data[TableConstant.STEP], data[TableConstant.RANK_ID], - data[TableConstant.HCCL_OP_NAME], data[TableConstant.GROUP_NAME], - data[TableConstant.START_TIMESTAMP], data[TableConstant.ELAPSED_TIME], - data[TableConstant.TRANSIT_TIME], data[TableConstant.WAIT_TIME], - data[TableConstant.SYNCHRONIZATION_TIME], data[TableConstant.IDLE_TIME], - data[TableConstant.SYNCHRONIZATION_TIME_RATIO], data[TableConstant.WAIT_TIME_RATIO]]) - if res_time: - sql = "insert into {} values ({value})".format(self.COMMUNICATION_TIME_TABLE, - value="?," * (len(res_time[0]) - 1) + "?") - DBManager.executemany_sql(conn, sql, res_time) - for data in self.res_comm_bandwidth: - res_bandwidth.append([data[TableConstant.RANK_SET], data[TableConstant.STEP], data[TableConstant.RANK_ID], - data[TableConstant.HCCL_OP_NAME], data[TableConstant.GROUP_NAME], - data[TableConstant.TRANSPORT_TYPE], data[TableConstant.TRANSIT_SIZE], - data[TableConstant.TRANSIT_TIME], data[TableConstant.BANDWIDTH], - data[TableConstant.LARGE_PACKET_RATIO], data[TableConstant.PACKAGE_SIZE], - data[TableConstant.COUNT], data[TableConstant.TOTAL_DURATION]]) - if res_bandwidth: - sql = "insert into {} values ({value})".format(self.COMMUNICATION_BANDWIDTH_TABLE, - value="?," * (len(res_bandwidth[0]) - 1) + "?") - DBManager.executemany_sql(conn, sql, res_bandwidth) - DBManager.destroy_db_connect(conn, cursor) - - def split_and_add_rank_set(self, data_list, res_dict): - for data in data_list: - if data[TableConstant.TYPE] == Constant.P2P: - rank_tuple = Constant.P2P - else: - rank_tuple = tuple(self.collective_group_dict.get(data[TableConstant.GROUP_NAME], [])) - res_dict.setdefault(rank_tuple, {}).setdefault(data[TableConstant.STEP], []).append(data) - - def compute_total_info(self): - for rank_tuple, op_dict in self.comm_time_struct.items(): - if rank_tuple != Constant.P2P: - for step, data_list in op_dict.items(): - self.compute_rank_set_total_time_info(data_list, rank_tuple) - else: - rank_set = set() - for step, data_list in op_dict.items(): - rank_set.add(data[TableConstant.RANK_ID] for data in data_list) - for step, data_list in op_dict.items(): - self.compute_rank_set_total_time_info(data_list, rank_set, True) - for rank_tuple, op_dict in self.comm_bandwidth_struct.items(): - for step, data_list in op_dict.items(): - if rank_tuple != Constant.P2P: - self.compute_rank_set_total_bandwidth_info(data_list, rank_tuple) - else: - 
self.compute_rank_set_total_bandwidth_info(data_list, rank_tuple, True) - - def compute_rank_set_total_bandwidth_info(self, data_list, rank_tuple, is_p2p=False): - if not data_list: - return - data_dict = {} - rank_tuple = "(" + ",".join(str(i) for i in rank_tuple) + ")" if not is_p2p else Constant.P2P - for data in data_list: - data[TableConstant.RANK_SET] = rank_tuple - rank_band_type = self.RANK_BAND_TYPE.format(data[TableConstant.RANK_ID], - data[TableConstant.TRANSPORT_TYPE]) - data_dict.setdefault(rank_band_type, []).append(data) - self.res_comm_bandwidth.append(data) - for rank_band_type, bandwidth_list in data_dict.items(): - package_set = set() - for data in bandwidth_list: - package_set.add(data[TableConstant.PACKAGE_SIZE]) - for package in package_set: - total_comm_bandwidth_info = dict() - for data in bandwidth_list: - self.compute_bandwidth(total_comm_bandwidth_info, data, package) - bandwidth = BaseAnalysisJson.compute_ratio(total_comm_bandwidth_info.get(TableConstant.TRANSIT_SIZE), - total_comm_bandwidth_info.get(TableConstant.TRANSIT_TIME)) - total_comm_bandwidth_info[TableConstant.BANDWIDTH] = bandwidth - total_comm_bandwidth_info[TableConstant.PACKAGE_SIZE] = package - total_comm_bandwidth_info[TableConstant.HCCL_OP_NAME] = Constant.TOTAL_OP_INFO - total_comm_bandwidth_info[TableConstant.GROUP_NAME] = "" - total_comm_bandwidth_info[TableConstant.LARGE_PACKET_RATIO] = 0.0 - self.res_comm_bandwidth.append(total_comm_bandwidth_info) - - def compute_bandwidth(self, res_dict, data_dict, package): - for key in data_dict.keys(): - if key in [TableConstant.TRANSIT_TIME, TableConstant.TRANSIT_SIZE]: - if key not in res_dict.keys(): - res_dict[key] = 0.0 - res_dict[key] += data_dict[key] - elif key in [TableConstant.COUNT, TableConstant.TOTAL_DURATION]: - if data_dict[TableConstant.PACKAGE_SIZE] == package: - if key not in res_dict.keys(): - res_dict[key] = 0.0 - res_dict[key] += data_dict[key] - else: - res_dict[key] = data_dict[key] - - def compute_time(self, res_dict, data_dict, dict_key): - if dict_key.endswith(self.TIME_EXTENSION): - if dict_key not in res_dict.keys(): - res_dict[dict_key] = 0.0 - res_dict[dict_key] += data_dict[dict_key] - else: - res_dict[dict_key] = data_dict[dict_key] - - def compute_rank_set_total_time_info(self, data_list: list, rank_tuple: any, is_p2p: bool = False): - if not data_list: - return - rank_set = "(" + ",".join(str(i) for i in rank_tuple) + ")" if not is_p2p else Constant.P2P - for rank_id in rank_tuple: - total_comm_time_info = dict() - for data in data_list: - if data[TableConstant.RANK_ID] == rank_id: - data[TableConstant.RANK_SET] = rank_set - data[TableConstant.SYNCHRONIZATION_TIME_RATIO] = 0.0 - data[TableConstant.WAIT_TIME_RATIO] = 0.0 - for key, value in data.items(): - self.compute_time(total_comm_time_info, data, key) - syn_ratio = BaseAnalysisJson.compute_ratio(total_comm_time_info.get(TableConstant.SYNCHRONIZATION_TIME), - total_comm_time_info.get(TableConstant.SYNCHRONIZATION_TIME) + - total_comm_time_info.get(TableConstant.TRANSIT_TIME)) - wait_time_ratio = BaseAnalysisJson.compute_ratio(total_comm_time_info.get(TableConstant.WAIT_TIME), - total_comm_time_info.get(TableConstant.WAIT_TIME) + - total_comm_time_info.get(TableConstant.TRANSIT_TIME)) - total_comm_time_info[TableConstant.HCCL_OP_NAME] = Constant.TOTAL_OP_INFO - total_comm_time_info[TableConstant.GROUP_NAME] = "" - total_comm_time_info[TableConstant.START_TIMESTAMP] = 0.0 - total_comm_time_info[TableConstant.WAIT_TIME_RATIO] = wait_time_ratio - 
total_comm_time_info[TableConstant.SYNCHRONIZATION_TIME_RATIO] = syn_ratio - self.res_comm_time.append(total_comm_time_info) - self.res_comm_time.extend(data_list) diff --git a/profiler/cluster_analyse/analysis/communication/communication_analysis_json.py b/profiler/cluster_analyse/analysis/communication_analysis.py similarity index 76% rename from profiler/cluster_analyse/analysis/communication/communication_analysis_json.py rename to profiler/cluster_analyse/analysis/communication_analysis.py index 7fa680fe56..3f0a9b417e 100644 --- a/profiler/cluster_analyse/analysis/communication/communication_analysis_json.py +++ b/profiler/cluster_analyse/analysis/communication_analysis.py @@ -1,11 +1,15 @@ +import os from collections import defaultdict -from analysis.base_analysis_json import BaseAnalysisJson +from analysis.base_analysis import BaseAnalysis from common_func.constant import Constant +from common_func.db_manager import DBManager -class CommunicationAnalysisJson(BaseAnalysisJson): +class CommunicationAnalysis(BaseAnalysis): SAVED_JSON = "cluster_communication.json" + COMMUNICATION_BANDWIDTH_TABLE = "ClusterCommAnalyzerBandwidth" + COMMUNICATION_TIME_TABLE = "ClusterCommAnalyzerTime" def __init__(self, param: dict): super().__init__(param) @@ -24,6 +28,23 @@ class CommunicationAnalysisJson(BaseAnalysisJson): self.combine_ops_total_info() self.dump_data() + def dump_db(self): + res_comm_time, res_comm_bandwidth = self.adapter.transfer_comm_from_json_to_db(self.comm_ops_struct) + output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) + result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) + DBManager.create_tables(result_db, self.COMMUNICATION_TIME_TABLE, self.COMMUNICATION_BANDWIDTH_TABLE) + conn, cursor = DBManager.create_connect_db(result_db) + self.execute(conn, res_comm_time, self.COMMUNICATION_TIME_TABLE) + self.execute(conn, res_comm_bandwidth, self.COMMUNICATION_BANDWIDTH_TABLE) + DBManager.destroy_db_connect(conn, cursor) + + @staticmethod + def execute(conn, res_data, table_name): + if res_data: + res_value = [list(data.values()) for data in res_data] + sql = "insert into {} values ({value})".format(table_name, value="?," * (len(res_value[0]) - 1) + "?") + DBManager.executemany_sql(conn, sql, res_value) + def compute_total_info(self, comm_ops: dict): if not comm_ops: return diff --git a/profiler/cluster_analyse/analysis/communication_matrix/__init__.py b/profiler/cluster_analyse/analysis/communication_matrix/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py deleted file mode 100644 index dbee80debd..0000000000 --- a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_analysis_db.py +++ /dev/null @@ -1,136 +0,0 @@ -import os - -from analysis.base_analysis_json import BaseAnalysisJson -from common_func.db_manager import DBManager -from common_func.constant import Constant -from common_func.table_constant import TableConstant - - -class CommMatrixAnalysisDB: - COMMUNICATION_MATRIX_TABLE = "ClusterCommAnalyzerMatrix" - - def __init__(self, params: any): - self.collection_path = params.get(Constant.COLLECTION_PATH) - self.matrix_info = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.MATRIX_OPS) - self.collective_group_dict = params.get(Constant.COMM_DATA_DICT, {}).get(Constant.COLLECTIVE_GROUP) - 
self.comm_matrix_struct = {} - self.res_comm_matrix = [] - - def run(self): - if not self.matrix_info: - return - self.set_rank_tuple() - self.combine_total_matrix_info() - self.dump_data() - - def dump_data(self): - if not self.res_comm_matrix: - print("[WARNING] There is no final communication_matrix data generated") - return - output_path = os.path.join(self.collection_path, Constant.CLUSTER_ANALYSIS_OUTPUT) - result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) - DBManager.create_tables(result_db, self.COMMUNICATION_MATRIX_TABLE) - conn, cursor = DBManager.create_connect_db(result_db) - res = [] - for data in self.res_comm_matrix: - op_name = data.get(TableConstant.OPNAME) if data.get(TableConstant.OPNAME) is not None else "" - res.append([data[TableConstant.RANK_SET], data[TableConstant.STEP], data[TableConstant.HCCL_OP_NAME], - data[TableConstant.GROUP_NAME], data[TableConstant.SRC_RANK], data[TableConstant.DST_RANK], - data[TableConstant.TRANSIT_SIZE], data[TableConstant.TRANSIT_TIME], - data[TableConstant.BANDWIDTH], data[TableConstant.TRANSPORT_TYPE], op_name]) - if res: - sql = "insert into {} values ({value})".format(self.COMMUNICATION_MATRIX_TABLE, - value="?," * (len(res[0]) - 1) + "?") - DBManager.executemany_sql(conn, sql, res) - DBManager.destroy_db_connect(conn, cursor) - - def combine_total_matrix_info(self): - for rank_tuple, group_dict in self.comm_matrix_struct.items(): - if rank_tuple != Constant.P2P: - rank_tuple = "(" + ",".join(str(i) for i in rank_tuple) + ")" - for step, step_dict in group_dict.items(): - self.merge_same_info(step_dict, rank_tuple) - self.combine_total_info(step_dict) - - def combine_total_info(self, step_dict: dict): - link_key_set = set() - for op_name, matrix_dict in step_dict.items(): - self.res_comm_matrix.extend(matrix_dict.values()) - if BaseAnalysisJson.check_add_op(op_name): - for key in matrix_dict.keys(): - link_key_set.add(key) - for link_key in link_key_set: - total_matrix_info = dict() - total_matrix_info[TableConstant.TRANSIT_SIZE] = 0.0 - total_matrix_info[TableConstant.TRANSIT_TIME] = 0.0 - for op_name, matrix_dict in step_dict.items(): - if link_key in matrix_dict.keys() and BaseAnalysisJson.check_add_op(op_name): - total_matrix_info[TableConstant.RANK_SET] = matrix_dict[link_key][TableConstant.RANK_SET] - self.combine_link_info(total_matrix_info, matrix_dict[link_key]) - bandwidth = BaseAnalysisJson.compute_ratio(total_matrix_info[TableConstant.TRANSIT_SIZE], - total_matrix_info[TableConstant.TRANSIT_TIME]) - total_matrix_info[TableConstant.HCCL_OP_NAME] = Constant.TOTAL_OP_INFO - total_matrix_info[TableConstant.GROUP_NAME] = "" - total_matrix_info[TableConstant.BANDWIDTH] = bandwidth - self.res_comm_matrix.append(total_matrix_info) - - def combine_link_info(self, link_info, data: dict): - for col in data.keys(): - if col in [TableConstant.TRANSIT_TIME, TableConstant.TRANSIT_SIZE]: - link_info[col] += data[col] - else: - link_info[col] = data[col] - - def merge_same_info(self, step_dict: dict, rank_tuple): - def process_matrix(): - for data in op_list: - if data[TableConstant.SRC_RANK] == data[TableConstant.DST_RANK]: - if data[TableConstant.SRC_RANK] not in local_global_rank_map: - local_global_rank_map[data[TableConstant.SRC_RANK]] = data[TableConstant.RANK_ID] - elif local_global_rank_map[data[TableConstant.SRC_RANK]] != data[TableConstant.RANK_ID]: - print(f"[WARNING] In the same communication group, local ranks projecting to global ranks " - f"repeat!") - if (link_key.split('-')[0] == 
data[TableConstant.SRC_RANK] and - link_key.split('-')[1] == data[TableConstant.DST_RANK]): - self.combine_link_info(matrix_info, data) - new_matrix_list[link_key] = matrix_info - - def convert_local_to_global_rank(): - res_dict = dict() - for key, new_matrix in new_matrix_list.items(): - src_rank = new_matrix[TableConstant.SRC_RANK] - dst_rank = new_matrix[TableConstant.DST_RANK] - src_rank = local_global_rank_map[src_rank] if src_rank in local_global_rank_map else src_rank - dst_rank = local_global_rank_map[dst_rank] if dst_rank in local_global_rank_map else dst_rank - bandwidth = BaseAnalysisJson.compute_ratio(new_matrix[TableConstant.TRANSIT_SIZE], - new_matrix[TableConstant.TRANSIT_TIME]) - key = f"{src_rank}-{dst_rank}" - new_matrix[TableConstant.SRC_RANK] = src_rank - new_matrix[TableConstant.DST_RANK] = dst_rank - new_matrix[TableConstant.BANDWIDTH] = bandwidth - res_dict[key] = new_matrix - return res_dict - - local_global_rank_map = dict() - for op_name, op_list in step_dict.items(): - new_matrix_list = {} - link_key_set = set() - for op_data in op_list: - link_key_set.add(op_data[TableConstant.SRC_RANK] + "-" + op_data[TableConstant.DST_RANK]) - for link_key in link_key_set: - matrix_info = dict() - matrix_info[TableConstant.RANK_SET] = rank_tuple - matrix_info[TableConstant.TRANSIT_SIZE] = 0.0 - matrix_info[TableConstant.TRANSIT_TIME] = 0.0 - process_matrix() - step_dict[op_name] = convert_local_to_global_rank() - - def set_rank_tuple(self): - for data in self.matrix_info: - op_name = data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME] - if data[TableConstant.STEP] == Constant.P2P: - rank_tuple = Constant.P2P - else: - rank_tuple = tuple(self.collective_group_dict.get(data[TableConstant.GROUP_NAME], [])) - self.comm_matrix_struct.setdefault(rank_tuple, {}).setdefault(data[TableConstant.STEP], {}). 
\ - setdefault(op_name, []).append(data) diff --git a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py b/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py deleted file mode 100644 index 03a1826955..0000000000 --- a/profiler/cluster_analyse/analysis/communication_matrix/comm_matrix_generator.py +++ /dev/null @@ -1,17 +0,0 @@ -from analysis.communication_matrix.comm_matrix_analysis_db import CommMatrixAnalysisDB -from analysis.communication_matrix.comm_matrix_analysis_json import CommMatrixAnalysisJson -from common_func.constant import Constant - - -class CommMatrixAnalysisGenerator: - - GROUP_MAP = { - Constant.DB: CommMatrixAnalysisDB, - Constant.TEXT: CommMatrixAnalysisJson - } - - def __init__(self, params: dict): - self.generator = self.GROUP_MAP.get(params.get(Constant.DATA_TYPE))(params) - - def run(self): - self.generator.run() diff --git a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py index 943ed60f56..55c3d03958 100644 --- a/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py +++ b/profiler/cluster_analyse/cluster_data_preprocess/pytorch_data_preprocessor.py @@ -18,16 +18,14 @@ import os from cluster_data_preprocess.data_preprocessor import DataPreprocessor from common_func.constant import Constant +from common_func.file_manager import FileManager class PytorchDataPreprocessor(DataPreprocessor): - JSON_RESULT_INFO = "*.json" - CSV_RESULT_INFO = "*.csv" def __init__(self, path_list: list): super().__init__(path_list) - self.db_count = 0 - self.text_count = 0 + self.data_type = set() def get_data_map(self) -> dict: rank_id_map = defaultdict(list) @@ -36,21 +34,12 @@ class PytorchDataPreprocessor(DataPreprocessor): if rank_id < 0: print('[Error]fail to get rankid or rankid invalid.') continue - folder_path = os.path.join(dir_name, Constant.SINGLE_OUTPUT) - db_files = glob.glob(os.path.join(folder_path, Constant.DB_COMMUNICATION_ANALYZER)) - text_files = (glob.glob(os.path.join(folder_path, self.JSON_RESULT_INFO)) + - glob.glob(os.path.join(folder_path, self.CSV_RESULT_INFO))) - if text_files and db_files: - print(f"[ERROR] Rank {rank_id} has both db and text files") - self.db_count, self.text_count = 1, 1 - break - if db_files: - self.db_count += 1 - elif text_files: - self.text_count += 1 - else: - print(f"[WARNING] Rank {rank_id} has no valid files") - continue + for file_name in os.listdir(dir_name): + if file_name.startswith(self.PROFILER_INFO_HEAD) and file_name.endswith(self.PROFILER_INFO_EXTENSION): + file_path = os.path.join(dir_name, file_name) + config = FileManager.read_json_file(file_path) + self.data_type.add(config.get(Constant.CONFIG, {}).get(Constant.EXPER_CONFIG, {}). 
+ get(Constant.EXPORT_TYPE, Constant.TEXT)) rank_id_map[rank_id].append(dir_name) try: @@ -62,10 +51,6 @@ class PytorchDataPreprocessor(DataPreprocessor): return self.data_map def get_data_type(self): - if self.db_count != 0 and self.text_count != 0: - return Constant.INVALID - if self.db_count != 0: - return Constant.DB - if self.text_count != 0: - return Constant.TEXT + if len(self.data_type) == 1: + return self.data_type.pop() return Constant.INVALID diff --git a/profiler/cluster_analyse/common_func/constant.py b/profiler/cluster_analyse/common_func/constant.py index e75bbebe75..3b4126de79 100644 --- a/profiler/cluster_analyse/common_func/constant.py +++ b/profiler/cluster_analyse/common_func/constant.py @@ -98,3 +98,8 @@ class Constant(object): TABLE_COMM_ANALYZER_TIME = "CommAnalyzerTime" TABLE_COMM_ANALYZER_MATRIX = "CommAnalyzerMatrix" TABLE_STEP_TRACE = "StepTraceTime" + + # data config key + CONFIG = "config" + EXPER_CONFIG = "experimental_config" + EXPORT_TYPE = "_export_type" diff --git a/profiler/cluster_analyse/common_func/data_transfer_adapter.py b/profiler/cluster_analyse/common_func/data_transfer_adapter.py deleted file mode 100644 index fb0663e2f5..0000000000 --- a/profiler/cluster_analyse/common_func/data_transfer_adapter.py +++ /dev/null @@ -1,76 +0,0 @@ -from common_func.constant import Constant -from common_func.table_constant import TableConstant - - -class DataTransferAdapter: - - def __init__(self): - pass - - @staticmethod - def transfer_comm_from_db_to_json(time_info: list, bandwidth_info: list): - result = {} - if not time_info and not bandwidth_info: - return result - for time_data in time_info: - comm_time = dict() - hccl_name = time_data[TableConstant.HCCL_OP_NAME] + "@" + time_data[TableConstant.GROUP_NAME] - comm_time[Constant.ELAPSE_TIME_MS] = time_data[TableConstant.ELAPSED_TIME] - comm_time[Constant.IDLE_TIME_MS] = time_data[TableConstant.IDLE_TIME] - comm_time[Constant.START_TIMESTAMP] = time_data[TableConstant.START_TIMESTAMP] - comm_time[Constant.SYNCHRONIZATION_TIME_MS] = time_data[TableConstant.SYNCHRONIZATION_TIME] - comm_time[Constant.TRANSIT_TIME_MS] = time_data[TableConstant.TRANSIT_TIME] - comm_time[Constant.WAIT_TIME_MS] = time_data[TableConstant.WAIT_TIME] - result.setdefault(time_data[TableConstant.STEP], {}).setdefault(time_data[TableConstant.TYPE], {}). 
\ - setdefault(hccl_name, {})[Constant.COMMUNICATION_TIME_INFO] = comm_time - hccl_set = set() - for bd_data in bandwidth_info: - hccl_name = bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME] - hccl_set.add(hccl_name) - for hccl in hccl_set: - comm_bd = dict() - for bd_data in bandwidth_info: - if hccl == (bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME]): - comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], {})[Constant.BANDWIDTH_GB_S] = \ - bd_data[TableConstant.BANDWIDTH] - comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], {})[Constant.TRANSIT_TIME_MS] = \ - bd_data[TableConstant.TRANSIT_TIME] - comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], {})[Constant.TRANSIT_SIZE_MB] = \ - bd_data[TableConstant.TRANSIT_SIZE] - comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], {})[Constant.LARGE_PACKET_RATIO] = \ - bd_data[TableConstant.LARGE_PACKET_RATIO] - comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], {}).setdefault( - Constant.SIZE_DISTRIBUTION, {})[bd_data[TableConstant.PACKAGE_SIZE]] = \ - [bd_data[TableConstant.COUNT], bd_data[TableConstant.TOTAL_DURATION]] - result.setdefault(bd_data[TableConstant.STEP], {}).setdefault(bd_data[TableConstant.TYPE], {}). \ - setdefault(hccl, {})[Constant.COMMUNICATION_BANDWIDTH_INFO] = comm_bd - return result - - def transfer_comm_from_json_to_db(self): - pass - - @staticmethod - def transfer_matrix_from_db_to_json(matrix_data: list): - result = {} - if not matrix_data: - return result - hccl_set = set() - for data in matrix_data: - hccl = data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME] - hccl_set.add(hccl) - for hccl in hccl_set: - matrix_dict = dict() - for data in matrix_data: - if hccl == (data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME]): - key = data[TableConstant.SRC_RANK] + '-' + data[TableConstant.DST_RANK] - matrix_dict.setdefault(key, {})[Constant.BANDWIDTH_GB_S] = data[TableConstant.BANDWIDTH] - matrix_dict.setdefault(key, {})[Constant.TRANSIT_TIME_MS] = data[TableConstant.TRANSIT_TIME] - matrix_dict.setdefault(key, {})[Constant.TRANSIT_SIZE_MB] = data[TableConstant.TRANSIT_SIZE] - matrix_dict.setdefault(key, {})[Constant.TRANSPORT_TYPE] = data[TableConstant.TRANSPORT_TYPE] - matrix_dict.setdefault(key, {})[Constant.OP_NAME] = data[TableConstant.OPNAME] - result.setdefault(data[TableConstant.STEP], {}).setdefault(data[TableConstant.TYPE], {})[hccl] =\ - matrix_dict - return result - - def transfer_matrix_from_json_to_db(self): - pass diff --git a/profiler/cluster_analyse/common_func/db_manager.py b/profiler/cluster_analyse/common_func/db_manager.py index bdee49be60..039473d707 100644 --- a/profiler/cluster_analyse/common_func/db_manager.py +++ b/profiler/cluster_analyse/common_func/db_manager.py @@ -117,11 +117,13 @@ class DBManager: def create_tables(cls, db_path: any, *tables: any): conn, curs = cls.create_connect_db(db_path) for table_name in tables: - if not cls.judge_table_exists(curs, table_name): - table_map = "{0}Map".format(table_name) - header_with_type = cls.sql_generate_table(table_map) - sql = "CREATE TABLE IF NOT EXISTS " + table_name + header_with_type - cls.execute_sql(conn, sql) + if cls.judge_table_exists(curs, table_name): + drop_sql = "drop table {0}".format(table_name) + cls.execute_sql(conn, drop_sql) + table_map = "{0}Map".format(table_name) + header_with_type = cls.sql_generate_table(table_map) + sql = "CREATE TABLE IF NOT EXISTS " + table_name + header_with_type + 
cls.execute_sql(conn, sql) @staticmethod def execute_sql(conn: any, sql: str, params: any = None) -> bool: diff --git a/profiler/cluster_analyse/communication_group/base_communication_group.py b/profiler/cluster_analyse/communication_group/base_communication_group.py index bc34d3439f..923d479ee7 100644 --- a/profiler/cluster_analyse/communication_group/base_communication_group.py +++ b/profiler/cluster_analyse/communication_group/base_communication_group.py @@ -20,6 +20,7 @@ from copy import deepcopy from multiprocessing import Pool from common_func.constant import Constant +from utils.data_transfer_adapter import DataTransferAdapter class BaseCommunicationGroup: @@ -35,6 +36,7 @@ class BaseCommunicationGroup: self.communication_group = {} self.communication_ops = [] self.matrix_ops = [] + self.adapter = DataTransferAdapter() def load_communication_data(self): comm_op_dirs = [] @@ -125,13 +127,23 @@ class BaseCommunicationGroup: def dump_data(self): pass + def collect_comm_data(self): + comm_data_dict = { + Constant.COLLECTIVE_GROUP: self.collective_group_dict, + Constant.COMMUNICATION_OPS: self.communication_ops, + Constant.MATRIX_OPS: self.matrix_ops, + Constant.COMMUNICATION_GROUP: self.communication_group + } + return comm_data_dict + def generate(self): self.load_communication_data() self.analyze_communication_data() self.set_p2p_groups() self.generate_collective_communication_group() self.generate_p2p_communication_group() - return self.dump_data() + self.dump_data() + return self.collect_comm_data() def set_p2p_link(self, rank_id: int, step_id: str, rank_id_matrix_dict: dict): ops = rank_id_matrix_dict.get(step_id, {}) diff --git a/profiler/cluster_analyse/communication_group/communication_db_group.py b/profiler/cluster_analyse/communication_group/communication_db_group.py index 7e498fcde1..510dcd9713 100644 --- a/profiler/cluster_analyse/communication_group/communication_db_group.py +++ b/profiler/cluster_analyse/communication_group/communication_db_group.py @@ -1,6 +1,5 @@ import os -from common_func.data_transfer_adapter import DataTransferAdapter from common_func.db_manager import DBManager from common_func.constant import Constant from communication_group.base_communication_group import BaseCommunicationGroup @@ -34,8 +33,8 @@ class CommunicationDBGroup(BaseCommunicationGroup): and self.analysis_mode in ["all", "communication_matrix"]): matrix_data = DBManager.fetch_all_data(cursor, matrix_info_sql) DBManager.destroy_db_connect(conn, cursor) - comm_data = DataTransferAdapter.transfer_comm_from_db_to_json(time_data, bandwidth_data) - comm_matrix_data = DataTransferAdapter.transfer_matrix_from_db_to_json(matrix_data) + comm_data = self.adapter.transfer_comm_from_db_to_json(time_data, bandwidth_data) + comm_matrix_data = self.adapter.transfer_matrix_from_db_to_json(matrix_data) return rank_id, comm_data, comm_matrix_data def dump_data(self): @@ -56,11 +55,3 @@ class CommunicationDBGroup(BaseCommunicationGroup): DBManager.destroy_db_connect(conn, cursor) else: print("[WARNING] The CommunicationGroup table won't be created because no data has been calculated.") - comm_data_dict = { - Constant.COLLECTIVE_GROUP: self.collective_group_dict, - Constant.COMMUNICATION_OPS: self.communication_ops, - Constant.MATRIX_OPS: self.matrix_ops, - Constant.COMMUNICATION_GROUP: self.communication_group - } - return comm_data_dict - diff --git a/profiler/cluster_analyse/communication_group/communication_json_group.py b/profiler/cluster_analyse/communication_group/communication_json_group.py index 
5d6d51e92e..f6e01e3abf 100644 --- a/profiler/cluster_analyse/communication_group/communication_json_group.py +++ b/profiler/cluster_analyse/communication_group/communication_json_group.py @@ -28,13 +28,6 @@ class CommunicationJsonGroup(BaseCommunicationGroup): def dump_data(self): FileManager.create_json_file(self.collection_path, self.communication_group, self.COMMUNICATION_GROUP_JSON) - comm_data_dict = { - Constant.COLLECTIVE_GROUP: self.collective_group_dict, - Constant.COMMUNICATION_OPS: self.communication_ops, - Constant.MATRIX_OPS: self.matrix_ops, - Constant.COMMUNICATION_GROUP: self.communication_group - } - return comm_data_dict def read_communication_func(self: any, params: tuple): if len(params) < 3: diff --git a/profiler/cluster_analyse/analysis/communication/__init__.py b/profiler/cluster_analyse/utils/__init__.py similarity index 100% rename from profiler/cluster_analyse/analysis/communication/__init__.py rename to profiler/cluster_analyse/utils/__init__.py diff --git a/profiler/cluster_analyse/utils/data_transfer_adapter.py b/profiler/cluster_analyse/utils/data_transfer_adapter.py new file mode 100644 index 0000000000..1f306415fa --- /dev/null +++ b/profiler/cluster_analyse/utils/data_transfer_adapter.py @@ -0,0 +1,142 @@ +import copy + +from common_func.constant import Constant +from common_func.table_constant import TableConstant + + +class DataTransferAdapter(object): + COMM_TIME_TABLE_COLUMN = [TableConstant.START_TIMESTAMP, TableConstant.ELAPSED_TIME, TableConstant.TRANSIT_TIME, + TableConstant.WAIT_TIME, TableConstant.SYNCHRONIZATION_TIME, TableConstant.IDLE_TIME, + TableConstant.SYNCHRONIZATION_TIME_RATIO, TableConstant.WAIT_TIME_RATIO] + COMM_TIME_JSON_COLUMN = [Constant.START_TIMESTAMP, Constant.ELAPSE_TIME_MS, Constant.TRANSIT_TIME_MS, + Constant.WAIT_TIME_MS, Constant.SYNCHRONIZATION_TIME_MS, Constant.IDLE_TIME_MS, + Constant.SYNCHRONIZATION_TIME_RATIO, Constant.WAIT_TIME_RATIO] + MATRIX_TABLE_COLUMN = [TableConstant.TRANSIT_SIZE, TableConstant.TRANSIT_TIME, TableConstant.BANDWIDTH, + TableConstant.TRANSPORT_TYPE, TableConstant.OPNAME] + MATRIX_JSON_COLUMN = [Constant.TRANSIT_SIZE_MB, Constant.TRANSIT_TIME_MS, Constant.BANDWIDTH_GB_S, + Constant.TRANSPORT_TYPE, Constant.OP_NAME] + COMM_BD_TABLE_COLUMN = [TableConstant.TRANSIT_SIZE, TableConstant.TRANSIT_TIME, TableConstant.BANDWIDTH, + TableConstant.LARGE_PACKET_RATIO] + COMM_BD_JSON_COLUMN = [Constant.TRANSIT_SIZE_MB, Constant.TRANSIT_TIME_MS, Constant.BANDWIDTH_GB_S, + Constant.LARGE_PACKET_RATIO] + + def __init__(self): + super().__init__() + + def transfer_comm_from_db_to_json(self, time_info: list, bandwidth_info: list): + result = {} + if not time_info and not bandwidth_info: + return result + for time_data in time_info: + comm_time = dict() + hccl_name = time_data[TableConstant.HCCL_OP_NAME] + "@" + time_data[TableConstant.GROUP_NAME] + for key, value in dict(zip(self.COMM_TIME_JSON_COLUMN, self.COMM_TIME_TABLE_COLUMN)).items(): + if not key.endswith("ratio"): + comm_time[key] = time_data.get(value, 0) + result.setdefault(time_data[TableConstant.STEP], {}).setdefault(time_data[TableConstant.TYPE], {}). 
\ + setdefault(hccl_name, {})[Constant.COMMUNICATION_TIME_INFO] = comm_time + hccl_set = set() + for bd_data in bandwidth_info: + hccl_name = bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME] + hccl_set.add(hccl_name) + for hccl in hccl_set: + comm_bd = dict() + for bd_data in bandwidth_info: + if hccl == (bd_data[TableConstant.HCCL_OP_NAME] + "@" + bd_data[TableConstant.GROUP_NAME]): + temp_dict = dict() + key_dict = dict(zip(self.COMM_BD_JSON_COLUMN, self.COMM_BD_TABLE_COLUMN)) + self.set_value_by_key(temp_dict, bd_data, key_dict) + comm_bd.setdefault(bd_data[TableConstant.TRANSPORT_TYPE], temp_dict).setdefault( + Constant.SIZE_DISTRIBUTION, {})[bd_data[TableConstant.PACKAGE_SIZE]] = \ + [bd_data[TableConstant.COUNT], bd_data[TableConstant.TOTAL_DURATION]] + result.setdefault(bd_data[TableConstant.STEP], {}).setdefault(bd_data[TableConstant.TYPE], {}). \ + setdefault(hccl, {})[Constant.COMMUNICATION_BANDWIDTH_INFO] = comm_bd + return result + + def transfer_comm_from_json_to_db(self, res_data: dict): + res_comm_data, res_bd_data = list(), list() + + def split_comm_time(): + for rank_id, comm_data in op_data.items(): + time_data = comm_data.get(Constant.COMMUNICATION_TIME_INFO) + res_time = set_only_value(rank_id) + for key, value in dict(zip(self.COMM_TIME_TABLE_COLUMN, self.COMM_TIME_JSON_COLUMN)).items(): + res_time[key] = time_data.get(value, 0) + res_comm_data.append(res_time) + bd_data = comm_data.get(Constant.COMMUNICATION_BANDWIDTH_INFO, {}) + for transport_type, data in bd_data.items(): + res_bandwidth = set_only_value(rank_id) + key_dict = dict(zip(self.COMM_BD_TABLE_COLUMN, self.COMM_BD_JSON_COLUMN)) + res_bandwidth[TableConstant.TRANSPORT_TYPE] = transport_type + self.set_value_by_key(res_bandwidth, data, key_dict) + for key, value in data.get(Constant.SIZE_DISTRIBUTION, {}).items(): + res_bandwidth[TableConstant.PACKAGE_SIZE] = key + res_bandwidth[TableConstant.COUNT] = value[0] + res_bandwidth[TableConstant.TOTAL_DURATION] = value[1] + temp_dict = copy.deepcopy(res_bandwidth) + res_bd_data.append(temp_dict) + + def set_only_value(rank_id): + res_dict = dict() + res_dict[TableConstant.RANK_SET] = str(rank_set) + res_dict[TableConstant.STEP] = step + res_dict[TableConstant.RANK_ID] = rank_id + res_dict[TableConstant.HCCL_OP_NAME] = op_name.split("@")[0] if "@" in op_name else op_name + res_dict[TableConstant.GROUP_NAME] = op_name.split("@")[1] if "@" in op_name else "" + return res_dict + + for rank_set, step_dict in res_data.items(): + for step, op_dict in step_dict.items(): + for op_name, op_data in op_dict.items(): + split_comm_time() + return res_comm_data, res_bd_data + + def set_value_by_key(self, src_dict, dst_dict, key_dict): + for key, value in key_dict.items(): + src_dict[key] = dst_dict.get(value, 0) + + def transfer_matrix_from_db_to_json(self, matrix_data: list): + result = {} + if not matrix_data: + return result + hccl_set = set() + for data in matrix_data: + hccl = data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME] + hccl_set.add(hccl) + for hccl in hccl_set: + for data in matrix_data: + if hccl == (data[TableConstant.HCCL_OP_NAME] + "@" + data[TableConstant.GROUP_NAME]): + key = data[TableConstant.SRC_RANK] + '-' + data[TableConstant.DST_RANK] + temp_dict = dict() + key_dict = dict(zip(self.MATRIX_JSON_COLUMN, self.MATRIX_TABLE_COLUMN)) + self.set_value_by_key(temp_dict, data, key_dict) + result.setdefault(data[TableConstant.STEP], {}).setdefault(data[TableConstant.TYPE], {}). 
\ + setdefault(hccl, {}).setdefault(key, temp_dict) + return result + + def transfer_matrix_from_json_to_db(self, res_data: dict): + result = list() + + def split_matrix_data(): + for op_name, op_data in op_dict.items(): + for link_key, link_data in op_data.items(): + if "@" in op_name: + hccl_op_name, group_name = op_name.split("@")[0], op_name.split("@")[1] + else: + hccl_op_name, group_name = op_name, "" + matrix_data = { + TableConstant.RANK_SET: str(rank_set), + TableConstant.STEP: step, + TableConstant.HCCL_OP_NAME: hccl_op_name, + TableConstant.GROUP_NAME: group_name, + TableConstant.SRC_RANK: link_key.split("-")[0], + TableConstant.DST_RANK: link_key.split("-")[1] + } + key_dict = dict(zip(self.MATRIX_TABLE_COLUMN, self.MATRIX_JSON_COLUMN)) + self.set_value_by_key(matrix_data, link_data, key_dict) + result.append(matrix_data) + + for rank_set, step_dict in res_data.items(): + for step, op_dict in step_dict.items(): + split_matrix_data() + return result -- Gitee From c8c45ecc35b0a30dcb6aa0c71242cecd4555da5b Mon Sep 17 00:00:00 2001 From: mei-feiyao Date: Tue, 19 Mar 2024 14:23:48 +0800 Subject: [PATCH 49/62] update bind core script --- README.md | 4 + profiler/affinity_cpu_bind/README.md | 14 +- profiler/affinity_cpu_bind/__init__.py | 0 profiler/affinity_cpu_bind/bind_core.py | 419 ++++++++++++------------ 4 files changed, 215 insertions(+), 222 deletions(-) create mode 100644 profiler/affinity_cpu_bind/__init__.py diff --git a/README.md b/README.md index cb203544c7..e82feb0906 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,10 @@ Ascend Training Tools,昇腾训练工具链。针对训练&大模型场景, 提供多机多卡的集群分析能力(基于通信域的通信分析和迭代耗时分析), 当前需要配合Ascend Insight的集群分析功能使用。 +3. [affinity_cpu_bind (亲和cpu绑核工具) ](https://gitee.com/ascend/att/tree/master/profiler/affinity_cpu_bind) + + 提供亲和性CPU绑核能力,改善host_bound调度问题。 + ### [Tensorboard](https://gitee.com/ascend/att/tree/master/plugins/tensorboard-plugins/tb_plugin) Tensorboard支持NPU性能数据可视化插件PyTorch Profiler TensorBoard NPU Plugin。 diff --git a/profiler/affinity_cpu_bind/README.md b/profiler/affinity_cpu_bind/README.md index 832bd555e4..ab33065749 100644 --- a/profiler/affinity_cpu_bind/README.md +++ b/profiler/affinity_cpu_bind/README.md @@ -4,17 +4,20 @@ 昇腾亲和性CPU绑核工具支持用户无需侵入式修改工程,直接运行工具即可实现按亲和性策略绑核,提升推理或训练性能。 ### **使用方式** -1.命令行输入python3 bind_core.py -app/--application="inference/train cmd"(如果命令含多个参数,放在双引号中) +1.命令行输入python3 bind_core.py -app/--application="inference/train cmd"(如果命令含多个参数,放在双引号中)。 该方式会在拉起任务后,监测任务进程,并实施绑核,直至任务进程结束。 -2.推理或训练任务已经拉起,命令行输入python3 bind_core.py。该方式会循环查找使用到NPU卡的任务进程,并实施绑核。 +2.也可先拉起训练或推理进程,命令行再输入python3 bind_core.py。该方式会循环查找使用到NPU卡的任务进程,并实施绑核。 -3.绑核运行过程的日志默认不存盘;想保存运行日志的话,执行绑核命令时设置-l/--log参数,例如 : python3 bind_core.py -l/--log,这样就会将运行日志保存到当前路径的bind_core_xxx.txt +3.绑核运行过程的日志会保存到当前路径的bind_core_时间戳.txt。 + +4.如果希望绑核脚本在拉起后等待一定时间再执行绑核动作(比如训练进程拉起后需要一定时间预处理数据,未真正下发任务),可在执行绑核命令时设置-t/--time参数。 +例如 :python3 bind_core.py -t=10,这样就会在脚本会在等待10秒后执行绑核操作。 ### **使用须知** -1.该脚本会在拉起后查找使用到NPU卡的进程,每次查找10s,循环5次。如果找不到进程,会超时退出。 +1.该脚本用于arm服务器环境,训练或推理任务因为CPU资源分配不均等出现host_bound问题时使用,可改善问题,对于非host_bound的场景无明显改善效果。 -2.使用工具前应提前安装pstree工具,参考命令yum install -y psmisc或apt -y install psmisc。 +2.该脚本会在拉起后查找使用到NPU卡的进程,每次查找10s,循环5次。如果找不到进程,会超时退出。 3.使用前手动执行npu-smi info -t topo,出现如下类似信息,说明环境支持绑核,否则请将环境驱动包升级到Ascend HDK 23.0.RC2以上版本。 @@ -27,6 +30,7 @@ NPU5 HCCS HCCS HCCS HCCS HCCS X HCCS HCCS xx-xx NPU6 HCCS HCCS HCCS HCCS HCCS HCCS X HCCS xx-xx NPU7 HCCS HCCS HCCS HCCS HCCS HCCS HCCS X xx-xx + ... ... ... ... ... ... ... ... ... ... 
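The core-binding flow this README describes reduces to two steps: read each NPU's affine CPU range from the `CPU Affinity` column of `npu-smi info -t topo`, then pin the process found on that NPU to that range with `taskset -pc`. The sketch below illustrates only that idea; it assumes `npu-smi` and `taskset` are on the PATH and that the affinity column uses the `xx-xx` range form shown above, and it leaves out the retries, child-process handling and logging that the full bind_core.py in this patch performs.

```python
import re
import subprocess


def get_npu_cpu_affinity() -> dict:
    # Parse the trailing "xx-xx" range of each NPU row printed by
    # `npu-smi info -t topo` into a {npu_id: cpu_range} mapping.
    output = subprocess.run(['npu-smi', 'info', '-t', 'topo'],
                            shell=False, capture_output=True).stdout.decode('utf-8')
    affinity = {}
    npu_id = 0
    for line in output.splitlines():
        match = re.search(r'(\d+-\d+)\s*$', line)
        if line.startswith('NPU') and match:
            affinity[npu_id] = match.group(1)
            npu_id += 1
    return affinity


def bind_pid_to_npu(pid: int, npu_id: int, affinity: dict) -> None:
    # Pin one process to the CPU range that is affine to the NPU it runs on.
    cpu_range = affinity.get(npu_id)
    if cpu_range:
        subprocess.run(['taskset', '-pc', cpu_range, str(pid)],
                       shell=False, capture_output=True)
```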
diff --git a/profiler/affinity_cpu_bind/__init__.py b/profiler/affinity_cpu_bind/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/profiler/affinity_cpu_bind/bind_core.py b/profiler/affinity_cpu_bind/bind_core.py index a3f6d67a8e..d53693815a 100644 --- a/profiler/affinity_cpu_bind/bind_core.py +++ b/profiler/affinity_cpu_bind/bind_core.py @@ -1,227 +1,212 @@ -#! /usr/bin/python3 -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import subprocess -import re import argparse +import os +import time +import logging from datetime import datetime from datetime import timezone -import time -NPU_IDS = [] -RUNNING_PIDS = {} -NPU_CPU_AFFINITY_DICT = {} -SAVE_LOG_TO_FILE = False - -# binding core log file -nowtime = datetime.now(tz=timezone.utc) -BIND_CORE_RESULT_FILE = 'bind_core_' + \ - str(nowtime.year) + '_' + \ - str(nowtime.month) + '_' + \ - str(nowtime.day) + '_' + \ - str(nowtime.hour) + '_' + \ - str(nowtime.minute) + '_' + \ - str(nowtime.second) + '.txt' - - -# print log to logfile -def print_log_to_file(msg): - global SAVE_LOG_TO_FILE - if not SAVE_LOG_TO_FILE: - return - with open(file=BIND_CORE_RESULT_FILE, mode="a", encoding="utf-8") as f: - f.write(msg + '\n') - - -# launch training or inference process -def launch_process(cmd): - global RUNNING_CMD_PID - print_log_to_file('[INFO] Start to execute cmd: {}'.format(cmd)) - subprocess.Popen(cmd.split(), shell=False) - - -# parse input cmd -def args_parse(): - global SAVE_LOG_TO_FILE - bind_wait_core_time = 0 - parser = argparse.ArgumentParser(description='This is a sample program.') - parser.add_argument('-t', '--time', type=int, metavar='', nargs='+', help='Wait time before bind cores that you want to set. 
The unit is \'s\'') - parser.add_argument('-app', '--application', metavar='', nargs='+', help='Training or inference command that you want to run.') - parser.add_argument('-l', '--log', default=False, action='store_true', help='Switch to save running log to local file.') - args = parser.parse_args() - if args.application: - application_cmd = ' '.join(args.application) - launch_process(application_cmd) - time.sleep(10) - if args.time: - bind_wait_core_time = int(args.time[0]) - if args.log: - SAVE_LOG_TO_FILE = True - - # if time is set, wait for setting time before bind cores - if bind_wait_core_time != 0: - time.sleep(bind_wait_core_time) - - -# get npu affinity -def get_npu_affinity() -> bool: - global NPU_CPU_AFFINITY_DICT - global NPU_IDS - - get_npu_topo_cmd = 'npu-smi info -t topo' - p = subprocess.run(get_npu_topo_cmd.split(), shell=False, capture_output=True) - res = p.stdout.decode('utf-8').strip().split() - if not res: - print('[ERROR] Failed to run get npu affinity info, please check if driver version support cmd npu-smi info -t topo') - return False - - i = 0 - for v in res: - if '-' in v: - NPU_CPU_AFFINITY_DICT[NPU_IDS[i]] = v - i += 1 - for k in NPU_CPU_AFFINITY_DICT.keys(): - print_log_to_file('[INFO] Affinity CPU list {} for NPU {}'.format(NPU_CPU_AFFINITY_DICT[k], k)) - return True - - -# get total npu id -def get_total_npu_id() -> bool: - global NPU_IDS - get_npu_info_cmd = 'npu-smi info -l' - get_npu_info_process = subprocess.run(get_npu_info_cmd.split(), shell=False, capture_output=True) - get_npu_ids_cmd = 'grep ID' - get_npu_ids_process = subprocess.run(get_npu_ids_cmd.split(), shell=False, input=get_npu_info_process.stdout, capture_output=True) - res = get_npu_ids_process.stdout.decode('utf-8').strip().split() - for i in res: - if i.isdigit(): - NPU_IDS.append(int(i)) - if not NPU_IDS: - print('[ERROR] Failed to get total NPU id list, please make sure there is NPU on this device') - return False - print_log_to_file('[INFO] NPU total id list: {}'.format(NPU_IDS)) - return True - - -# get app pid on npu -def get_pid_on_npu() -> bool: - global RUNNING_PIDS - global NPU_IDS - print_log_to_file('[INFO] Begin to find running process on all NPUs') - RUNNING_PIDS.clear() - # get process pid on NPUs, retry times : 5 - for times in range(5): - for i in NPU_IDS: - get_npu_pids_cmd = 'npu-smi info -t proc-mem -i {} -c 0'.format(str(i)) - p = subprocess.run(get_npu_pids_cmd.split(), shell=False, capture_output=True) - res = p.stdout.decode('utf-8').strip().split() - - if 'Process' in res: - for v in res: - if v.startswith('id:'): - pid_on_npu = v.split(':')[1] - if i not in RUNNING_PIDS: - RUNNING_PIDS[i] = [int(pid_on_npu)] - else: - RUNNING_PIDS[i].append(int(pid_on_npu)) - - if RUNNING_PIDS: - break - print_log_to_file('[WARNING] Found no running process on all NPUs, retry times: {}, wait for 5 s'.format(times + 1)) - # wait 5 s for each time - time.sleep(5) - - # no running process on NPUs, stop - if not RUNNING_PIDS: - print_log_to_file('[INFO] Found no running process on all NPUs, stop bind cores') - print('[INFO] Now there is no running process on all NPUs, stop bind cores') - return False - - # delete repeat pid - for i in NPU_IDS: - if i not in RUNNING_PIDS: - continue - pids_npu = RUNNING_PIDS[i] - for n, pid in RUNNING_PIDS.items(): - if n != i and pid in pids_npu: - RUNNING_PIDS[n].remove(pid) - - for k in RUNNING_PIDS.keys(): - print_log_to_file('[INFO] Succeed to find running process {} on NPU {}'.format(RUNNING_PIDS[k], k)) - return True - - -# get device info 
-def get_dev_info() -> bool: - if not get_total_npu_id(): - return False - if not get_npu_affinity(): - return False - return True - - -# get process affinity -def get_process_affinity(pid): - get_affinity_cpu_cmd = 'taskset -pc {} '.format(pid) - p = subprocess.run(get_affinity_cpu_cmd.split(), shell=False, capture_output=True) - res = p.stdout.decode('utf-8').strip().split() - return res[len(res) - 1] - - -# run bind core -def run_bind_core(): - global NPU_IDS - global NPU_CPU_AFFINITY_DICT - for k, pid_list in RUNNING_PIDS.items(): - cpu_list = NPU_CPU_AFFINITY_DICT[k].split('-') - start_cpu_id = cpu_list[0] - end_cpu_id = cpu_list[1] - - for pid in pid_list: - get_child_pids_cmd = 'pstree {} -p -T'.format(pid) - p = subprocess.run(get_child_pids_cmd.split(), shell=False, capture_output=True) - res = p.stdout.decode('utf-8').strip().split() - for ele in res: - ele = re.sub(u"\\(|\\)", ",", ele) - ele_list = ele.split(',') - for sub_p in ele_list: - if sub_p.isdigit(): - sub_p = int(sub_p) - - # if process has set to right affinity, continue - current_affinity_cpu_list = get_process_affinity(sub_p) - if not current_affinity_cpu_list: - continue - current_cpu_list = current_affinity_cpu_list.split('-') - if current_cpu_list and current_cpu_list[0] == start_cpu_id and current_cpu_list[1] == end_cpu_id: - continue - print_log_to_file('[INFO] Begin to bind cores for process {} on NPU {}'.format(str(sub_p), k)) - set_affinity_cpu_cmd = 'taskset -pc {}-{} {}'.format(int(start_cpu_id), int(end_cpu_id), sub_p) - p = subprocess.run(set_affinity_cpu_cmd.split(), shell=False, capture_output=True) - print_log_to_file(p.stdout.decode('utf-8')) - - print_log_to_file('[INFO] Succeed to bind process {} on NPU {} with cpu cores list {}'.format(str(sub_p), k, NPU_CPU_AFFINITY_DICT[k])) + +class PathManager: + @classmethod + def create_file_safety(cls, path: str): + base_name = os.path.basename(path) + msg = f"Failed to create file: {base_name}" + if os.path.islink(path): + raise RuntimeError(msg) + if os.path.exists(path): + return + try: + os.close(os.open(path, os.O_WRONLY | os.O_CREAT, cls.DATA_FILE_AUTHORITY)) + except Exception as err: + raise RuntimeError(msg) from err + + +class BindCoreManager(): + DEFAULT_FIND_RUNNING_PID_TIMES = 5 + + def __init__(self): + self.npu_id_list = [] + self.running_pid_on_npu = {} + self.find_running_pid_times = self.DEFAULT_FIND_RUNNING_PID_TIMES + self.npu_affinity_cpu_dict = {} + self.log_file = '' + self._init_log_file() + + + def _init_log_file(self): + now_time = datetime.now(tz=timezone.utc) + time_stamp = str(now_time.year) + '_' + \ + str(now_time.month) + '_' + \ + str(now_time.day) + '_' + \ + str(now_time.hour) + '_' + \ + str(now_time.minute) + '_' + \ + str(now_time.second) + log_file_name = 'bind_core_' + time_stamp + '.log' + msg = f"Failed to create file: {log_file_name}" + try: + PathManager.create_file_safety(os.path.join(os.getcwd(), log_file_name)) + except RuntimeError as err: + raise RuntimeError(msg) from err + self.log_file = log_file_name + logging.basicConfig(filename=self.log_file, + level=logging.INFO, + filemode='w', + format='%(asctime)s-%(name)s-%(levelname)s-%(message)s') + + def _get_all_npu_id(self) -> None: + get_npu_info_cmd = 'npu-smi info -l' + get_npu_info_process = subprocess.run(get_npu_info_cmd.split(), shell=False, capture_output=True) + get_npu_id_cmd = 'grep ID' + get_npu_id_process = subprocess.run(get_npu_id_cmd.split(), shell=False, input=get_npu_info_process.stdout, capture_output=True) + res = 
get_npu_id_process.stdout.decode('utf-8').split() + for i in res: + if i.isdigit(): + self.npu_id_list.append(int(i)) + logging.info(f'NPU total id list: {self.npu_id_list}') + + def _get_npu_affinity(self) -> bool: + cpu_num = os.cpu_count() + cpu_num_for_each_npu = cpu_num // len(self.npu_id_list) + get_npu_topo_cmd = 'npu-smi info -t topo' + p = subprocess.run(get_npu_topo_cmd.split(), shell=False, capture_output=True) + res = p.stdout.decode('utf-8').split() + if not res: + print('[ERROR] Failed to run get npu affinity info, please check if driver version support cmd npu-smi info -t topo') + return False + + index = 0 + for v in res: + if '-' in v: + affinity_cpus = [] + cpu_lists = v.split(',') + for cpu_list in cpu_lists: + cpus = cpu_list.split('-') + if len(cpus) != 2: + continue + if int(cpus[1]) - int(cpus[0]) == cpu_num_for_each_npu - 1: + cpus[1] = str(int(cpus[1]) + cpu_num_for_each_npu) + affinity_cpus.append(cpus[0] + '-' + cpus[1]) + if index < len(self.npu_id_list): + self.npu_affinity_cpu_dict[self.npu_id_list[index]] = ','.join(affinity_cpu for affinity_cpu in affinity_cpus) + index += 1 + else: + print('[ERROR] Get affinity_cpu_list for {} npus, more than real npu num: {}'.format(index + 1, len(self.npu_id_list))) + return False + + for k in self.npu_affinity_cpu_dict.keys(): + logging.info(f'Affinity CPU list {self.npu_affinity_cpu_dict[k]} for NPU {k}') + return True + + def get_running_pid_on_npu(self) -> bool: + no_running_pids_on_npu_msg = '[INFO] Now there is no running process on all NPUs, stop bind cores' + logging.info('Begin to find running process on all NPUs') + # get running process on NPUs + for times in range(self.find_running_pid_times): + running_pid_on_npu = {} + for npu_id in self.npu_id_list: + get_npu_pids_cmd = 'npu-smi info -t proc-mem -i {} -c 0'.format(npu_id) + get_npu_pids_process = subprocess.run(get_npu_pids_cmd.split(), shell=False, capture_output=True) + res = get_npu_pids_process.stdout.decode('utf-8').split() + pid_list = [] + for value in res: + if value.startswith('id:'): + pid = value.split(':')[1] + pid_list.append(pid) + if pid_list: + running_pid_on_npu[npu_id] = list(set(pid_list)) + + if len(self.running_pid_on_npu.keys()) == len(running_pid_on_npu.keys()) and running_pid_on_npu: + self.running_pid_on_npu = running_pid_on_npu + break + + self.running_pid_on_npu = running_pid_on_npu + time.sleep(5) + + # delete repeat pid + for npu_id in self.npu_id_list: + if npu_id not in self.running_pid_on_npu: + continue + pids_on_npu = self.running_pid_on_npu[npu_id] + for pid in pids_on_npu: + for npu_id_with_pids, pids in self.running_pid_on_npu.items(): + if npu_id == npu_id_with_pids: + continue + if pid in pids: + pids_on_npu.remove(pid) + + if_running_process = False + for npu_id, pids in self.running_pid_on_npu.items(): + if not pids: + logging.info(f'There is no running process on NPU {npu_id}') + else: + logging.info(f'Succeed to find running process {pids} on NPU {npu_id}') + if_running_process = True + if not if_running_process: + print(no_running_pids_on_npu_msg) + return if_running_process + + def get_npu_info(self) -> bool: + try: + self._get_all_npu_id() + if not self._get_npu_affinity(): + return False + except subprocess.CalledProcessError: + return False + return True + + def run_bind_core(self): + if not self.running_pid_on_npu: + return + for npu, pid_list in self.running_pid_on_npu.items(): + if npu not in self.npu_affinity_cpu_dict.keys(): + logging.warning(f'Cannot find affinity cpu for npu: {npu}') + continue + 
affinity_cpu = self.npu_affinity_cpu_dict.get(npu) + for pid in pid_list: + try: + logging.info(f'Begin to bind cores for process {pid} on NPU {npu}') + set_affinity_cpu_cmd = 'taskset -pc {} {}'.format(affinity_cpu, pid) + p = subprocess.run(set_affinity_cpu_cmd.split(), shell=False, capture_output=True) + logging.info(p.stdout.decode('utf-8')) + except subprocess.CalledProcessError: + print('[ERROR] Failed to bind process {} on NPU {} with cpu cores list {}'.format(pid, npu, affinity_cpu)) + + logging.info(f'Succeed to bind process {pid} on NPU {npu} with cpu cores list {affinity_cpu}') + + def args_parse(self): + parser = argparse.ArgumentParser(description='This is a affinity cpu core bind script.') + parser.add_argument('-t', '--time', type=int, metavar='', help='Wait time before bind cores that you want to set. The unit is \'s\'.') + parser.add_argument('-app', '--application', metavar='', nargs='+', help='Training or inference command that you want to run.') + args = parser.parse_args() + if args.application: + application_cmd = ' '.join(args.application) + self.launch_process(application_cmd) + time.sleep(2) + # if time is set, wait for setting time before bind cores + if args.time: + time.sleep(args.time) + + def launch_process(self, cmd: list): + logging.info(f'Start to execute cmd: {cmd}') + try: + subprocess.Popen(cmd.split(), shell=False) + except subprocess.CalledProcessError as e: + raise RuntimeError(f'Failed to run cmd: {cmd}') from e if __name__ == '__main__': - print("[INFO] Begin to run bind-cores script...") - args_parse() - if not get_dev_info(): + print('[INFO] Begin to run bind-cores script...') + bind_core_manager = BindCoreManager() + bind_core_manager.args_parse() + + if not bind_core_manager.get_npu_info(): + print('[ERROR] Failed to get current npus info') + exit() + + if not bind_core_manager.get_running_pid_on_npu(): exit() + bind_core_manager.run_bind_core() + print('[INFO] End to run bind-cores script, the log is saved in {}'.format(bind_core_manager.log_file)) + - while True: - if not get_pid_on_npu(): - exit() - run_bind_core() -- Gitee From c04b554f5df5e9f2b1249a124111a099b19dcdf5 Mon Sep 17 00:00:00 2001 From: mei-feiyao Date: Thu, 21 Mar 2024 15:03:36 +0800 Subject: [PATCH 50/62] update bind core script readme --- README.md | 2 +- profiler/affinity_cpu_bind/README.md | 56 ++++++++++++++-------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 92a4783443..a9fe8a8fdf 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Ascend Training Tools,昇腾训练工具链。【Powered by MindStudio】 提供多机多卡的集群分析能力(基于通信域的通信分析和迭代耗时分析), 当前需要配合Ascend Insight的集群分析功能使用。 -3. [affinity_cpu_bind (亲和cpu绑核工具) ](https://gitee.com/ascend/att/tree/master/profiler/affinity_cpu_bind) +3. 
[affinity_cpu_bind (亲和性cpu绑核工具) ](https://gitee.com/ascend/att/tree/master/profiler/affinity_cpu_bind) 提供亲和性CPU绑核能力,改善host_bound调度问题。 diff --git a/profiler/affinity_cpu_bind/README.md b/profiler/affinity_cpu_bind/README.md index ab33065749..8c3b47ed51 100644 --- a/profiler/affinity_cpu_bind/README.md +++ b/profiler/affinity_cpu_bind/README.md @@ -1,38 +1,40 @@ -### **昇腾亲和性CPU绑核工具** +# 昇腾亲和性CPU绑核工具 -### **介绍** -昇腾亲和性CPU绑核工具支持用户无需侵入式修改工程,直接运行工具即可实现按亲和性策略绑核,提升推理或训练性能。 +昇腾亲和性CPU绑核工具支持用户无需修改代码,直接运行工具即可实现按CPU亲和性策略绑核,提升推理或训练性能。 -### **使用方式** -1.命令行输入python3 bind_core.py -app/--application="inference/train cmd"(如果命令含多个参数,放在双引号中)。 -该方式会在拉起任务后,监测任务进程,并实施绑核,直至任务进程结束。 +绑核工具用户arm服务器环境,对于训练或推理任务因为CPU资源调度等出现host_bound问题时使用,可改善该问题;对于非host_bound的场景无明显改善效果。 -2.也可先拉起训练或推理进程,命令行再输入python3 bind_core.py。该方式会循环查找使用到NPU卡的任务进程,并实施绑核。 +## 使用须知 -3.绑核运行过程的日志会保存到当前路径的bind_core_时间戳.txt。 +使用绑核工具前手动执行npu-smi info -t topo,出现以下类似信息,说明环境支持绑核,否则请将环境HDK包升级到Ascend HDK 23.0.RC2及以上版本。 -4.如果希望绑核脚本在拉起后等待一定时间再执行绑核动作(比如训练进程拉起后需要一定时间预处理数据,未真正下发任务),可在执行绑核命令时设置-t/--time参数。 -例如 :python3 bind_core.py -t=10,这样就会在脚本会在等待10秒后执行绑核操作。 + NPU0 NPU1 NPU2 NPU3 NPU4 NPU5 NPU6 NPU7 NPUx CPU Affinity + NPU0 X HCCS HCCS HCCS HCCS HCCS HCCS HCCS ... xx-xx + NPU1 HCCS X HCCS HCCS HCCS HCCS HCCS HCCS ... xx-xx + NPU2 HCCS HCCS X HCCS HCCS HCCS HCCS HCCS ... xx-xx + NPU3 HCCS HCCS HCCS X HCCS HCCS HCCS HCCS ... xx-xx + NPU4 HCCS HCCS HCCS HCCS X HCCS HCCS HCCS ... xx-xx + NPU5 HCCS HCCS HCCS HCCS HCCS X HCCS HCCS ... xx-xx + NPU6 HCCS HCCS HCCS HCCS HCCS HCCS X HCCS ... xx-xx + NPU7 HCCS HCCS HCCS HCCS HCCS HCCS HCCS X ... xx-xx + NPUx ... ... ... ... ... ... ... ... ... ... -### **使用须知** -1.该脚本用于arm服务器环境,训练或推理任务因为CPU资源分配不均等出现host_bound问题时使用,可改善问题,对于非host_bound的场景无明显改善效果。 - -2.该脚本会在拉起后查找使用到NPU卡的进程,每次查找10s,循环5次。如果找不到进程,会超时退出。 - -3.使用前手动执行npu-smi info -t topo,出现如下类似信息,说明环境支持绑核,否则请将环境驱动包升级到Ascend HDK 23.0.RC2以上版本。 - - NPU0 NPU1 NPU2 NPU3 NPU4 NPU5 NPU6 NPU7 CPU Affinity - NPU0 X HCCS HCCS HCCS HCCS HCCS HCCS HCCS xx-xx - NPU1 HCCS X HCCS HCCS HCCS HCCS HCCS HCCS xx-xx - NPU2 HCCS HCCS X HCCS HCCS HCCS HCCS HCCS xx-xx - NPU3 HCCS HCCS HCCS X HCCS HCCS HCCS HCCS xx-xx - NPU4 HCCS HCCS HCCS HCCS X HCCS HCCS HCCS xx-xx - NPU5 HCCS HCCS HCCS HCCS HCCS X HCCS HCCS xx-xx - NPU6 HCCS HCCS HCCS HCCS HCCS HCCS X HCCS xx-xx - NPU7 HCCS HCCS HCCS HCCS HCCS HCCS HCCS X xx-xx - ... ... ... ... ... ... ... ... ... ... 
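The README asks users to run `npu-smi info -t topo` by hand to confirm that the environment supports core binding; a small probe like the sketch below performs the same check from a script. The exact column title is an assumption taken from the table in this README, so fall back to reading the topo output manually if your driver prints a different header.

```python
import subprocess


def affinity_binding_supported() -> bool:
    # A "CPU Affinity" column in the topo output indicates the driver is new
    # enough (Ascend HDK 23.0.RC2 or later) for affinity core binding.
    try:
        result = subprocess.run(['npu-smi', 'info', '-t', 'topo'],
                                shell=False, capture_output=True)
    except OSError:
        return False
    return 'CPU Affinity' in result.stdout.decode('utf-8')
```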
+## 使用方式
+1.执行以下命令实施绑核:
+ - 直接执行绑核命令
+```bash
+python3 bind_core.py -app/--application="inference/train cmd"
+```
+该方式会自动拉起训练或推理任务,检测任务进程,并实施绑核。
+ - 手动拉起训练或推理任务后再执行绑核
+```bash
+python3 bind_core.py
+```
+该方式会循环查找(循环5次,每次10s,若找不到进程,则直接退出)使用到NPU的任务进程,并实施绑核。
+2.绑核运行过程的日志会保存到当前路径的bind_core_时间戳.log。
+3.如果推理或训练进程拉起后需要一定时间预处理,才会真正执行任务,可在执行绑核命令时设置-t/--time参数(单位秒),绑核工具会在延迟配置的时间后,再实施绑核动作。例如:python3 bind_core.py -app="cmd" -t=10,配置后工具会在10秒后执行绑核操作。
\ No newline at end of file
--
Gitee

From d9ef16bd75b6d4fe913a0de4d1d53ff85ea6d4bf Mon Sep 17 00:00:00 2001
From: Mrtutu
Date: Fri, 22 Mar 2024 15:23:34 +0800
Subject: [PATCH 51/62] fix cluster

---
 .../advisor_backend/cluster_advice/cluster_advice_base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py b/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py
index 8cd9acab4c..e9be467596 100644
--- a/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py
+++ b/profiler/advisor/advisor_backend/cluster_advice/cluster_advice_base.py
@@ -46,7 +46,8 @@ class ClusterAdviceBase(AdviceBase):
 
     def cluster_analyze(self):
         parameter = {
-            Constant.COLLECTION_PATH: self.collection_path
+            Constant.COLLECTION_PATH: self.collection_path,
+            Constant.ANALYSIS_MODE: "all"
         }
         try:
             Interface(parameter).run()
--
Gitee

From da15d38c0b1b7c0d99111e5930acc1c71424208b Mon Sep 17 00:00:00 2001
From: cai-weiwei1989 <734267852@qq.com>
Date: Mon, 25 Mar 2024 11:39:17 +0800
Subject: [PATCH 52/62] =?UTF-8?q?[compare=5Ftools]=E4=BD=8E=E9=94=99?=
 =?UTF-8?q?=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 profiler/compare_tools/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/profiler/compare_tools/README.md b/profiler/compare_tools/README.md
index 106f53e11a..014fa36e4e 100644
--- a/profiler/compare_tools/README.md
+++ b/profiler/compare_tools/README.md
@@ -32,11 +32,11 @@ pip3 install numpy
 采集样例代码参考一:
 ```Python
-with torch.profiler.profile(
+with torch_npu.profiler.profile(
     profile_memory=True,  # 内存数据采集的开关
     record_shapes=True,  # 算子input shape信息采集的开关
-    schedule=torch.profiler.schedule(wait=10, warmup=0, active=1, repeat=1),
-    on_trace_ready=torch.profiler.tensorboard_trace_handler("./result_dir")
+    schedule=torch_npu.profiler.schedule(wait=10, warmup=0, active=1, repeat=1),
+    on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./result_dir")
 ) as prof:
     for step in ranges(step_number):
         train_one_step()
         prof.step()
 采集样例代码参考二:
 ```Python
-prof = torch.profiler.profile(
+prof = torch_npu.profiler.profile(
     profile_memory=True,  # 内存数据采集的开关
     record_shapes=True,  # 算子input shape信息采集的开关
-    on_trace_ready=torch.profiler.tensorboard_trace_handler("./result_dir"))
+    on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./result_dir"))
 for step in range(step_number):
     if step == 11:
         prof.start()
--
Gitee

From 4a37a40d15340d53c3b7ee2a85e832f3ee329047 Mon Sep 17 00:00:00 2001
From: zhouxianqi <13165993773@163.com>
Date: Mon, 25 Mar 2024 20:53:41 +0800
Subject: [PATCH 53/62] adpt_uncovered_communication_time_field

---
 .../advisor_backend/overall_advice/overall_summary_advice.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py
index 7cbf7d807e..9fb347d823 100644
---
a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py +++ b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py @@ -27,7 +27,7 @@ class OverallSummaryAdvice(AdviceBase): } time_name_map = { "Computing Time": "computing", - "Uncovered Communication Time": "communication", + "Uncovered Communication Time(Wait Time)": "communication", "Free Time": "free", 'Cube Time(Num)': 'Cube Time', 'Vector Time(Num)': 'Vector Time', @@ -39,7 +39,7 @@ class OverallSummaryAdvice(AdviceBase): performance_time_dict = { "Computing Time": ['Cube Time(Num)', 'Vector Time(Num)', 'Flash Attention Time(Forward)(Num)', 'Flash Attention Time(Backward)(Num)', 'Other Time'], - "Uncovered Communication Time": [], + "Uncovered Communication Time(Wait Time)": [], "Free Time": ['SDMA Time(Num)'] } @@ -112,6 +112,7 @@ class OverallSummaryAdvice(AdviceBase): if time_value == Constant.INVALID_VALUE: continue duration, _ = self.split_duration_and_num(time_value) + time_category = time_category.split("(")[0] time_category_dict[time_category] = duration self.get_sub_category_time(time_category, time_list, duration) self.cur_data["overall_data"] = time_category_dict -- Gitee From 495ad74db46309eb1e805b8daa89aafa6f3b6dcd Mon Sep 17 00:00:00 2001 From: zhouxianqi <13165993773@163.com> Date: Tue, 26 Mar 2024 14:53:01 +0800 Subject: [PATCH 54/62] bug_fix_for_overall_advice --- .../overall_advice/overall_summary_advice.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py index 9fb347d823..bdee8029b8 100644 --- a/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py +++ b/profiler/advisor/advisor_backend/overall_advice/overall_summary_advice.py @@ -146,7 +146,7 @@ class OverallSummaryAdvice(AdviceBase): overall_data = self.cur_data.get("overall_data") if not overall_data: return - e2e_time = sum([data for data in overall_data.values()]) + e2e_time = '%.3f' % sum([data for data in overall_data.values()]) overall_bottleneck = f"The Model E2E Time is {e2e_time}s.\n" comparison_bottleneck = "" for time_type, time_value in overall_data.items(): @@ -161,7 +161,9 @@ class OverallSummaryAdvice(AdviceBase): if not self._has_base_collection: continue # add comparison bottleneck - base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type, self._base_data)) + time_type_origin = "Uncovered Communication Time(Wait Time)" \ + if time_type == "Uncovered Communication Time" else time_type + base_duration, _ = self.split_duration_and_num(self.get_time_value(time_type_origin, self._base_data)) if time_value > base_duration: ratio = "{:.2%}".format(self.calculate_ratio(time_value - base_duration, base_duration)) comparison_bottleneck += f"{time_type} exceeds the benchmark by {ratio}\n" -- Gitee From 4bf7661ca36628a596034c247bdb4a1148b48338 Mon Sep 17 00:00:00 2001 From: wangqihui01 Date: Fri, 29 Mar 2024 09:52:34 +0800 Subject: [PATCH 55/62] destroy conn and curs in create table --- profiler/cluster_analyse/common_func/db_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/profiler/cluster_analyse/common_func/db_manager.py b/profiler/cluster_analyse/common_func/db_manager.py index 039473d707..7b1d641d74 100644 --- a/profiler/cluster_analyse/common_func/db_manager.py +++ b/profiler/cluster_analyse/common_func/db_manager.py @@ -116,6 +116,8 @@ class DBManager: @classmethod def create_tables(cls, 
db_path: any, *tables: any): conn, curs = cls.create_connect_db(db_path) + if not (conn and curs): + return for table_name in tables: if cls.judge_table_exists(curs, table_name): drop_sql = "drop table {0}".format(table_name) @@ -124,6 +126,7 @@ class DBManager: header_with_type = cls.sql_generate_table(table_map) sql = "CREATE TABLE IF NOT EXISTS " + table_name + header_with_type cls.execute_sql(conn, sql) + cls.destroy_db_connect(conn, curs) @staticmethod def execute_sql(conn: any, sql: str, params: any = None) -> bool: -- Gitee From b2b989450726d24fb26b58eb3038e8d30788757c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A9=AC=E5=AE=81=E6=82=A6?= Date: Tue, 2 Apr 2024 08:49:39 +0000 Subject: [PATCH 56/62] update OWNERS. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 马宁悦 --- OWNERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/OWNERS b/OWNERS index d609a18f96..6ad393ee68 100644 --- a/OWNERS +++ b/OWNERS @@ -11,6 +11,7 @@ approvers: - ly-qianxiao - blian - kun_8 +- binghamhuang reviewers: - leo920320 - wo-wenjie @@ -39,4 +40,5 @@ reviewers: - machj - zhengweifeng6 - gong-siwei -- uniteone \ No newline at end of file +- uniteone +- binghamhuang \ No newline at end of file -- Gitee From 58831e87754583057d19d4ba13c56c1f8efeea00 Mon Sep 17 00:00:00 2001 From: gitee Date: Thu, 11 Apr 2024 10:52:53 +0800 Subject: [PATCH 57/62] fix --- debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt | 2 +- debug/accuracy_tools/ptdbg_ascend/RELEASE.md | 2 +- debug/accuracy_tools/ptdbg_ascend/src/python/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt b/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt index 582dd0c8e5..a709b26836 100644 --- a/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt +++ b/debug/accuracy_tools/ptdbg_ascend/CMakeLists.txt @@ -16,4 +16,4 @@ add_custom_target(ptdbg_ascend ALL VERBATIM ) -install(CODE "execute_process(COMMAND ${PYTHON_BIN_PATH} -m pip install ${CMAKE_BINARY_DIR}/ptdbg_ascend/dist/ptdbg_ascend-5.0.T4-py3-none-any.whl --upgrade)") +install(CODE "execute_process(COMMAND ${PYTHON_BIN_PATH} -m pip install ${CMAKE_BINARY_DIR}/ptdbg_ascend/dist/ptdbg_ascend-5.0-py3-none-any.whl --upgrade)") diff --git a/debug/accuracy_tools/ptdbg_ascend/RELEASE.md b/debug/accuracy_tools/ptdbg_ascend/RELEASE.md index f4725220f1..fd7d9e93c1 100644 --- a/debug/accuracy_tools/ptdbg_ascend/RELEASE.md +++ b/debug/accuracy_tools/ptdbg_ascend/RELEASE.md @@ -1,4 +1,4 @@ -# Release 5.0.T4 +# Release 5.0 This is the initial release of Pytorch precision compare tools which was designed by the researchers and engineers in Huawei Technologies Co.,Ltd. 
\ No newline at end of file
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py
index fb5b8ff000..36082bfdef 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/setup.py
@@ -20,7 +20,7 @@ import stat
 from pathlib import Path
 import setuptools
 
-VERSION = '5.0.T4'
+VERSION = '5.0'
 
 
 def generate_ptdbg_ascend_version():
-- 
Gitee

From 2dcb91f3b853c607bb0a62baafc145cf4622b6b9 Mon Sep 17 00:00:00 2001
From: zhouxianqi <13165993773@163.com>
Date: Thu, 11 Apr 2024 16:24:28 +0800
Subject: [PATCH 58/62] optimize_subsequence_match

---
 .../compare_backend/utils/common_func.py | 74 +++++++++----------
 1 file changed, 36 insertions(+), 38 deletions(-)

diff --git a/profiler/compare_tools/compare_backend/utils/common_func.py b/profiler/compare_tools/compare_backend/utils/common_func.py
index f8f0af97b1..573b53b75e 100644
--- a/profiler/compare_tools/compare_backend/utils/common_func.py
+++ b/profiler/compare_tools/compare_backend/utils/common_func.py
@@ -1,7 +1,5 @@
 from decimal import Decimal
 
-import numpy as np
-
 
 def calculate_diff_ratio(base_value: float, comparison_value: float):
     if not base_value and not comparison_value:
@@ -42,47 +40,47 @@ def longest_common_subsequence_matching(base_ops: list, comparison_ops: list, na
             result_data[index] = [value, None]
         return result_data
 
-    result_data = []
+    base_ops.reverse()
+    comparison_ops.reverse()
     comparison_len, base_len = len(comparison_ops), len(base_ops)
-    dp = [[0] * (base_len + 1) for _ in range(comparison_len + 1)]
-    for comparison_index in range(1, comparison_len + 1):
-        for base_index in range(1, base_len + 1):
-            if name_func(base_ops[base_index - 1]) == name_func(
-                    comparison_ops[comparison_index - 1]):
-                dp[comparison_index][base_index] = dp[comparison_index - 1][base_index - 1] + 1
+    dp = [[0] * (base_len + 1)] * (comparison_len + 1)
+    dp_path = [[0] * (base_len + 1)] * (comparison_len + 1)
+
+    comparison_index, base_index = 0, 0
+    iter_comparison_data = iter(comparison_ops)
+    iter_base_data = iter(base_ops)
+    for comparison_data in iter_comparison_data:
+        for base_data in iter_base_data:
+            if name_func(comparison_data) == name_func(base_data):
+                dp[comparison_index + 1][base_index + 1] = dp[comparison_index][base_index] + 1
+                dp_path[comparison_index + 1][base_index + 1] = "D"  # D for base op and comparison op matched
+            elif dp[comparison_index][base_index + 1] > dp[comparison_index + 1][base_index]:
+                dp[comparison_index + 1][base_index + 1] = dp[comparison_index][base_index + 1]
+                dp_path[comparison_index + 1][base_index + 1] = "U"  # U for only comparison op
             else:
-                dp[comparison_index][base_index] = max(dp[comparison_index][base_index - 1],
-                                                       dp[comparison_index - 1][base_index])
+                dp[comparison_index + 1][base_index + 1] = dp[comparison_index + 1][base_index]
+                dp_path[comparison_index + 1][base_index + 1] = "L"  # L for only base op
+            base_index += 1
+        comparison_index += 1
+
     matched_op = []
     comparison_index, base_index = comparison_len, base_len
     while comparison_index > 0 and base_index > 0:
-        if name_func(base_ops[base_index - 1]) == name_func(
-                comparison_ops[comparison_index - 1]):
-            matched_op.append([comparison_index - 1, base_index - 1])
+        path_value = dp_path[comparison_index][base_index]
+        if path_value == "D":
+            matched_op.append([base_ops[base_index - 1], comparison_ops[comparison_index - 1]])
             comparison_index -= 1
             base_index -= 1
-            continue
-        if dp[comparison_index][base_index - 1] > dp[comparison_index - 1][base_index]:
-            base_index -= 1
-        else:
+        elif path_value == "U":
+            matched_op.append([None, comparison_ops[comparison_index - 1]])
             comparison_index -= 1
-    if not matched_op:
-        matched_base_index_list = []
-    else:
-        matched_op.reverse()
-        matched_op = np.array(matched_op)
-        matched_base_index_list = list(matched_op[:, 1])
-    curr_comparison_index = 0
-    for base_index, base_api_node in enumerate(base_ops):
-        if base_index not in matched_base_index_list:
-            result_data.append([base_api_node, None])
-            continue
-        matched_comparison_index = matched_op[matched_base_index_list.index(base_index), 0]
-        for comparison_index in range(curr_comparison_index, matched_comparison_index):
-            result_data.append([None, comparison_ops[comparison_index]])
-        result_data.append([base_api_node, comparison_ops[matched_comparison_index]])
-        curr_comparison_index = matched_comparison_index + 1
-    if curr_comparison_index < len(comparison_ops):
-        for comparison_index in range(curr_comparison_index, len(comparison_ops)):
-            result_data.append([None, comparison_ops[comparison_index]])
-    return result_data
+        else:
+            matched_op.append([base_ops[base_index - 1], None])
+            base_index -= 1
+    while comparison_index > 0:
+        matched_op.append([None, comparison_ops[comparison_index - 1]])
+        comparison_index -= 1
+    while base_index > 0:
+        matched_op.append([base_ops[base_index - 1], None])
+        base_index -= 1
+    return matched_op
-- 
Gitee

From 2954c0264b687fd75ce75f30181ac0e349a3ff9a Mon Sep 17 00:00:00 2001
From: zhouxianqi <13165993773@163.com>
Date: Tue, 16 Apr 2024 15:58:13 +0800
Subject: [PATCH 59/62] bug_fix_for_sequence_match

---
 .../profiling_parser/gpu_profiling_parser.py |  2 +-
 .../compare_backend/utils/common_func.py     | 44 +++++++++++--------
 2 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py
index 923854bdf7..0a32c4c922 100644
--- a/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py
+++ b/profiler/compare_tools/compare_backend/profiling_parser/gpu_profiling_parser.py
@@ -12,7 +12,7 @@ class GPUProfilingParser(BaseProfilingParser):
     FA_MARK_LIST = [['fmha', 'kernel'], ['flash', 'kernel'], ['attention', 'kernel']]
     SDMA_MARK_LIST = ['htod', 'dtod', 'dtoh', 'memset (device)']
     FLOW_CAT = ("async_gpu", "async_cpu_to_gpu", "ac2g", "async")
-    TORCH_OP_CAT = ("cpu_op", "user_annotation", "cuda_runtime", "operator")
+    TORCH_OP_CAT = ("cpu_op", "user_annotation", "cuda_runtime", "operator", "runtime")
 
     def __init__(self, args: any, path_dict: dict):
         super().__init__(args, path_dict)
diff --git a/profiler/compare_tools/compare_backend/utils/common_func.py b/profiler/compare_tools/compare_backend/utils/common_func.py
index 573b53b75e..a3cab286e3 100644
--- a/profiler/compare_tools/compare_backend/utils/common_func.py
+++ b/profiler/compare_tools/compare_backend/utils/common_func.py
@@ -1,5 +1,7 @@
 from decimal import Decimal
 
+import numpy
+
 
 def calculate_diff_ratio(base_value: float, comparison_value: float):
     if not base_value and not comparison_value:
@@ -40,42 +42,45 @@ def longest_common_subsequence_matching(base_ops: list, comparison_ops: list, na
             result_data[index] = [value, None]
         return result_data
 
-    base_ops.reverse()
-    comparison_ops.reverse()
     comparison_len, base_len = len(comparison_ops), len(base_ops)
-    dp = [[0] * (base_len + 1)] * (comparison_len + 1)
-    dp_path = [[0] * (base_len + 1)] * (comparison_len + 1)
+    dp_flag = numpy.zeros(shape=(comparison_len + 1, base_len + 1), dtype=int)
+    pre_list = [0] * (base_len + 1)
+    cur_list = [0] * (base_len + 1)
 
-    comparison_index, base_index = 0, 0
+    comparison_index = 1
     iter_comparison_data = iter(comparison_ops)
-    iter_base_data = iter(base_ops)
     for comparison_data in iter_comparison_data:
+        base_index = 1
+        iter_base_data = iter(base_ops)
         for base_data in iter_base_data:
             if name_func(comparison_data) == name_func(base_data):
-                dp[comparison_index + 1][base_index + 1] = dp[comparison_index][base_index] + 1
-                dp_path[comparison_index + 1][base_index + 1] = "D"  # D for base op and comparison op matched
-            elif dp[comparison_index][base_index + 1] > dp[comparison_index + 1][base_index]:
-                dp[comparison_index + 1][base_index + 1] = dp[comparison_index][base_index + 1]
-                dp_path[comparison_index + 1][base_index + 1] = "U"  # U for only comparison op
+                cur_list[base_index] = pre_list[base_index - 1] + 1
             else:
-                dp[comparison_index + 1][base_index + 1] = dp[comparison_index + 1][base_index]
-                dp_path[comparison_index + 1][base_index + 1] = "L"  # L for only base op
+                only_base = cur_list[base_index - 1]
+                only_comparison = pre_list[base_index]
+                if only_base < only_comparison:
+                    dp_flag[comparison_index][base_index] = 1  # 1 for only comparison op
+                    cur_list[base_index] = only_comparison
+                else:
+                    cur_list[base_index] = only_base
             base_index += 1
+        pre_list = cur_list
         comparison_index += 1
 
     matched_op = []
     comparison_index, base_index = comparison_len, base_len
     while comparison_index > 0 and base_index > 0:
-        path_value = dp_path[comparison_index][base_index]
-        if path_value == "D":
-            matched_op.append([base_ops[base_index - 1], comparison_ops[comparison_index - 1]])
+        base_data = base_ops[base_index - 1]
+        comparison_data = comparison_ops[comparison_index - 1]
+        if name_func(base_data) == name_func(comparison_data):
+            matched_op.append([base_data, comparison_data])
             comparison_index -= 1
             base_index -= 1
-        elif path_value == "U":
-            matched_op.append([None, comparison_ops[comparison_index - 1]])
+        elif dp_flag[comparison_index][base_index] == 1:  # 1 for only comparison op
+            matched_op.append([None, comparison_data])
             comparison_index -= 1
         else:
-            matched_op.append([base_ops[base_index - 1], None])
+            matched_op.append([base_data, None])
             base_index -= 1
     while comparison_index > 0:
         matched_op.append([None, comparison_ops[comparison_index - 1]])
@@ -83,4 +88,5 @@ def longest_common_subsequence_matching(base_ops: list, comparison_ops: list, na
     while base_index > 0:
         matched_op.append([base_ops[base_index - 1], None])
         base_index -= 1
+    matched_op.reverse()
     return matched_op
-- 
Gitee

From 331e057ecd1f686bafafe330d6b97d47f7d77a81 Mon Sep 17 00:00:00 2001
From: keith
Date: Mon, 29 Apr 2024 16:58:57 +0800
Subject: [PATCH 60/62] =?UTF-8?q?=E6=9B=BF=E6=8D=A2ptdbg=20=E7=AE=97?=
 =?UTF-8?q?=E5=AD=90=E6=AD=A5=E9=AA=A4prefix=E4=B8=AD=E7=9A=84'=5F'?=
 =?UTF-8?q?=E4=B8=BA'.'=EF=BC=8C=E6=9B=BF=E6=8D=A2=E9=A2=84=E6=A3=80?=
 =?UTF-8?q?=E5=B7=A5=E5=85=B7=E7=AE=97=E5=AD=90=E6=AD=A5=E9=AA=A4name?=
 =?UTF-8?q?=E4=B8=AD'*'=E4=B8=BA'.'=EF=BC=8C=E7=BB=9F=E4=B8=80=E4=BA=8C?=
 =?UTF-8?q?=E8=80=85=E6=A0=BC=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../api_accuracy_checker/dump/dump.py    |  8 ++--
 .../src/python/ptdbg_ascend/dump/dump.py | 37 +++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/debug/accuracy_tools/api_accuracy_checker/dump/dump.py b/debug/accuracy_tools/api_accuracy_checker/dump/dump.py
index d8b317aa28..122c4f9478 100644
--- a/debug/accuracy_tools/api_accuracy_checker/dump/dump.py
+++ b/debug/accuracy_tools/api_accuracy_checker/dump/dump.py
@@ -87,15 +87,17 @@ class DumpConst:
 def pretest_info_dump(name, out_feat, module, phase):
     if not DumpUtil.get_dump_switch():
         return
+    replaced_name = name.replace('*', '.')
+    # Replacing symbols to unify the format of ptdbg and pretest
     if phase == DumpConst.forward:
-        api_info = ForwardAPIInfo(name, module.input_args, module.input_kwargs)
+        api_info = ForwardAPIInfo(replaced_name, module.input_args, module.input_kwargs)
     elif phase == DumpConst.backward:
-        api_info = BackwardAPIInfo(name, out_feat)
+        api_info = BackwardAPIInfo(replaced_name, out_feat)
     else:
        msg = "Unexpected training phase {}.".format(phase)
        print_error_log(msg)
        raise NotImplementedError(msg)
-    print_info_log(f"tools is dumping api: {name}" + " " * 10, end='\r')
+    print_info_log(f"tools is dumping api: {replaced_name}" + " " * 10, end='\r')
     write_api_info_json(api_info)
 
 
diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index 2e49a9743b..5cde702e9c 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -188,8 +188,45 @@ def dump_data(prefix, data_info):
 def thread_dump_data(prefix, data_info):
     DumpUtil.dump_thread_pool.submit(dump_data, prefix, data_info)
 
+def underscore_replacement(prefix) {
+    """
+    Replacing symbols to unify the format of ptdbg and pretest
+    """
+    replaced_prefix = []
+    consecutive_underscore_count = 0
+    three_underscore_time = 0
+
+    for char in input_string:
+        if char == '_':
+            consecutive_underscore_count += 1
+            if consecutive_underscore_count == 2:
+                # Two consecutive underscores, leave them unchanged
+                replaced_prefix.pop('.')
+                replaced_prefix.append('__')
+            elif consecutive_underscore_count == 3:
+                # Three consecutive underscores
+                three_underscore_time += 1
+                replaced_prefix.pop('__')
+                if three_underscore_time % 2 == 1:
+                    # Even index, replace the first underscore
+                    replaced_prefix.append('.__')
+                else:
+                    # Odd index, replace the third underscore
+                    replaced_prefix.append('__.')
+            else:
+                # Single underscore, replace with a period
+                replaced_prefix.append('.')
+        else:
+            # Not an underscore, reset the count
+            consecutive_underscore_count = 0
+            three_underscore_time = 0
+            replaced_prefix.append(char)
+    replaced_prefix = ''.join(replaced_prefix).replace("stack.info", "stack_info")
+    return replaced_prefix
+}
 
 def dump_data_by_rank_count(dump_step, prefix, data_info):
+    prefix = underscore_replacement(prefix)
     print_info_log(f"ptdbg is analyzing rank{rank} api: {prefix}" + " " * 10, end='\r')
     if DumpUtil.is_single_rank and DumpUtil.dump_thread_pool:
         thread_dump_data(prefix, data_info)
-- 
Gitee

From 7845cf457cdc26a0be1d05371aa3616c3fe7498e Mon Sep 17 00:00:00 2001
From: keith
Date: Tue, 30 Apr 2024 17:03:16 +0800
Subject: [PATCH 61/62] Unit Test

---
 debug/accuracy_tools/ptdbg_ascend/test/ut/test_dump.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_dump.py b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_dump.py
index 9673c292ba..f420c141c0 100644
--- a/debug/accuracy_tools/ptdbg_ascend/test/ut/test_dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/test/ut/test_dump.py
@@ -47,3 +47,11 @@ class TestDump(unittest.TestCase):
         result = get_pkl_file_path()
         self.assertEqual(result, "")
 
+    def test_underscore_replacement(self):
+        prefix = "Torch_matmul_605_forward_input.0"
+        replaced_prefix = underscore_replacement(prefix)
+        self.assertEqual(replaced_prefix, "Torch.matmul.605.forward.input.0")
+
+        prefix = "Tensor___getitem___488_forward_stack_info"
+        replaced_prefix = underscore_replacement(prefix)
+        self.assertEqual(replaced_prefix, "Tensor.__getitem__.488.forward.stack_info")
-- 
Gitee

From a6b46ffe7c4351fbec5a776e340adf6466170885 Mon Sep 17 00:00:00 2001
From: keith
Date: Wed, 8 May 2024 11:33:57 +0800
Subject: [PATCH 62/62] =?UTF-8?q?=E7=8E=AF=E5=A2=83=E4=B8=8A=E9=AA=8C?=
 =?UTF-8?q?=E8=AF=81+bugfix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
index 5cde702e9c..8d64a5ddfd 100644
--- a/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
+++ b/debug/accuracy_tools/ptdbg_ascend/src/python/ptdbg_ascend/dump/dump.py
@@ -188,7 +188,7 @@ def dump_data(prefix, data_info):
 def thread_dump_data(prefix, data_info):
     DumpUtil.dump_thread_pool.submit(dump_data, prefix, data_info)
 
-def underscore_replacement(prefix) {
+def underscore_replacement(prefix):
     """
     Replacing symbols to unify the format of ptdbg and pretest
     """
@@ -196,17 +196,17 @@ def underscore_replacement(prefix) {
     consecutive_underscore_count = 0
     three_underscore_time = 0
 
-    for char in input_string:
+    for char in prefix:
         if char == '_':
             consecutive_underscore_count += 1
             if consecutive_underscore_count == 2:
                 # Two consecutive underscores, leave them unchanged
-                replaced_prefix.pop('.')
+                replaced_prefix.pop()
                 replaced_prefix.append('__')
             elif consecutive_underscore_count == 3:
                 # Three consecutive underscores
                 three_underscore_time += 1
-                replaced_prefix.pop('__')
+                replaced_prefix.pop()
                 if three_underscore_time % 2 == 1:
                     # Even index, replace the first underscore
                     replaced_prefix.append('.__')
@@ -219,11 +219,9 @@ def underscore_replacement(prefix) {
         else:
             # Not an underscore, reset the count
             consecutive_underscore_count = 0
-            three_underscore_time = 0
             replaced_prefix.append(char)
     replaced_prefix = ''.join(replaced_prefix).replace("stack.info", "stack_info")
     return replaced_prefix
-}
 
 def dump_data_by_rank_count(dump_step, prefix, data_info):
     prefix = underscore_replacement(prefix)
-- 
Gitee
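
Background note on the DP-table change in PATCH 58/59 (explanatory material only, not part of any commit above): PATCH 58 builds dp and dp_path with "[[0] * (base_len + 1)] * (comparison_len + 1)", and PATCH 59 replaces them with a numpy.zeros matrix plus two rolling rows. The snippet below is a minimal, self-contained sketch of the Python pitfall involved; the variable names and printed values are illustrative only and do not come from this repository.

    # List multiplication copies the *reference* to one inner list, so every
    # "row" of the table is the same object; writing one cell changes them all.
    rows, cols = 3, 4

    aliased = [[0] * cols] * rows
    aliased[0][1] = 7
    print(aliased)      # [[0, 7, 0, 0], [0, 7, 0, 0], [0, 7, 0, 0]]

    # A comprehension (or numpy.zeros, as PATCH 59 uses) creates independent
    # rows, which is what a correct LCS dynamic-programming table requires.
    independent = [[0] * cols for _ in range(rows)]
    independent[0][1] = 7
    print(independent)  # [[0, 7, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]

Keeping only a previous-row/current-row pair (pre_list/cur_list) for the match lengths, as PATCH 59 intends, also reduces that part of the computation to O(base_len) memory; only the backtracking flags in dp_flag still need the full (comparison_len + 1) x (base_len + 1) matrix.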