From b6b865d45fbc56faeee5fc53914651b5e6f5849d Mon Sep 17 00:00:00 2001
From: sunboquan <sbq1998@163.com>
Date: Thu, 6 Jul 2023 19:05:44 +0800
Subject: [PATCH 1/2] modify debug tools directory

---
 debug/tools/__init__.py                     |   0
 debug/tools/cluster_profiling_data_copyl.sh |  76 ----
 debug/tools/config.json                     |  14 -
 debug/tools/distribute_modify_hostname.bash |  49 ---
 debug/tools/torch_op_compare.py             | 447 --------------------
 5 files changed, 586 deletions(-)
 delete mode 100644 debug/tools/__init__.py
 delete mode 100644 debug/tools/cluster_profiling_data_copyl.sh
 delete mode 100644 debug/tools/config.json
 delete mode 100644 debug/tools/distribute_modify_hostname.bash
 delete mode 100644 debug/tools/torch_op_compare.py

diff --git a/debug/tools/__init__.py b/debug/tools/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/debug/tools/cluster_profiling_data_copyl.sh b/debug/tools/cluster_profiling_data_copyl.sh
deleted file mode 100644
index d3d301aea..000000000
--- a/debug/tools/cluster_profiling_data_copyl.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/bin/bash
-# Copyright 2023 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-SSH="ssh -o StrictHostKeyChecking=no"
-SCP="scp -o StrictHostKeyChecking=no"
-
-# Get the node list in the cluster.
-get_cluster_list()
-{
-    local cluster_config=$1
-    cat ${cluster_config} | python3 -c 'import sys,json;[print(node) for node in json.load(sys.stdin)["cluster"].keys()]'
-}
-
-# Get the account number of node.
-get_node_user()
-{
-    local cluster_config=$1
-    local node=$2
-    cat ${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["user"])'
-}
-
-# Get the password of node.
-get_node_passwd()
-{
-    local cluster_config=$1
-    local node=$2
-    cat ${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["passwd"])'
-}
-
-# Get the dir of node.
-get_node_dir()
-{
-    local cluster_config=$1
-    local node=$2
-    cat ${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["dir"])'
-}
-
-# Copy data from remote node to local node.
-rscp_pass()
-{
-    local node="$1"
-    local user="$2"
-    local passwd="$3"
-    local src="$4"
-    local target="$5"
-    sshpass -p "${passwd}" ${SCP} -r "${user}"@"${node}":"${src}" "${target}"
-}
-
-# 指定拷贝使用的json文件
-cluster_account_config_path=$1
-# 指定拷贝的路径
-target_dir=$2
-
-node_list=$(get_cluster_list ${cluster_account_config_path})
-echo "-----begin----"
-
-for node in ${node_list}
-do
-    user=$(get_node_user ${cluster_account_config_path} ${node})
-    passwd=$(get_node_passwd ${cluster_account_config_path} ${node})
-    src_dir=$(get_node_dir ${cluster_account_config_path} ${node})
-    echo "------------------${user}@${node}---------------------"
-    $(rscp_pass ${node} ${user} ${passwd} "${src_dir}" ${target_dir})
-done
diff --git a/debug/tools/config.json b/debug/tools/config.json
deleted file mode 100644
index 04f49e728..000000000
--- a/debug/tools/config.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
-  "cluster": {
-                "10.xxx.xxx.1": {
-                "user": "root",
-                "passwd": "xxx",
-                "dir": "/home/data/test"
-                },
-                "10.xxx.xxx.2": {
-                "user": "root",
-                "passwd": "xxx",
-                "dir": "/home/data/test"
-                }
-              }
-}
\ No newline at end of file
diff --git a/debug/tools/distribute_modify_hostname.bash b/debug/tools/distribute_modify_hostname.bash
deleted file mode 100644
index 3ec6dca3c..000000000
--- a/debug/tools/distribute_modify_hostname.bash
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-SSH="ssh -o StrictHostKeyChecking=no"
-
-# Modify hostname
-change_hostname()
-{
-    local node="$1"
-    local user="$2"
-    local passwd="$3"
-    sshpass -p "${passwd}" ${SSH} ${user}@${node} hostname "${user}-${node}"
-}
-
-# Get node list in the cluster.
-get_cluster_list()
-{
-    local cluster_config=$1
-    cat ${cluster_config} | python3 -c 'import sys,json;[print(node) for node in json.load(sys.stdin)["cluster"].keys()]'
-}
-
-# Get the user from node
-get_node_user()
-{
-    local cluster_config=$1
-    local node=$2
-    cat ${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["user"])'
-}
-
-# Get the password from node
-get_node_passwd()
-{
-    local cluster_config=$1
-    local node=$2
-    cat ${cluster_config} | python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"]['\"${node}\"']["passwd"])'
-}
-
-cluster_account_config_path=$1
-
-node_list=$(get_cluster_list ${cluster_account_config_path})
-echo"-------begin--------"
-
-for node in ${node_list}
-do
-    user=$(get_node_user ${cluster_account_config_path} ${node})
-    passwd=$(get_node_passwd ${cluster_account_config_path} ${node})
-    echo "--------------${user}@${node}----------------"
-    $(change_hostname ${node} ${user} ${passwd})
-    echo "---"
-done
\ No newline at end of file
diff --git a/debug/tools/torch_op_compare.py b/debug/tools/torch_op_compare.py
deleted file mode 100644
index 7d32a03e3..000000000
--- a/debug/tools/torch_op_compare.py
+++ /dev/null
@@ -1,447 +0,0 @@
-import argparse
-import ast
-import copy
-import json
-import os.path
-import time
-from queue import Queue
-import numpy as np
-
-from openpyxl.styles import PatternFill, Font, Border, Side
-from openpyxl.workbook import Workbook
-
-GPU = 0
-NPU = 1
-NA = 'N/A'
-LIMIT_KERNEL = 3
-OP_NAME = 'Operator Name'
-INPUT_SHAPE = 'Input Shape'
-INPUT_TYPE = 'Input Type'
-KERNEL_NAME = 'Kernel Name'
-DEVICE_DUR = 'Device Duration(us)'
-TASK_ID = 'Task Id'
-KERNEL_TYPE = 'Kernel Type'
-DIFF = 'DIFF: (sum(Trace2 Duration)-sum(Trace1 Duration))/sum(Trace1 Duration)'
-OP_NAME_FILTER = 'Operator Name Filter'
-DIFF_FILTER = 'DIFF Filter'
-BASE_TRACE = 'Base Trace'
-COMPARISON_TRACE = 'Comparison Trace'
-BASE_TRACE_TYPE = None
-COMPARISON_TRACE_TYPE = None
-BASE_TYPE = 1
-COMPARISON_TYPE = 2
-GPU_HEADER = [OP_NAME, INPUT_SHAPE, INPUT_TYPE, KERNEL_NAME, DEVICE_DUR]
-NPU_HEADER = [OP_NAME, INPUT_SHAPE, INPUT_TYPE, KERNEL_NAME, TASK_ID, KERNEL_TYPE, DEVICE_DUR]
-FILL_DICT = {
-    BASE_TYPE: PatternFill("solid", fgColor='003366FF'), COMPARISON_TYPE: PatternFill("solid", fgColor='0033CCCC'),
-    DIFF: PatternFill("solid", fgColor='00FF0000'), OP_NAME_FILTER: PatternFill("solid", fgColor='00FFFF00'),
-    DIFF_FILTER: PatternFill("solid", fgColor='00FFFF00')
-}
-COLUMN_WIDTH = {OP_NAME: 50, INPUT_SHAPE: 25, INPUT_TYPE: 25, KERNEL_NAME: 25, DEVICE_DUR: 25,
-                TASK_ID: 20, KERNEL_TYPE: 25, DIFF: 25, OP_NAME_FILTER: 25, DIFF_FILTER: 25}
-BORDER = Border(top=Side(border_style="thin", color='00000000'),
-                left=Side(border_style="thin", color='00000000'),
-                right=Side(border_style="thin", color='00000000'),
-                bottom=Side(border_style="thin", color='00000000'))
-
-
-class TorchOpNode:
-    def __init__(self, event=None, parent_node=None):
-        self._event = event
-        self._parent_node = parent_node
-        self._child_nodes = []
-        self._kernel_list = []
-        self._kernel_num = 0
-
-    @property
-    def start_time(self):
-        return self._event.get("ts", 0)
-
-    @property
-    def end_time(self):
-        return self._event.get("ts", 0) + self._event.get("dur", 0)
-
-    @property
-    def name(self):
-        return str(self._event.get("name", NA))
-
-    @property
-    def input_shape(self):
-        return str(self._event.get("args", {}).get("Input Dims", NA))
-
-    @property
-    def input_type(self):
-        return str(self._event.get("args", {}).get("Input type", NA))
-
-    @property
-    def parent(self):
-        return self._parent_node
-
-    @property
-    def child_nodes(self):
-        return self._child_nodes
-
-    @property
-    def kernel_list(self):
-        return self._kernel_list
-
-    @property
-    def kernel_num(self):
-        return self._kernel_num
-
-    def add_child_node(self, child_node):
-        self._child_nodes.append(child_node)
-
-    def set_kernel_list(self, kernel_list: list):
-        self._kernel_list = kernel_list
-
-    def add_kernel_num(self, kernel_num: int):
-        self._kernel_num += kernel_num
-
-    def is_step_profiler(self) -> bool:
-        return self.name.find("ProfilerStep#") != -1
-
-
-class TreeBuilder:
-    @classmethod
-    def build_tree(cls, event_list: list, flow_kernel_dict: dict) -> TorchOpNode:
-        root_node = TorchOpNode()
-        event_list.sort(key=lambda x: x.get("ts", 0))
-        last_node = root_node
-        for event in event_list:
-            kernel_list = flow_kernel_dict.get(event.get("ts", 0), [])
-            while last_node:
-                if last_node == root_node or event.get("ts", 0) < last_node.end_time:
-                    tree_node = TorchOpNode(event, last_node)
-                    last_node.add_child_node(tree_node)
-                    if kernel_list:
-                        tree_node.set_kernel_list(kernel_list)
-                    last_node = tree_node
-                    break
-                last_node = last_node.parent
-        return root_node
-
-    @classmethod
-    def mark_kernel_num(cls, root_node: TorchOpNode, flow_kernel_dict: dict):
-        for ts, kernel_list in flow_kernel_dict.items():
-            curr_node = root_node
-            while curr_node.child_nodes:
-                for node in curr_node.child_nodes:
-                    if node.start_time <= ts <= node.end_time:
-                        node.add_kernel_num(len(kernel_list))
-                        curr_node = node
-                        break
-
-    @classmethod
-    def get_total_kernels(cls, root_node: TorchOpNode) -> list:
-        result_list = []
-        node_queue = Queue()
-        for child_node in root_node.child_nodes:
-            node_queue.put(child_node)
-        while not node_queue.empty():
-            tree_node = node_queue.get()
-            result_list.extend(tree_node.kernel_list)
-            for child_node in tree_node.child_nodes:
-                node_queue.put(child_node)
-        return result_list
-
-
-def read_json_file(file_path: str, trace_type: int) -> any:
-    event_list = []
-    flow_kernel_dict = {}
-    if not os.path.isfile(file_path):
-        raise RuntimeError(f"File not exists: {file_path}")
-    try:
-        with open(file_path, "rt") as file:
-            json_data = json.loads(file.read())
-    except Exception:
-        raise RuntimeError(f"Can't read file: {file_path}")
-    flow_start_dict, flow_end_dict, event_dict = {}, {}, {}
-    flow_cat = ("async_gpu", "ac2g", "async_npu")
-    if trace_type == BASE_TYPE:
-        global BASE_TRACE_TYPE
-        BASE_TRACE_TYPE = GPU if isinstance(json_data, dict) else NPU
-        _type = BASE_TRACE_TYPE
-    else:
-        global COMPARISON_TRACE_TYPE
-        COMPARISON_TRACE_TYPE = GPU if isinstance(json_data, dict) else NPU
-        _type = COMPARISON_TRACE_TYPE
-    total_events = json_data.get("traceEvents", []) if _type == GPU else json_data
-    for event in total_events:
-        if event.get("cat") == "cpu_op" or event.get("cat") in ("Runtime", "cuda_runtime"):
-            event_list.append(event)
-        elif event.get("cat") in flow_cat and event.get("ph") == "s":
-            flow_start_dict[event.get("id")] = event
-        elif event.get("cat") in flow_cat and event.get("ph") == "f":
-            flow_end_dict[event.get("id")] = event
-        elif _type == GPU and event.get("cat", "").capitalize() == "Kernel".capitalize():
-            event_dict["{}-{}-{}".format(event.get("pid"), event.get("tid"), event.get("ts"))] = event
-        elif _type == NPU and event.get("ph") != "f":
-            event_dict["{}-{}-{}".format(event.get("pid"), event.get("tid"), event.get("ts"))] = event
-
-    for flow_id, start_flow in flow_start_dict.items():
-        end_flow = flow_end_dict.get(flow_id)
-        if end_flow is None:
-            continue
-        kernel_event = event_dict.get("{}-{}-{}".format(end_flow.get("pid"), end_flow.get("tid"), end_flow.get("ts")))
-        if kernel_event is None:
-            continue
-        flow_kernel_dict.setdefault(start_flow.get("ts"), []).append(kernel_event)
-    return event_list, flow_kernel_dict
-
-
-def get_top_layer_apis(file_path: str, trace_type: int, max_kernel_num: int) -> any:
-    event_list, flow_kernel_dict = read_json_file(file_path, trace_type)
-    root_node = TreeBuilder.build_tree(event_list, flow_kernel_dict)
-    if max_kernel_num is not None:
-        TreeBuilder.mark_kernel_num(root_node, flow_kernel_dict)
-    level1_child_nodes = root_node.child_nodes
-    if not level1_child_nodes:
-        raise RuntimeError(f"Can't find any torch op in the file: {file_path}")
-    result_data = []
-    for level1_node in level1_child_nodes:
-        if level1_node.is_step_profiler():
-            result_data.extend(level1_node.child_nodes)
-        else:
-            result_data.append(level1_node)
-    return result_data
-
-
-def compare(base_top_layer_apis: list, comparison_top_layer_apis: list, op_name_map: dict) -> list:
-    result_data = []
-    comparison_len, base_len = len(comparison_top_layer_apis), len(base_top_layer_apis)
-    dp = [[0] * (base_len + 1) for _ in range(comparison_len + 1)]
-    for comparison_index in range(1, comparison_len + 1):
-        for base_index in range(1, base_len + 1):
-            base_name = base_top_layer_apis[base_index - 1].name
-            comparison_name = comparison_top_layer_apis[comparison_index - 1].name
-            if op_name_map.get(comparison_name, comparison_name) == op_name_map.get(base_name, base_name):
-                dp[comparison_index][base_index] = dp[comparison_index - 1][base_index - 1] + 1
-            else:
-                dp[comparison_index][base_index] = max(dp[comparison_index][base_index - 1],
-                                                       dp[comparison_index - 1][base_index])
-    matched_op = []
-    comparison_index, base_index = comparison_len, base_len
-    while comparison_index > 0 and base_index > 0:
-        base_name = base_top_layer_apis[base_index - 1].name
-        comparison_name = comparison_top_layer_apis[comparison_index - 1].name
-        if op_name_map.get(comparison_name, comparison_name) == op_name_map.get(base_name, base_name):
-            matched_op.append([comparison_index - 1, base_index - 1])
-            comparison_index -= 1
-            base_index -= 1
-            continue
-        if dp[comparison_index][base_index - 1] > dp[comparison_index - 1][base_index]:
-            base_index -= 1
-        else:
-            comparison_index -= 1
-    if not matched_op:
-        matched_base_index_list = []
-    else:
-        matched_op.reverse()
-        matched_op = np.array(matched_op)
-        matched_base_index_list = list(matched_op[:, 1])
-    curr_comparison_index = 0
-    for base_index, base_api_node in enumerate(base_top_layer_apis):
-        if base_index not in matched_base_index_list:
-            result_data.append([base_api_node, None])
-            continue
-        matched_comparison_index = matched_op[matched_base_index_list.index(base_index), 0]
-        for comparison_index in range(curr_comparison_index, matched_comparison_index):
-            result_data.append([None, comparison_top_layer_apis[comparison_index]])
-        result_data.append([base_api_node, comparison_top_layer_apis[matched_comparison_index]])
-        curr_comparison_index = matched_comparison_index + 1
-    if curr_comparison_index < len(comparison_top_layer_apis):
-        for comparison_index in range(curr_comparison_index, len(comparison_top_layer_apis)):
-            result_data.append([None, comparison_top_layer_apis[comparison_index]])
-    return result_data
-
-
-def create_data(base_api_node: TorchOpNode, comparison_api_node: TorchOpNode) -> list:
-    result_data = []
-    base_kernel_list = TreeBuilder.get_total_kernels(base_api_node) if base_api_node else []
-    comparison_kernel_list = TreeBuilder.get_total_kernels(comparison_api_node) if comparison_api_node else []
-    if not base_kernel_list or not comparison_kernel_list:
-        diff = NA
-    else:
-        base_total_dur = sum([kernel.get("dur", 0) for kernel in base_kernel_list])
-        comparison_total_dur = sum([kernel.get("dur", 0) for kernel in comparison_kernel_list])
-        diff = (comparison_total_dur - base_total_dur) / base_total_dur
-    op_name = base_api_node.name if base_api_node else comparison_api_node.name
-    base_kernel_num, comparison_kernel_num = len(base_kernel_list), len(comparison_kernel_list)
-    base_data = [NA] * len(GPU_HEADER) if BASE_TRACE_TYPE == GPU else [NA] * len(NPU_HEADER)
-    if base_api_node:
-        base_data[0] = base_api_node.name
-        base_data[1] = base_api_node.input_shape
-        base_data[2] = base_api_node.input_type
-    comparison_data = [NA] * len(GPU_HEADER) if COMPARISON_TRACE_TYPE == GPU else [NA] * len(NPU_HEADER)
-    if comparison_api_node:
-        comparison_data[0] = comparison_api_node.name
-        comparison_data[1] = comparison_api_node.input_shape
-        comparison_data[2] = comparison_api_node.input_type
-    if base_kernel_num == 0 and comparison_kernel_num == 0:
-        data = base_data + comparison_data + [diff, op_name]
-        result_data.append(data)
-        return result_data
-    for index in range(max(base_kernel_num, comparison_kernel_num)):
-        base_row_data, comparison_row_data = copy.deepcopy(base_data), copy.deepcopy(comparison_data)
-        if index < base_kernel_num:
-            base_kernel = base_kernel_list[index]
-            if BASE_TRACE_TYPE == GPU:
-                base_row_data[3] = base_kernel.get("name")
-                base_row_data[4] = base_kernel.get("dur")
-            else:
-                base_row_data[3] = base_kernel.get("name")
-                base_row_data[4] = base_kernel.get("args", {}).get("Task Id")
-                base_row_data[5] = base_kernel.get("args", {}).get("Task Type")
-                base_row_data[6] = base_kernel.get("dur")
-        if index < comparison_kernel_num:
-            comparison_kernel = comparison_kernel_list[index]
-            if COMPARISON_TRACE_TYPE == GPU:
-                comparison_row_data[3] = comparison_kernel.get("name")
-                comparison_row_data[4] = comparison_kernel.get("dur")
-            else:
-                comparison_row_data[3] = comparison_kernel.get("name")
-                comparison_row_data[4] = comparison_kernel.get("args", {}).get("Task Id")
-                comparison_row_data[5] = comparison_kernel.get("args", {}).get("Task Type")
-                comparison_row_data[6] = comparison_kernel.get("dur")
-        data = base_row_data + comparison_row_data + [diff, op_name]
-        result_data.append(data)
-    return result_data
-
-
-def drill_down(compare_result_data: list, max_kernel_num: int, op_name_map: dict) -> list:
-    result_data = []
-    for data in compare_result_data:
-        base_api = data[0] if data[0] else TorchOpNode()
-        comparison_api = data[1] if data[1] else TorchOpNode()
-        if max(base_api.kernel_num, comparison_api.kernel_num) <= max_kernel_num:
-            result_data.append(data)
-            continue
-        result_data.extend(compare(base_api.child_nodes, comparison_api.child_nodes, op_name_map))
-    return result_data
-
-
-def have_to_drill_down(compare_result_data: list, max_kernel_num: int) -> bool:
-    for data in compare_result_data:
-        base_api = data[0] if data[0] else TorchOpNode()
-        comparison_api = data[1] if data[1] else TorchOpNode()
-        if max(base_api.kernel_num, comparison_api.kernel_num) > max_kernel_num:
-            return True
-    return False
-
-
-def main():
-    global BASE_TRACE, COMPARISON_TRACE
-    parser = argparse.ArgumentParser(description="Compare trace of GPU and NPU")
-    parser.add_argument("base_trace_path", help="base trace file path")
-    parser.add_argument("comparison_trace_path", help="comparison trace file path")
-    parser.add_argument("--output_path", help="性能数据比对结果的存放路径")
-    parser.add_argument("--max_kernel_num", type=int, help="每个torch op的kernel数量限制")
-    parser.add_argument("--op_name_map", type=ast.literal_eval, default={},
-                        help="配置GPU OP与NPU OP等价的名称映射关系，以字典的形式传入")
-    args = parser.parse_args()
-    if args.max_kernel_num is not None and args.max_kernel_num <= LIMIT_KERNEL:
-        raise RuntimeError(f"Invalid param, --max_kernel_num has to be greater than {LIMIT_KERNEL}")
-    if not isinstance(args.op_name_map, dict):
-        raise RuntimeError("Invalid param, --op_name_map must be dict, for example: --op_name_map={'name1':'name2'}")
-    base_top_layer_apis = get_top_layer_apis(args.base_trace_path, BASE_TYPE, args.max_kernel_num)
-    if BASE_TRACE_TYPE == GPU:
-        BASE_TRACE += ' [GPU] : ' + os.path.basename(args.base_trace_path)
-    else:
-        BASE_TRACE += ' [NPU] : ' + os.path.basename(args.base_trace_path)
-    comparison_top_layer_apis = get_top_layer_apis(args.comparison_trace_path, COMPARISON_TYPE, args.max_kernel_num)
-    if COMPARISON_TRACE_TYPE == GPU:
-        COMPARISON_TRACE += ' [GPU] : ' + os.path.basename(args.comparison_trace_path)
-    else:
-        COMPARISON_TRACE += ' [NPU] : ' + os.path.basename(args.comparison_trace_path)
-    compare_result_data = compare(base_top_layer_apis, comparison_top_layer_apis, args.op_name_map)
-
-    if args.max_kernel_num is not None:
-        while have_to_drill_down(compare_result_data, args.max_kernel_num):
-            compare_result_data = drill_down(compare_result_data, args.max_kernel_num, args.op_name_map)
-
-    dir_path = args.output_path if args.output_path else "./"
-    file_name = "torch_op_compare_{}.xlsx".format(time.strftime("%Y%m%d%H%M%S", time.localtime(time.time())))
-    result_file_path = os.path.join(dir_path, file_name)
-
-    wb = Workbook()
-    ws = wb.create_sheet("CompareResult", 0)
-    ws.sheet_properties.tabColor = '00CED1'
-    # write headers
-    base_trace_headers = GPU_HEADER if BASE_TRACE_TYPE == GPU else NPU_HEADER
-    comparison_trace_headers = GPU_HEADER if COMPARISON_TRACE_TYPE == GPU else NPU_HEADER
-    headers = base_trace_headers + comparison_trace_headers + [DIFF, OP_NAME_FILTER, DIFF_FILTER]
-    base_trace_start_column = 0
-    comparison_trace_start_column = len(base_trace_headers)
-    diff_start_column = len(base_trace_headers) + len(comparison_trace_headers)
-
-    for col_index in range(len(headers)):
-        ws.cell(row=1, column=col_index + 1).border = BORDER
-        ws.cell(row=1, column=col_index + 1).font = Font(name='Arial', bold=True)
-        ws.cell(row=2, column=col_index + 1).border = BORDER
-        ws.cell(row=2, column=col_index + 1).font = Font(name='Arial', bold=True)
-        header_name = headers[col_index]
-        if col_index < comparison_trace_start_column:
-            ws.cell(row=1, column=col_index + 1).value = BASE_TRACE
-            ws.cell(row=1, column=col_index + 1).fill = FILL_DICT.get(BASE_TYPE)
-            ws.cell(row=2, column=col_index + 1).fill = FILL_DICT.get(BASE_TYPE)
-        elif col_index < diff_start_column:
-            ws.cell(row=1, column=col_index + 1).value = COMPARISON_TRACE
-            ws.cell(row=1, column=col_index + 1).fill = FILL_DICT.get(COMPARISON_TYPE)
-            ws.cell(row=2, column=col_index + 1).fill = FILL_DICT.get(COMPARISON_TYPE)
-        else:
-            ws.cell(row=1, column=col_index + 1).value = header_name
-            ws.cell(row=1, column=col_index + 1).fill = FILL_DICT.get(header_name)
-        ws.cell(row=2, column=col_index + 1).value = header_name
-        dim = ws.cell(row=2, column=col_index + 1).coordinate
-        ws.column_dimensions[dim[0]].width = COLUMN_WIDTH.get(header_name)
-    ws.merge_cells(start_row=1, start_column=base_trace_start_column + 1,
-                   end_row=1, end_column=comparison_trace_start_column)
-    ws.merge_cells(start_row=1, start_column=comparison_trace_start_column + 1,
-                   end_row=1, end_column=diff_start_column)
-    ws.merge_cells(start_row=1, start_column=headers.index(DIFF) + 1,
-                   end_row=2, end_column=headers.index(DIFF) + 1)
-    ws.merge_cells(start_row=1, start_column=headers.index(OP_NAME_FILTER) + 1,
-                   end_row=2, end_column=headers.index(OP_NAME_FILTER) + 1)
-    ws.merge_cells(start_row=1, start_column=headers.index(DIFF_FILTER) + 1,
-                   end_row=2, end_column=headers.index(DIFF_FILTER) + 1)
-
-    # write lines
-    start_row_index = 3
-    for data in compare_result_data:
-        rows = create_data(data[0], data[1])
-        row_number = 0
-        for row in rows:
-            row_index = start_row_index + row_number
-            ws.cell(row=row_index, column=len(row) + 1).border = BORDER
-            for index, value in enumerate(row):
-                if index == headers.index(DIFF):
-                    ws.cell(row=row_index, column=index + 1).number_format = '0.00%'
-                    if value != NA and value < 0:
-                        ws.cell(row=row_index, column=index + 1).fill = PatternFill("solid", fgColor='0000FF00')
-                        ws.cell(row=row_index, column=index + 3).fill = PatternFill("solid", fgColor='0000FF00')
-                    if value != NA and value >= 0:
-                        ws.cell(row=row_index, column=index + 1).fill = PatternFill("solid", fgColor='00FF0000')
-                        ws.cell(row=row_index, column=index + 3).fill = PatternFill("solid", fgColor='00FF0000')
-                if index in [key for key, value in enumerate(headers) if value == OP_NAME]:
-                    ws.cell(row=row_index, column=index + 1).font = Font(name='Arial', bold=True)
-                else:
-                    ws.cell(row=row_index, column=index + 1).font = Font(name='Arial')
-                ws.cell(row=row_index, column=index + 1).value = value
-                ws.cell(row=row_index, column=index + 1).border = BORDER
-            row_number += 1
-        if row_number > 1:
-            # 合并单元格
-            merged_index = set(
-                [key for key, value in enumerate(headers) if value in (OP_NAME, INPUT_SHAPE, INPUT_TYPE, DIFF)])
-            for col_index in merged_index:
-                ws.merge_cells(start_row=start_row_index, start_column=col_index + 1,
-                               end_row=start_row_index + row_number - 1, end_column=col_index + 1)
-        start_row_index = start_row_index + row_number
-
-    wb.save(result_file_path)
-    wb.close()
-
-
-if __name__ == "__main__":
-    main()
-- 
Gitee


From 1b345e13d399bcd87bba995ccc50247f9ae8b3b8 Mon Sep 17 00:00:00 2001
From: sunboquan <sbq1998@163.com>
Date: Thu, 6 Jul 2023 19:09:03 +0800
Subject: [PATCH 2/2] modify debug tools directory

---
 debug/tools/compare_tools/__init__.py         |   0
 debug/tools/compare_tools/torch_op_compare.py | 447 ++++++++++++++++++
 .../cluster_profiling_data_copyl.sh           |  76 +++
 debug/tools/distribute_tools/config.json      |  14 +
 .../distribute_modify_hostname.bash           |  49 ++
 5 files changed, 586 insertions(+)
 create mode 100644 debug/tools/compare_tools/__init__.py
 create mode 100644 debug/tools/compare_tools/torch_op_compare.py
 create mode 100644 debug/tools/distribute_tools/cluster_profiling_data_copyl.sh
 create mode 100644 debug/tools/distribute_tools/config.json
 create mode 100644 debug/tools/distribute_tools/distribute_modify_hostname.bash

diff --git a/debug/tools/compare_tools/__init__.py b/debug/tools/compare_tools/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/debug/tools/compare_tools/torch_op_compare.py b/debug/tools/compare_tools/torch_op_compare.py
new file mode 100644
index 000000000..7d32a03e3
--- /dev/null
+++ b/debug/tools/compare_tools/torch_op_compare.py
@@ -0,0 +1,447 @@
+import argparse
+import ast
+import copy
+import json
+import os.path
+import time
+from queue import Queue
+import numpy as np
+
+from openpyxl.styles import PatternFill, Font, Border, Side
+from openpyxl.workbook import Workbook
+
+GPU = 0  # trace type chosen by read_json_file() when the JSON root is a dict
+NPU = 1  # trace type chosen by read_json_file() for any other JSON root
+NA = 'N/A'  # placeholder written where a value is unavailable
+LIMIT_KERNEL = 3  # --max_kernel_num must be strictly greater than this
+OP_NAME = 'Operator Name'  # column header texts (also used as lookup keys below)
+INPUT_SHAPE = 'Input Shape'
+INPUT_TYPE = 'Input Type'
+KERNEL_NAME = 'Kernel Name'
+DEVICE_DUR = 'Device Duration(us)'
+TASK_ID = 'Task Id'
+KERNEL_TYPE = 'Kernel Type'
+DIFF = 'DIFF: (sum(Trace2 Duration)-sum(Trace1 Duration))/sum(Trace1 Duration)'
+OP_NAME_FILTER = 'Operator Name Filter'
+DIFF_FILTER = 'DIFF Filter'
+BASE_TRACE = 'Base Trace'  # section title; main() appends the trace type and file name
+COMPARISON_TRACE = 'Comparison Trace'  # section title; main() appends the trace type and file name
+BASE_TRACE_TYPE = None  # GPU or NPU, assigned by read_json_file()
+COMPARISON_TRACE_TYPE = None  # GPU or NPU, assigned by read_json_file()
+BASE_TYPE = 1  # trace_type argument identifying the base trace
+COMPARISON_TYPE = 2  # trace_type argument identifying the comparison trace
+GPU_HEADER = [OP_NAME, INPUT_SHAPE, INPUT_TYPE, KERNEL_NAME, DEVICE_DUR]  # columns of a GPU trace
+NPU_HEADER = [OP_NAME, INPUT_SHAPE, INPUT_TYPE, KERNEL_NAME, TASK_ID, KERNEL_TYPE, DEVICE_DUR]
+FILL_DICT = {  # header cell fills, keyed by trace side or by column header text
+    BASE_TYPE: PatternFill("solid", fgColor='003366FF'), COMPARISON_TYPE: PatternFill("solid", fgColor='0033CCCC'),
+    DIFF: PatternFill("solid", fgColor='00FF0000'), OP_NAME_FILTER: PatternFill("solid", fgColor='00FFFF00'),
+    DIFF_FILTER: PatternFill("solid", fgColor='00FFFF00')
+}
+COLUMN_WIDTH = {OP_NAME: 50, INPUT_SHAPE: 25, INPUT_TYPE: 25, KERNEL_NAME: 25, DEVICE_DUR: 25,
+                TASK_ID: 20, KERNEL_TYPE: 25, DIFF: 25, OP_NAME_FILTER: 25, DIFF_FILTER: 25}  # width per header
+BORDER = Border(top=Side(border_style="thin", color='00000000'),  # thin border on all four sides
+                left=Side(border_style="thin", color='00000000'),
+                right=Side(border_style="thin", color='00000000'),
+                bottom=Side(border_style="thin", color='00000000'))
+
+
+class TorchOpNode:
+    """One node of the torch-op call tree; a node built without an event acts as root/placeholder."""
+    def __init__(self, event=None, parent_node=None):
+        self._event = event if event is not None else {}  # keeps every property usable on event-less nodes
+        self._parent_node = parent_node
+        self._child_nodes, self._kernel_list = [], []
+        self._kernel_num = 0
+
+    @property
+    def start_time(self):  # value of the event's "ts" field, 0 when absent
+        return self._event.get("ts", 0)
+
+    @property
+    def end_time(self):  # "ts" + "dur", each defaulting to 0
+        return self._event.get("ts", 0) + self._event.get("dur", 0)
+
+    @property
+    def name(self):  # event "name" as a string, NA when absent
+        return str(self._event.get("name", NA))
+
+    @property
+    def input_shape(self):  # args["Input Dims"] as a string, NA when absent
+        return str(self._event.get("args", {}).get("Input Dims", NA))
+
+    @property
+    def input_type(self):  # args["Input type"] as a string, NA when absent
+        return str(self._event.get("args", {}).get("Input type", NA))
+
+    @property
+    def parent(self):  # enclosing node, None for a node created without a parent
+        return self._parent_node
+
+    @property
+    def child_nodes(self):  # direct children in insertion order
+        return self._child_nodes
+
+    @property
+    def kernel_list(self):  # kernel events attached via set_kernel_list()
+        return self._kernel_list
+
+    @property
+    def kernel_num(self):  # running total accumulated via add_kernel_num()
+        return self._kernel_num
+
+    def add_child_node(self, child_node):
+        self._child_nodes.append(child_node)
+
+    def set_kernel_list(self, kernel_list: list):
+        self._kernel_list = kernel_list
+
+    def add_kernel_num(self, kernel_num: int):
+        self._kernel_num += kernel_num
+
+    def is_step_profiler(self) -> bool:
+        return "ProfilerStep#" in self.name
+
+
+class TreeBuilder:
+    """Builds the TorchOpNode tree of one trace and answers kernel queries on it."""
+    @classmethod
+    def build_tree(cls, event_list: list, flow_kernel_dict: dict) -> TorchOpNode:
+        """Sort event_list in place by "ts" and nest each event under the innermost still-open node."""
+        root_node = last_node = TorchOpNode()
+        event_list.sort(key=lambda x: x.get("ts", 0))
+        for event in event_list:
+            ts = event.get("ts", 0)
+            while last_node is not root_node and ts >= last_node.end_time:  # climb to an open ancestor
+                last_node = last_node.parent
+            tree_node = TorchOpNode(event, last_node)
+            last_node.add_child_node(tree_node)
+            tree_node.set_kernel_list(flow_kernel_dict.get(ts, []))
+            last_node = tree_node
+        return root_node
+
+    @classmethod
+    def mark_kernel_num(cls, root_node: TorchOpNode, flow_kernel_dict: dict):
+        """Add each flow's kernel count to the nodes, top-down, whose time span contains the flow ts."""
+        for ts, kernel_list in flow_kernel_dict.items():
+            curr_node = root_node
+            while curr_node.child_nodes:
+                for node in curr_node.child_nodes:
+                    if node.start_time <= ts <= node.end_time:
+                        node.add_kernel_num(len(kernel_list))
+                        curr_node = node
+                        break
+                else:
+                    break  # no child spans ts: stop instead of re-scanning the same node forever
+
+    @classmethod
+    def get_total_kernels(cls, root_node: TorchOpNode) -> list:
+        """Return the kernels of all descendants of root_node (not its own) in breadth-first order."""
+        result_list = []
+        node_queue = Queue()
+        node_queue.put(root_node)
+        while not node_queue.empty():
+            for child_node in node_queue.get().child_nodes:
+                result_list.extend(child_node.kernel_list)
+                node_queue.put(child_node)
+        return result_list
+
+
+def read_json_file(file_path: str, trace_type: int) -> tuple:
+    """Load a trace JSON file and return (event_list, flow_kernel_dict).
+
+    Stores the detected trace type in a module global; raises RuntimeError if the file is missing or unreadable.
+    """
+    global BASE_TRACE_TYPE, COMPARISON_TRACE_TYPE
+    if not os.path.isfile(file_path):
+        raise RuntimeError(f"File not exists: {file_path}")
+    try:
+        with open(file_path, "rt", encoding="utf-8") as file:
+            json_data = json.load(file)
+    except Exception as err:
+        raise RuntimeError(f"Can't read file: {file_path}") from err  # keep the root cause in the traceback
+    _type = GPU if isinstance(json_data, dict) else NPU  # dict root => GPU trace, anything else => NPU
+    if trace_type == BASE_TYPE:
+        BASE_TRACE_TYPE = _type
+    else:
+        COMPARISON_TRACE_TYPE = _type
+    event_list, flow_kernel_dict = [], {}
+    flow_start_dict, flow_end_dict, event_dict = {}, {}, {}
+    flow_cat = ("async_gpu", "ac2g", "async_npu")
+    total_events = json_data.get("traceEvents", []) if _type == GPU else json_data
+    for event in total_events:
+        if event.get("cat") in ("cpu_op", "Runtime", "cuda_runtime"):
+            event_list.append(event)
+        elif event.get("cat") in flow_cat and event.get("ph") == "s":
+            flow_start_dict[event.get("id")] = event
+        elif event.get("cat") in flow_cat and event.get("ph") == "f":
+            flow_end_dict[event.get("id")] = event
+        elif _type == GPU and event.get("cat", "").capitalize() == "Kernel".capitalize():
+            event_dict["{}-{}-{}".format(event.get("pid"), event.get("tid"), event.get("ts"))] = event
+        elif _type == NPU and event.get("ph") != "f":
+            event_dict["{}-{}-{}".format(event.get("pid"), event.get("tid"), event.get("ts"))] = event
+    # Attach to each flow-start "ts" the device event found at the matching flow end (same pid/tid/ts).
+    for flow_id, start_flow in flow_start_dict.items():
+        end_flow = flow_end_dict.get(flow_id)
+        if end_flow is None:
+            continue
+        kernel_event = event_dict.get("{}-{}-{}".format(end_flow.get("pid"), end_flow.get("tid"), end_flow.get("ts")))
+        if kernel_event is not None:
+            flow_kernel_dict.setdefault(start_flow.get("ts"), []).append(kernel_event)
+    return event_list, flow_kernel_dict
+
+
+def get_top_layer_apis(file_path: str, trace_type: int, max_kernel_num: int) -> any:
+    """Parse a trace file and return its top-layer op nodes.
+
+    Level-1 "ProfilerStep#" nodes are replaced by their direct children.
+    """
+    events, flow_kernels = read_json_file(file_path, trace_type)
+    tree_root = TreeBuilder.build_tree(events, flow_kernels)
+    if max_kernel_num is not None:
+        TreeBuilder.mark_kernel_num(tree_root, flow_kernels)
+    if not tree_root.child_nodes:
+        raise RuntimeError(f"Can't find any torch op in the file: {file_path}")
+    top_nodes = []
+    for node in tree_root.child_nodes:
+        top_nodes.extend(node.child_nodes if node.is_step_profiler() else [node])
+    return top_nodes
+
+
+def compare(base_top_layer_apis: list, comparison_top_layer_apis: list, op_name_map: dict) -> list:
+    """Align two op sequences on the longest common subsequence of their mapped names.
+
+    Args:
+        base_top_layer_apis / comparison_top_layer_apis: ordered op nodes of the two traces.
+        op_name_map: names are translated through this dict before comparison (unmapped names as-is).
+
+    Returns:
+        [base_node, comparison_node] pairs in order; an op without a partner is paired with None.
+    """
+    base_names = [op_name_map.get(node.name, node.name) for node in base_top_layer_apis]
+    comparison_names = [op_name_map.get(node.name, node.name) for node in comparison_top_layer_apis]
+    base_len, comparison_len = len(base_names), len(comparison_names)
+    # dp[i][j] is the LCS length of the first i comparison ops and the first j base ops.
+    dp = [[0] * (base_len + 1) for _ in range(comparison_len + 1)]
+    for comparison_index in range(1, comparison_len + 1):
+        for base_index in range(1, base_len + 1):
+            if comparison_names[comparison_index - 1] == base_names[base_index - 1]:
+                dp[comparison_index][base_index] = dp[comparison_index - 1][base_index - 1] + 1
+            else:
+                dp[comparison_index][base_index] = max(dp[comparison_index][base_index - 1],
+                                                       dp[comparison_index - 1][base_index])
+    # Backtrack from the bottom-right corner, recording base index -> matched comparison index.
+    matched = {}
+    comparison_index, base_index = comparison_len, base_len
+    while comparison_index > 0 and base_index > 0:
+        if comparison_names[comparison_index - 1] == base_names[base_index - 1]:
+            matched[base_index - 1] = comparison_index - 1
+            comparison_index -= 1
+            base_index -= 1
+        elif dp[comparison_index][base_index - 1] > dp[comparison_index - 1][base_index]:
+            base_index -= 1
+        else:
+            comparison_index -= 1
+    result_data = []
+    curr_comparison_index = 0
+    for base_index, base_api_node in enumerate(base_top_layer_apis):
+        matched_comparison_index = matched.get(base_index)
+        if matched_comparison_index is None:
+            result_data.append([base_api_node, None])
+            continue
+        for comparison_index in range(curr_comparison_index, matched_comparison_index):
+            result_data.append([None, comparison_top_layer_apis[comparison_index]])
+        result_data.append([base_api_node, comparison_top_layer_apis[matched_comparison_index]])
+        curr_comparison_index = matched_comparison_index + 1
+    result_data.extend([None, node] for node in comparison_top_layer_apis[curr_comparison_index:])
+    return result_data
+
+
+def create_data(base_api_node: TorchOpNode, comparison_api_node: TorchOpNode) -> list:
+    """Build the spreadsheet rows for one aligned pair of ops.
+
+    Args:
+        base_api_node: op of the base trace, or None if the op exists only in the comparison trace.
+        comparison_api_node: op of the comparison trace, or None if it exists only in the base trace.
+
+    Returns:
+        Rows of <base columns> + <comparison columns> + [diff, op_name], one per kernel index up to
+        the larger kernel count (a single row when neither side has kernels). diff is the relative
+        change of the summed kernel "dur", or NA if either side has no kernels or the base sum is 0.
+    """
+    base_kernel_list = TreeBuilder.get_total_kernels(base_api_node) if base_api_node else []
+    comparison_kernel_list = TreeBuilder.get_total_kernels(comparison_api_node) if comparison_api_node else []
+    diff = NA
+    if base_kernel_list and comparison_kernel_list:
+        base_total_dur = sum(kernel.get("dur", 0) for kernel in base_kernel_list)
+        comparison_total_dur = sum(kernel.get("dur", 0) for kernel in comparison_kernel_list)
+        if base_total_dur:  # a zero base total used to raise ZeroDivisionError; report NA instead
+            diff = (comparison_total_dur - base_total_dur) / base_total_dur
+    op_name = base_api_node.name if base_api_node else comparison_api_node.name
+    # Per side: the op-level columns (name, input shape, input type), its kernels and its trace type.
+    sides = []
+    for api_node, kernel_list, trace_type in (
+            (base_api_node, base_kernel_list, BASE_TRACE_TYPE),
+            (comparison_api_node, comparison_kernel_list, COMPARISON_TRACE_TYPE)):
+        side_data = [NA] * len(GPU_HEADER if trace_type == GPU else NPU_HEADER)
+        if api_node:
+            side_data[0] = api_node.name
+            side_data[1] = api_node.input_shape
+            side_data[2] = api_node.input_type
+        sides.append((side_data, kernel_list, trace_type))
+    result_data = []
+    row_count = max(len(base_kernel_list), len(comparison_kernel_list), 1)  # always emit at least one row
+    for index in range(row_count):
+        data = []
+        for side_data, kernel_list, trace_type in sides:
+            row_data = list(side_data)
+            if index < len(kernel_list):
+                kernel = kernel_list[index]
+                # GPU columns: name, duration. NPU columns: name, task id, task type, duration.
+                row_data[3] = kernel.get("name")
+                if trace_type == GPU:
+                    row_data[4] = kernel.get("dur")
+                else:
+                    row_data[4] = kernel.get("args", {}).get("Task Id")
+                    row_data[5] = kernel.get("args", {}).get("Task Type")
+                    row_data[6] = kernel.get("dur")
+            data.extend(row_data)
+        result_data.append(data + [diff, op_name])
+    return result_data
+
+
+def drill_down(compare_result_data: list, max_kernel_num: int, op_name_map: dict) -> list:
+    """Replace each pair whose kernel_num exceeds max_kernel_num by the aligned pairs of its child ops."""
+    expanded = []
+    for pair in compare_result_data:
+        base_api, comparison_api = (node or TorchOpNode() for node in pair[:2])
+        if max(base_api.kernel_num, comparison_api.kernel_num) > max_kernel_num:
+            expanded.extend(compare(base_api.child_nodes, comparison_api.child_nodes, op_name_map))
+        else:
+            expanded.append(pair)
+    return expanded
+
+
+def have_to_drill_down(compare_result_data: list, max_kernel_num: int) -> bool:
+    """Tell whether any pair still holds an op whose kernel_num exceeds max_kernel_num."""
+    placeholder = TorchOpNode()  # stands in for the missing side of an unmatched pair (kernel_num is 0)
+    return any(
+        max((pair[0] or placeholder).kernel_num, (pair[1] or placeholder).kernel_num) > max_kernel_num
+        for pair in compare_result_data
+    )
+
+
+def main():  # CLI entry point: compare two traces and write the aligned result to an .xlsx workbook
+    global BASE_TRACE, COMPARISON_TRACE
+    parser = argparse.ArgumentParser(description="Compare trace of GPU and NPU")
+    parser.add_argument("base_trace_path", help="base trace file path")
+    parser.add_argument("comparison_trace_path", help="comparison trace file path")
+    parser.add_argument("--output_path", help="性能数据比对结果的存放路径")  # directory for the result file
+    parser.add_argument("--max_kernel_num", type=int, help="每个torch op的kernel数量限制")  # kernel limit per op
+    parser.add_argument("--op_name_map", type=ast.literal_eval, default={},
+                        help="配置GPU OP与NPU OP等价的名称映射关系，以字典的形式传入")  # equivalent-name map (dict)
+    args = parser.parse_args()
+    if args.max_kernel_num is not None and args.max_kernel_num <= LIMIT_KERNEL:
+        raise RuntimeError(f"Invalid param, --max_kernel_num has to be greater than {LIMIT_KERNEL}")
+    if not isinstance(args.op_name_map, dict):
+        raise RuntimeError("Invalid param, --op_name_map must be dict, for example: --op_name_map={'name1':'name2'}")
+    base_top_layer_apis = get_top_layer_apis(args.base_trace_path, BASE_TYPE, args.max_kernel_num)
+    if BASE_TRACE_TYPE == GPU:  # BASE_TRACE_TYPE was set while loading the base trace above
+        BASE_TRACE += ' [GPU] : ' + os.path.basename(args.base_trace_path)
+    else:
+        BASE_TRACE += ' [NPU] : ' + os.path.basename(args.base_trace_path)
+    comparison_top_layer_apis = get_top_layer_apis(args.comparison_trace_path, COMPARISON_TYPE, args.max_kernel_num)
+    if COMPARISON_TRACE_TYPE == GPU:
+        COMPARISON_TRACE += ' [GPU] : ' + os.path.basename(args.comparison_trace_path)
+    else:
+        COMPARISON_TRACE += ' [NPU] : ' + os.path.basename(args.comparison_trace_path)
+    compare_result_data = compare(base_top_layer_apis, comparison_top_layer_apis, args.op_name_map)
+
+    if args.max_kernel_num is not None:  # expand ops above the kernel limit into their children until none remain
+        while have_to_drill_down(compare_result_data, args.max_kernel_num):
+            compare_result_data = drill_down(compare_result_data, args.max_kernel_num, args.op_name_map)
+
+    dir_path = args.output_path if args.output_path else "./"  # default: current directory
+    file_name = "torch_op_compare_{}.xlsx".format(time.strftime("%Y%m%d%H%M%S", time.localtime(time.time())))
+    result_file_path = os.path.join(dir_path, file_name)
+
+    wb = Workbook()
+    ws = wb.create_sheet("CompareResult", 0)
+    ws.sheet_properties.tabColor = '00CED1'
+    # write headers
+    base_trace_headers = GPU_HEADER if BASE_TRACE_TYPE == GPU else NPU_HEADER
+    comparison_trace_headers = GPU_HEADER if COMPARISON_TRACE_TYPE == GPU else NPU_HEADER
+    headers = base_trace_headers + comparison_trace_headers + [DIFF, OP_NAME_FILTER, DIFF_FILTER]
+    base_trace_start_column = 0  # 0-based offsets of the base, comparison and diff sections
+    comparison_trace_start_column = len(base_trace_headers)
+    diff_start_column = len(base_trace_headers) + len(comparison_trace_headers)
+
+    for col_index in range(len(headers)):  # row 1: section titles, row 2: column names
+        ws.cell(row=1, column=col_index + 1).border = BORDER
+        ws.cell(row=1, column=col_index + 1).font = Font(name='Arial', bold=True)
+        ws.cell(row=2, column=col_index + 1).border = BORDER
+        ws.cell(row=2, column=col_index + 1).font = Font(name='Arial', bold=True)
+        header_name = headers[col_index]
+        if col_index < comparison_trace_start_column:
+            ws.cell(row=1, column=col_index + 1).value = BASE_TRACE
+            ws.cell(row=1, column=col_index + 1).fill = FILL_DICT.get(BASE_TYPE)
+            ws.cell(row=2, column=col_index + 1).fill = FILL_DICT.get(BASE_TYPE)
+        elif col_index < diff_start_column:
+            ws.cell(row=1, column=col_index + 1).value = COMPARISON_TRACE
+            ws.cell(row=1, column=col_index + 1).fill = FILL_DICT.get(COMPARISON_TYPE)
+            ws.cell(row=2, column=col_index + 1).fill = FILL_DICT.get(COMPARISON_TYPE)
+        else:
+            ws.cell(row=1, column=col_index + 1).value = header_name
+            ws.cell(row=1, column=col_index + 1).fill = FILL_DICT.get(header_name)
+        ws.cell(row=2, column=col_index + 1).value = header_name
+        dim = ws.cell(row=2, column=col_index + 1).coordinate  # NOTE(review): dim[0] assumes a one-letter column
+        ws.column_dimensions[dim[0]].width = COLUMN_WIDTH.get(header_name)
+    ws.merge_cells(start_row=1, start_column=base_trace_start_column + 1,
+                   end_row=1, end_column=comparison_trace_start_column)
+    ws.merge_cells(start_row=1, start_column=comparison_trace_start_column + 1,
+                   end_row=1, end_column=diff_start_column)
+    ws.merge_cells(start_row=1, start_column=headers.index(DIFF) + 1,
+                   end_row=2, end_column=headers.index(DIFF) + 1)
+    ws.merge_cells(start_row=1, start_column=headers.index(OP_NAME_FILTER) + 1,
+                   end_row=2, end_column=headers.index(OP_NAME_FILTER) + 1)
+    ws.merge_cells(start_row=1, start_column=headers.index(DIFF_FILTER) + 1,
+                   end_row=2, end_column=headers.index(DIFF_FILTER) + 1)
+
+    # write lines
+    start_row_index = 3  # rows 1-2 hold the headers
+    for data in compare_result_data:
+        rows = create_data(data[0], data[1])
+        row_number = 0
+        for row in rows:
+            row_index = start_row_index + row_number
+            ws.cell(row=row_index, column=len(row) + 1).border = BORDER  # last header column gets no row value
+            for index, value in enumerate(row):
+                if index == headers.index(DIFF):
+                    ws.cell(row=row_index, column=index + 1).number_format = '0.00%'
+                    if value != NA and value < 0:  # negative diff: green on the DIFF and DIFF Filter cells
+                        ws.cell(row=row_index, column=index + 1).fill = PatternFill("solid", fgColor='0000FF00')
+                        ws.cell(row=row_index, column=index + 3).fill = PatternFill("solid", fgColor='0000FF00')
+                    if value != NA and value >= 0:  # zero or positive diff: red on the same two cells
+                        ws.cell(row=row_index, column=index + 1).fill = PatternFill("solid", fgColor='00FF0000')
+                        ws.cell(row=row_index, column=index + 3).fill = PatternFill("solid", fgColor='00FF0000')
+                if index in [key for key, value in enumerate(headers) if value == OP_NAME]:
+                    ws.cell(row=row_index, column=index + 1).font = Font(name='Arial', bold=True)
+                else:
+                    ws.cell(row=row_index, column=index + 1).font = Font(name='Arial')
+                ws.cell(row=row_index, column=index + 1).value = value
+                ws.cell(row=row_index, column=index + 1).border = BORDER
+            row_number += 1
+        if row_number > 1:
+            # merge the op-level cells vertically across all rows of this op
+            merged_index = set(
+                [key for key, value in enumerate(headers) if value in (OP_NAME, INPUT_SHAPE, INPUT_TYPE, DIFF)])
+            for col_index in merged_index:
+                ws.merge_cells(start_row=start_row_index, start_column=col_index + 1,
+                               end_row=start_row_index + row_number - 1, end_column=col_index + 1)
+        start_row_index = start_row_index + row_number
+
+    wb.save(result_file_path)
+    wb.close()
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()
diff --git a/debug/tools/distribute_tools/cluster_profiling_data_copyl.sh b/debug/tools/distribute_tools/cluster_profiling_data_copyl.sh
new file mode 100644
index 000000000..d3d301aea
--- /dev/null
+++ b/debug/tools/distribute_tools/cluster_profiling_data_copyl.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SSH="ssh -o StrictHostKeyChecking=no"  # not referenced below; host key checking is disabled
+SCP="scp -o StrictHostKeyChecking=no"  # expanded unquoted in rscp_pass so the options split into words
+
+# Print the node names (keys of "cluster") found in the cluster config, one per line.
+get_cluster_list()
+{
+    local cluster_config=$1
+    python3 -c 'import sys,json;[print(node) for node in json.load(sys.stdin)["cluster"].keys()]' < "${cluster_config}"
+}
+
+# Print the "user" of a node; the node name is passed as an argument, not spliced into Python code.
+get_node_user()
+{
+    local cluster_config=$1
+    local node=$2
+    python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"][sys.argv[1]]["user"])' "${node}" < "${cluster_config}"
+}
+
+# Print the "passwd" of a node; the node name is passed as an argument, not spliced into Python code.
+get_node_passwd()
+{
+    local cluster_config=$1
+    local node=$2
+    python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"][sys.argv[1]]["passwd"])' "${node}" < "${cluster_config}"
+}
+
+# Print the "dir" of a node; the node name is passed as an argument, not spliced into Python code.
+get_node_dir()
+{
+    local cluster_config=$1
+    local node=$2
+    python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"][sys.argv[1]]["dir"])' "${node}" < "${cluster_config}"
+}
+
+# Recursively copy src from user@node into the local target; password goes via SSHPASS, not argv.
+rscp_pass()
+{
+    local node="$1"
+    local user="$2"
+    local passwd="$3"
+    local src="$4"
+    local target="$5"
+    SSHPASS="${passwd}" sshpass -e ${SCP} -r "${user}"@"${node}":"${src}" "${target}"
+}
+
+# JSON file describing the cluster nodes and accounts used for the copy
+cluster_account_config_path=$1
+# Local directory the remote data is copied into
+target_dir=$2
+
+node_list=$(get_cluster_list "${cluster_account_config_path}")
+echo "-----begin----"
+
+for node in ${node_list}
+do
+    user=$(get_node_user "${cluster_account_config_path}" "${node}")
+    passwd=$(get_node_passwd "${cluster_account_config_path}" "${node}")
+    src_dir=$(get_node_dir "${cluster_account_config_path}" "${node}")
+    echo "------------------${user}@${node}---------------------"
+    rscp_pass "${node}" "${user}" "${passwd}" "${src_dir}" "${target_dir}"  # direct call: $(...) would run its output
+done
diff --git a/debug/tools/distribute_tools/config.json b/debug/tools/distribute_tools/config.json
new file mode 100644
index 000000000..04f49e728
--- /dev/null
+++ b/debug/tools/distribute_tools/config.json
@@ -0,0 +1,14 @@
+{
+  "cluster": {
+                "10.xxx.xxx.1": {
+                "user": "root",
+                "passwd": "xxx",
+                "dir": "/home/data/test"
+                },
+                "10.xxx.xxx.2": {
+                "user": "root",
+                "passwd": "xxx",
+                "dir": "/home/data/test"
+                }
+              }
+}
\ No newline at end of file
diff --git a/debug/tools/distribute_tools/distribute_modify_hostname.bash b/debug/tools/distribute_tools/distribute_modify_hostname.bash
new file mode 100644
index 000000000..c835ea51d
--- /dev/null
+++ b/debug/tools/distribute_tools/distribute_modify_hostname.bash
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+SSH="ssh -o StrictHostKeyChecking=no"  # expanded unquoted in change_hostname; host key checking is disabled
+
+# Set the hostname of node to "<user>-<node>" over ssh; password goes via SSHPASS, not argv.
+change_hostname()
+{
+    local node="$1"
+    local user="$2"
+    local passwd="$3"
+    SSHPASS="${passwd}" sshpass -e ${SSH} "${user}@${node}" hostname "${user}-${node}"
+}
+
+# Print the node names (keys of "cluster") found in the cluster config, one per line.
+get_cluster_list()
+{
+    local cluster_config=$1
+    python3 -c 'import sys,json;[print(node) for node in json.load(sys.stdin)["cluster"].keys()]' < "${cluster_config}"
+}
+
+# Print the "user" of a node; the node name is passed as an argument, not spliced into Python code.
+get_node_user()
+{
+    local cluster_config=$1
+    local node=$2
+    python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"][sys.argv[1]]["user"])' "${node}" < "${cluster_config}"
+}
+
+# Print the "passwd" of a node; the node name is passed as an argument, not spliced into Python code.
+get_node_passwd()
+{
+    local cluster_config=$1
+    local node=$2
+    python3 -c 'import sys,json;print(json.load(sys.stdin)["cluster"][sys.argv[1]]["passwd"])' "${node}" < "${cluster_config}"
+}
+
+cluster_account_config_path=$1  # JSON file describing the cluster nodes and accounts
+
+node_list=$(get_cluster_list "${cluster_account_config_path}")
+echo "-------begin--------"
+
+for node in ${node_list}
+do
+    user=$(get_node_user "${cluster_account_config_path}" "${node}")
+    passwd=$(get_node_passwd "${cluster_account_config_path}" "${node}")
+    echo "--------------${user}@${node}----------------"
+    change_hostname "${node}" "${user}" "${passwd}"  # direct call: $(...) would execute the command's output
+    echo "---"
+done
\ No newline at end of file
-- 
Gitee