From 1926b134830d14ad38e1aee7fc63cb880cb34cfc Mon Sep 17 00:00:00 2001
From: yuye <15675110562@163.com>
Date: Tue, 20 Jun 2023 17:03:51 +0800
Subject: [PATCH] Add merge_profiling_timeline tool

---
 .../tools/merge_profiling_timeline/README.md | 157 ++++++++++++++
 debug/tools/merge_profiling_timeline/main.py | 191 ++++++++++++++++++
 2 files changed, 348 insertions(+)
 create mode 100644 debug/tools/merge_profiling_timeline/README.md
 create mode 100644 debug/tools/merge_profiling_timeline/main.py

diff --git a/debug/tools/merge_profiling_timeline/README.md b/debug/tools/merge_profiling_timeline/README.md
new file mode 100644
index 000000000..b9df3f24e
--- /dev/null
+++ b/debug/tools/merge_profiling_timeline/README.md
@@ -0,0 +1,157 @@
# Profiling merge tool

## Introduction
This tool merges profiling timeline data. It can merge the timelines of specified ranks and merge specified items within a timeline.

## Usage
> When merging the timeline of a single machine with multiple cards, no cross-node time calibration is involved; skip section 1 entirely.

### 1 Generate time_difference.json, the startup-time offsets between the server and the clients

In the steps below, the server is the machine this code resides on, and all steps are executed on the server. Any node of the cluster can act as the server.

#### 1.1 Clone the code to any directory on any cluster node and enter the get_nodes_timediff directory

```shell
git clone https://gitee.com/aerfaliang/merge_profiling_timeline.git
cd merge_profiling_timeline/get_nodes_timediff
```

#### 1.2 Install the dependency sshpass (used in step 1.4 to run commands on the clients remotely)

##### Ubuntu

```shell
apt-get install sshpass
```

##### CentOS

```shell
# install from source
wget http://sourceforge.net/projects/sshpass/files/sshpass/1.05/sshpass-1.05.tar.gz
tar -xvzf sshpass-1.05.tar.gz
cd sshpass-1.05
./configure
make
make install

# or install with yum
yum -y install sshpass
```

#### 1.3 Edit nodeinfo.json in cluster-node order

The file holds the IP, user name, password, and port of every node.

Note: the nodes must be listed in cluster-node order. The sample below is illustrative; replace it with the information of all nodes in your cluster.

```shell
root@root:~/test/merge_profiling_timeline/get_nodes_timediff$ cat nodeinfo.json
{
    "cluster": {
        "90.90.66.62": {
            "user": "dcs-62",
            "pd": "password",
            "port": 22
        },
        "90.90.66.64": {
            "user": "dcs-64",
            "pd": "password",
            "port": 22
        }
    }
}
```

#### 1.4 Run get_nodes_timediff.sh to generate time_difference.json, the startup-time offsets between the server and the clients

```shell
# bash get_nodes_timediff.sh {IP of the current machine}
bash get_nodes_timediff.sh 90.90.66.62
```

#### 1.5 Check that time_difference.json has been generated in the same directory as the script

The file records each node's IP and the client's startup-time offset relative to the server (client startup time minus server startup time), in cluster-node order:

```shell
root@root:~/test/merge_profiling_timeline/get_nodes_timediff$ cat time_difference.json
{
    "90.90.66.62": -3.8049183785915375e-06,
    "90.90.66.64": -1.551163767464459
}
```
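The merge step (main.py, section 2 below) uses these offsets to calibrate timestamps across nodes: the offsets are in seconds, while the `ts` fields in msprof.json are in microseconds. A minimal sketch of that calibration, with example file names:

```python
import json

# Offsets from time_difference.json map node IP -> seconds (client - server).
with open("time_difference.json", encoding="utf-8") as f:
    time_diff = json.load(f)

# Entries follow cluster-node order, so the index identifies the node.
node_offsets = dict(enumerate(time_diff.values()))

with open("msprof.json", encoding="utf-8") as f:
    events = json.load(f)

node_idx = 0  # example: this msprof.json came from the first node
for event in events:
    # "ts" is in microseconds; the offset is in seconds.
    if event.get("ts") is not None:
        event["ts"] -= node_offsets[node_idx] * 1_000_000
```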
### 2 Merging multiple timelines

#### 2.1 Profile with msprof, then copy the profiling data of all nodes into one directory on the current machine; the steps below assume it is /home/test/all_cann_profiling

#### 2.2 Merge the timelines with main.py under merge_profiling_timeline

Options:
- -d: **required**, path to a profiling data file or directory
- -t: **required when merging timelines across nodes**, path to the time_difference.json file generated in step 1.4
- -o: optional, output directory of the merged timeline file; defaults to the path passed with '-d'
- --rank: optional, the ranks whose timelines are merged; all ranks by default
- --items: optional, the profiling items to merge; all items by default

Expected layout of the profiling data; msprof.json and info.json.* are required for merging:
```
|- cann_profiling
    |- PROF_***
        |- timeline
            |- msprof.json
        |- device_*
            |- info.json.*
        ...
    |- PROF_***
    ...
```

**Usage examples**:

1. Merge the timelines of a single machine with multiple cards; all cards and all items are merged by default:
```
python3 main.py -d path/to/cann_profiling/
```

2. Merge the timelines of a single machine, cards 0 and 1 only:
```
python3 main.py -d path/to/cann_profiling/ --rank 0,1
```

3. Merge the timelines of a single machine, the CANN and Ascend_Hardware items of all cards:
```
python3 main.py -d path/to/cann_profiling/ --items CANN,Ascend_Hardware
```

4. When merging the timelines of multiple machines, pass -t with the time-offset file so the timestamps of the nodes can be calibrated:
```
python3 main.py -d path/to/cann_profiling/ -t path/to/time_difference.json --rank 0,8
```

Viewing the merged timeline:
> The merged file is the msprof_merged_*p.json generated in the directory passed with -o (by default the -d directory).

### 3 Viewing very large timeline files

Run:
```
cd merge_profiling_timeline
python ./trace_processor --httpd path/to/msprof_merged_*p.json
```
Wait for loading to finish, refresh the [perfetto](https://ui.perfetto.dev/) page, and click `YES, use loaded trace` to display the timeline.
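Before loading a merged file, it can be sanity-checked from the command line. A small hypothetical snippet (not part of this tool; the file name is an example):

```python
import json
from collections import Counter

# The merged output is a flat list of trace events; after merging, each
# rank's processes have distinct pids and "_<rank>"-suffixed process names.
with open("msprof_merged_2p.json", encoding="utf-8") as f:
    events = json.load(f)

per_pid = Counter(event.get("pid") for event in events)
print(f"{len(events)} events across {len(per_pid)} pids")
```

diff --git a/debug/tools/merge_profiling_timeline/main.py b/debug/tools/merge_profiling_timeline/main.py
new file mode 100644
index 000000000..7cd9bd45a
--- /dev/null
+++ b/debug/tools/merge_profiling_timeline/main.py
@@ -0,0 +1,191 @@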
#!/usr/bin/python3
# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import time

from functools import partial
from argparse import ArgumentParser


FILTER_DIRS = [".profiler", "HCCL_PROF", "timeline", "query", "sqlite", "log"]
MAX_INDEX_COUNT = 1000


def get_node_time_diff(time_diff_file_path):
    """Read the time-difference file and map node index -> time offset."""
    if not time_diff_file_path:
        return None
    node_diff = {}
    with open(time_diff_file_path, 'r', encoding='utf-8') as f:
        all_time_diff = json.load(f)
    # Entries follow cluster-node order, so the index identifies the node.
    for node_idx, timediff in enumerate(all_time_diff.values()):
        node_diff[node_idx] = timediff
    return node_diff


def get_path_dir(path: str) -> list:
    """
    Check that the result path contains PROF_* dirs.
    path: result path
    """
    path_dir_filter = filter(partial(_path_dir_filter_func, root_dir=path), os.listdir(path))
    sub_dirs = list(path_dir_filter)
    if not sub_dirs:
        message = f"The path \"{path}\" does not have PROF dir. Please check the path."
        print(message)
    return sub_dirs


def _path_dir_filter_func(sub_path, root_dir):
    return sub_path not in FILTER_DIRS and os.path.isdir(os.path.realpath(os.path.join(root_dir, sub_path)))


def get_timeline_info(input_path, prof_dirs):
    """Map rank id -> path of that rank's msprof.json."""
    timeline_info = {}

    for prof in prof_dirs:
        pro_path = os.path.join(input_path, prof)

        # read rank_id from info.json.*
        rank_id = get_rank_id_from_info_json(pro_path)
        if rank_id is None:
            print(f"WARN, There is no rank id info in {pro_path}")
            continue

        tmp_path = os.path.realpath(os.path.join(pro_path, "timeline", "msprof.json"))
        if os.path.exists(tmp_path):
            timeline_info[rank_id] = tmp_path
        else:
            print(f"WARN, The file \"{tmp_path}\" does not exist.")
    return timeline_info


def get_rank_id_from_info_json(pro_path):
    """Find the first info.json.* under pro_path and return its rank_id."""
    for root, _, files in os.walk(pro_path):
        for file in files:
            if "info.json." in file and ".done" not in file:
                info_json = os.path.join(root, file)
                with open(info_json, "r") as f:
                    info = json.load(f)
                return info.get("rank_id")
    return None


def merge_timeline(timeline_info, args):
    """Merge the timelines of all PROF_* dirs into one trace file."""
    new_events = []

    # node index -> startup-time offset relative to the server, in seconds
    node_time_diff = get_node_time_diff(args.timediff) if args.timediff else None

    # merge only the given profiling items, if any
    merge_items = args.items.split(",") if args.items else None

    # merge only the given ranks, if any
    if args.rank:
        rank_ids = [int(rank_id) for rank_id in args.rank.split(",")]
    else:
        rank_ids = list(timeline_info.keys())

    for rank_id in rank_ids:
        timeline_file = timeline_info.get(rank_id)
        if timeline_file is None:
            print(f"WARN, No timeline file found for rank {rank_id}.")
            continue
        node = rank_id // 8  # assume 8 cards per node
        print("rank id: ", rank_id, "timeline file: ", timeline_file)

        # look up the time offset of the node this rank belongs to
        node_time = node_time_diff[node] if node_time_diff else None

        with open(timeline_file, 'r') as f:
            cur_events = json.load(f)

        # map process name -> pid from the metadata events
        proc_pid_dict = {}
        for event in cur_events:
            if event.get("name") == "process_name" and event.get("ph") == "M":
                if event.get("args"):
                    proc_pid_dict[event["args"].get("name")] = event.get("pid")
        process_list = merge_items if merge_items else list(proc_pid_dict.keys())

        # collect the pids of the items to be merged
        merged_pids = set()
        for pro in process_list:
            # item names use "_" where process names use spaces, e.g. Ascend_Hardware -> Ascend Hardware
            pro = " ".join(pro.split("_")) if "_" in pro else pro
            if pro not in proc_pid_dict:
                print(f"{pro} is an invalid item, valid items: {list(proc_pid_dict.keys())}")
                continue
            merged_pids.add(proc_pid_dict.get(pro))

        for event in cur_events:

            # keep only the selected items
            if event.get('pid') not in merged_pids:
                continue

            # calibrate timestamps when a cross-node time offset is available
            # ("ts" is in microseconds, the offset in seconds)
            if event.get("ts") and node_time:
                event["ts"] = event["ts"] - node_time * 1000000

            # make pids of the same process unique across ranks
            if isinstance(event.get("pid"), (str, int)):
                event["pid"] = int(''.join(x for x in str(event.get("pid")) if x.isdigit())
                                   + str(rank_id * MAX_INDEX_COUNT))

            # convert tid to int
            if isinstance(event.get("tid"), str):
                event["tid"] = int(''.join(x for x in event["tid"] if x.isdigit()))

            # append the rank id to the process name to tell ranks apart
            if event.get("name") == "process_name" and event.get("ph") == "M":
                if event.get("args") is not None and event["args"].get("name") is not None:
                    event["args"]["name"] = event["args"]["name"] + f"_{rank_id}"

            new_events.append(event)

    output_path = os.path.join(args.output, f"msprof_merged_{len(rank_ids)}p.json")
    with open(output_path, 'w') as f:
        json.dump(new_events, f)
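# Worked example of the pid remapping in merge_timeline() above: with
# MAX_INDEX_COUNT = 1000, an event with pid 123 from rank 2 becomes
# int("123" + str(2 * 1000)) = 1232000, so processes that share a pid in
# different ranks map to distinct process rows in perfetto.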
def parse_args():
    parser = ArgumentParser(description="Merge timeline for multi card")
    parser.add_argument("--data", "-d", required=True, help="root dir of PROF_* data")
    parser.add_argument("--timediff", "-t", default=None,
                        help="JSON file of the startup time differences between nodes")
    parser.add_argument("--output", "-o", default=None, help="save path of msprof_merged_*p.json")
    parser.add_argument("--rank", default=None,
                        help="list of ranks to be merged; all ranks are merged by default")
    parser.add_argument("--items", default=None,
                        help="items to be merged in the timeline; the options include"
                             " 'MsprofTx', 'AscendCL', 'Runtime', 'AI CPU',"
                             " 'Task Scheduler' and 'HCCL'")
    return parser.parse_args()


if __name__ == "__main__":
    start = time.time()
    args = parse_args()
    prof_dirs = get_path_dir(args.data)

    timeline_info = get_timeline_info(args.data, prof_dirs)
    if not args.output:
        args.output = args.data
    print("========================== start merge timeline ====================")
    merge_timeline(timeline_info, args)
    end = time.time()
    print(f"msprof.json merge finished, cost time: {end - start}s")
-- 
Gitee