From 1926b134830d14ad38e1aee7fc63cb880cb34cfc Mon Sep 17 00:00:00 2001
From: yuye <15675110562@163.com>
Date: Tue, 20 Jun 2023 17:03:51 +0800
Subject: [PATCH] Add merge_profiling_timeline tool

---
 .../tools/merge_profiling_timeline/README.md | 157 ++++++++++++++
 debug/tools/merge_profiling_timeline/main.py | 191 ++++++++++++++++++
 2 files changed, 348 insertions(+)
 create mode 100644 debug/tools/merge_profiling_timeline/README.md
 create mode 100644 debug/tools/merge_profiling_timeline/main.py

diff --git a/debug/tools/merge_profiling_timeline/README.md b/debug/tools/merge_profiling_timeline/README.md
new file mode 100644
index 000000000..b9df3f24e
--- /dev/null
+++ b/debug/tools/merge_profiling_timeline/README.md
@@ -0,0 +1,157 @@
# Profiling merge tool

## Introduction
This tool merges profiling timeline data. It can merge the timelines of specified ranks and merge specified items within a timeline.

## Usage
> When merging the timeline of a single machine with multiple cards, no cross-node time calibration is involved; skip section 1 entirely.

### 1 Generate time_difference.json, the startup-time offsets between the server and the clients

In the steps below, the server is the machine this code resides on, and all steps are executed on the server. Any node of the cluster can act as the server.

#### 1.1 Clone the code to any directory on any cluster node and enter the get_nodes_timediff directory

```shell
git clone https://gitee.com/aerfaliang/merge_profiling_timeline.git
cd merge_profiling_timeline/get_nodes_timediff
```

#### 1.2 Install the dependency sshpass (used in step 1.4 to run commands on the clients remotely)

##### Ubuntu

```shell
apt-get install sshpass
```

##### CentOS

```shell
# install from source
wget http://sourceforge.net/projects/sshpass/files/sshpass/1.05/sshpass-1.05.tar.gz
tar -xvzf sshpass-1.05.tar.gz
cd sshpass-1.05
./configure
make
make install

# or install with yum
yum -y install sshpass
```

#### 1.3 Edit nodeinfo.json in cluster-node order

The file holds the IP, user name, password, and port of every node.

Note: the nodes must be listed in cluster-node order. The sample below is illustrative; replace it with the information of all nodes in your cluster.

```shell
root@root:~/test/merge_profiling_timeline/get_nodes_timediff$ cat nodeinfo.json
{
    "cluster": {
        "90.90.66.62": {
            "user": "dcs-62",
            "pd": "password",
            "port": 22
        },
        "90.90.66.64": {
            "user": "dcs-64",
            "pd": "password",
            "port": 22
        }
    }
}
```

#### 1.4 Run get_nodes_timediff.sh to generate time_difference.json, the startup-time offsets between the server and the clients

```shell
# bash get_nodes_timediff.sh {IP of the current machine}
bash get_nodes_timediff.sh 90.90.66.62
```

#### 1.5 Check that time_difference.json has been generated in the same directory as the script

The file records each node's IP and the client's startup-time offset relative to the server (client startup time minus server startup time), in cluster-node order:

```shell
root@root:~/test/merge_profiling_timeline/get_nodes_timediff$ cat time_difference.json
{
    "90.90.66.62": -3.8049183785915375e-06,
    "90.90.66.64": -1.551163767464459
}
```
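The merge step (main.py, section 2 below) uses these offsets to calibrate timestamps across nodes: the offsets are in seconds, while the `ts` fields in msprof.json are in microseconds. A minimal sketch of that calibration, with example file names:

```python
import json

# Offsets from time_difference.json map node IP -> seconds (client - server).
with open("time_difference.json", encoding="utf-8") as f:
    time_diff = json.load(f)

# Entries follow cluster-node order, so the index identifies the node.
node_offsets = dict(enumerate(time_diff.values()))

with open("msprof.json", encoding="utf-8") as f:
    events = json.load(f)

node_idx = 0  # example: this msprof.json came from the first node
for event in events:
    # "ts" is in microseconds; the offset is in seconds.
    if event.get("ts") is not None:
        event["ts"] -= node_offsets[node_idx] * 1_000_000
```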
### 2 Merging multiple timelines

#### 2.1 Profile with msprof, then copy the profiling data of all nodes into one directory on the current machine; the steps below assume it is /home/test/all_cann_profiling

#### 2.2 Merge the timelines with main.py under merge_profiling_timeline

Options:
- -d: **required**, path to a profiling data file or directory
- -t: **required when merging timelines across nodes**, path to the time_difference.json file generated in step 1.4
- -o: optional, output directory of the merged timeline file; defaults to the path passed with '-d'
- --rank: optional, the ranks whose timelines are merged; all ranks by default
- --items: optional, the profiling items to merge; all items by default

Expected layout of the profiling data; msprof.json and info.json.* are required for merging:
```
|- cann_profiling
    |- PROF_***
        |- timeline
            |- msprof.json
        |- device_*
            |- info.json.*
        ...
    |- PROF_***
    ...
```

**Usage examples**:

1. Merge the timelines of a single machine with multiple cards; all cards and all items are merged by default:
```
python3 main.py -d path/to/cann_profiling/
```

2. Merge the timelines of a single machine, cards 0 and 1 only:
```
python3 main.py -d path/to/cann_profiling/ --rank 0,1
```

3. Merge the timelines of a single machine, the CANN and Ascend_Hardware items of all cards:
```
python3 main.py -d path/to/cann_profiling/ --items CANN,Ascend_Hardware
```

4. When merging the timelines of multiple machines, pass -t with the time-offset file so the timestamps of the nodes can be calibrated:
```
python3 main.py -d path/to/cann_profiling/ -t path/to/time_difference.json --rank 0,8
```

Viewing the merged timeline:
> The merged file is the msprof_merged_*p.json generated in the directory passed with -o (by default the -d directory).

### 3 Viewing very large timeline files

Run:
```
cd merge_profiling_timeline
python ./trace_processor --httpd path/to/msprof_merged_*p.json
```
Wait for loading to finish, refresh the [perfetto](https://ui.perfetto.dev/) page, and click `YES, use loaded trace` to display the timeline.
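Before loading a merged file, it can be sanity-checked from the command line. A small hypothetical snippet (not part of this tool; the file name is an example):

```python
import json
from collections import Counter

# The merged output is a flat list of trace events; after merging, each
# rank's processes have distinct pids and "_<rank>"-suffixed process names.
with open("msprof_merged_2p.json", encoding="utf-8") as f:
    events = json.load(f)

per_pid = Counter(event.get("pid") for event in events)
print(f"{len(events)} events across {len(per_pid)} pids")
```

diff --git a/debug/tools/merge_profiling_timeline/main.py b/debug/tools/merge_profiling_timeline/main.py
new file mode 100644
index 000000000..7cd9bd45a
--- /dev/null
+++ b/debug/tools/merge_profiling_timeline/main.py
@@ -0,0 +1,191 @@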
#!/usr/bin/python3
# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import time

from functools import partial
from argparse import ArgumentParser


FILTER_DIRS = [".profiler", "HCCL_PROF", "timeline", "query", "sqlite", "log"]
MAX_INDEX_COUNT = 1000


def get_node_time_diff(time_diff_file_path):
    """Read the time-difference file and map node index -> time offset."""
    if not time_diff_file_path:
        return None
    node_diff = {}
    with open(time_diff_file_path, 'r', encoding='utf-8') as f:
        all_time_diff = json.load(f)
    # Entries follow cluster-node order, so the index identifies the node.
    for node_idx, timediff in enumerate(all_time_diff.values()):
        node_diff[node_idx] = timediff
    return node_diff


def get_path_dir(path: str) -> list:
    """
    Check that the result path contains PROF_* dirs.
    path: result path
    """
    path_dir_filter = filter(partial(_path_dir_filter_func, root_dir=path), os.listdir(path))
    sub_dirs = list(path_dir_filter)
    if not sub_dirs:
        message = f"The path \"{path}\" does not have PROF dir. Please check the path."
        print(message)
    return sub_dirs


def _path_dir_filter_func(sub_path, root_dir):
    return sub_path not in FILTER_DIRS and os.path.isdir(os.path.realpath(os.path.join(root_dir, sub_path)))


def get_timeline_info(input_path, prof_dirs):
    """Map rank id -> path of that rank's msprof.json."""
    timeline_info = {}

    for prof in prof_dirs:
        pro_path = os.path.join(input_path, prof)

        # read rank_id from info.json.*
        rank_id = get_rank_id_from_info_json(pro_path)
        if rank_id is None:
            print(f"WARN, There is no rank id info in {pro_path}")
            continue

        tmp_path = os.path.realpath(os.path.join(pro_path, "timeline", "msprof.json"))
        if os.path.exists(tmp_path):
            timeline_info[rank_id] = tmp_path
        else:
            print(f"WARN, The file \"{tmp_path}\" does not exist.")
    return timeline_info


def get_rank_id_from_info_json(pro_path):
    """Find the first info.json.* under pro_path and return its rank_id."""
    for root, _, files in os.walk(pro_path):
        for file in files:
            if "info.json." in file and ".done" not in file:
                info_json = os.path.join(root, file)
                with open(info_json, "r") as f:
                    info = json.load(f)
                return info.get("rank_id")
    return None


def merge_timeline(timeline_info, args):
    """Merge the timelines of all PROF_* dirs into one trace file."""
    new_events = []

    # node index -> startup-time offset relative to the server, in seconds
    node_time_diff = get_node_time_diff(args.timediff) if args.timediff else None

    # merge only the given profiling items, if any
    merge_items = args.items.split(",") if args.items else None

    # merge only the given ranks, if any
    if args.rank:
        rank_ids = [int(rank_id) for rank_id in args.rank.split(",")]
    else:
        rank_ids = list(timeline_info.keys())

    for rank_id in rank_ids:
        timeline_file = timeline_info.get(rank_id)
        if timeline_file is None:
            print(f"WARN, No timeline file found for rank {rank_id}.")
            continue
        node = rank_id // 8  # assume 8 cards per node
        print("rank id: ", rank_id, "timeline file: ", timeline_file)

        # look up the time offset of the node this rank belongs to
        node_time = node_time_diff[node] if node_time_diff else None

        with open(timeline_file, 'r') as f:
            cur_events = json.load(f)

        # map process name -> pid from the metadata events
        proc_pid_dict = {}
        for event in cur_events:
            if event.get("name") == "process_name" and event.get("ph") == "M":
                if event.get("args"):
                    proc_pid_dict[event["args"].get("name")] = event.get("pid")
        process_list = merge_items if merge_items else list(proc_pid_dict.keys())

        # collect the pids of the items to be merged
        merged_pids = set()
        for pro in process_list:
            # item names use "_" where process names use spaces, e.g. Ascend_Hardware -> Ascend Hardware
            pro = " ".join(pro.split("_")) if "_" in pro else pro
            if pro not in proc_pid_dict:
                print(f"{pro} is an invalid item, valid items: {list(proc_pid_dict.keys())}")
                continue
            merged_pids.add(proc_pid_dict.get(pro))

        for event in cur_events:

            # keep only the selected items
            if event.get('pid') not in merged_pids:
                continue

            # calibrate timestamps when a cross-node time offset is available
            # ("ts" is in microseconds, the offset in seconds)
            if event.get("ts") and node_time:
                event["ts"] = event["ts"] - node_time * 1000000

            # make pids of the same process unique across ranks
            if isinstance(event.get("pid"), (str, int)):
                event["pid"] = int(''.join(x for x in str(event.get("pid")) if x.isdigit())
                                   + str(rank_id * MAX_INDEX_COUNT))

            # convert tid to int
            if isinstance(event.get("tid"), str):
                event["tid"] = int(''.join(x for x in event["tid"] if x.isdigit()))

            # append the rank id to the process name to tell ranks apart
            if event.get("name") == "process_name" and event.get("ph") == "M":
                if event.get("args") is not None and event["args"].get("name") is not None:
                    event["args"]["name"] = event["args"]["name"] + f"_{rank_id}"

            new_events.append(event)

    output_path = os.path.join(args.output, f"msprof_merged_{len(rank_ids)}p.json")
    with open(output_path, 'w') as f:
        json.dump(new_events, f)
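# Worked example of the pid remapping in merge_timeline() above: with
# MAX_INDEX_COUNT = 1000, an event with pid 123 from rank 2 becomes
# int("123" + str(2 * 1000)) = 1232000, so processes that share a pid in
# different ranks map to distinct process rows in perfetto.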
def parse_args():
    parser = ArgumentParser(description="Merge timeline for multi card")
    parser.add_argument("--data", "-d", required=True, help="root dir of PROF_* data")
    parser.add_argument("--timediff", "-t", default=None,
                        help="JSON file of the startup time differences between nodes")
    parser.add_argument("--output", "-o", default=None, help="save path of msprof_merged_*p.json")
    parser.add_argument("--rank", default=None,
                        help="list of ranks to be merged; all ranks are merged by default")
    parser.add_argument("--items", default=None,
                        help="items to be merged in the timeline; the options include"
                             " 'MsprofTx', 'AscendCL', 'Runtime', 'AI CPU',"
                             " 'Task Scheduler' and 'HCCL'")
    return parser.parse_args()


if __name__ == "__main__":
    start = time.time()
    args = parse_args()
    prof_dirs = get_path_dir(args.data)

    timeline_info = get_timeline_info(args.data, prof_dirs)
    if not args.output:
        args.output = args.data
    print("========================== start merge timeline ====================")
    merge_timeline(timeline_info, args)
    end = time.time()
    print(f"msprof.json merge finished, cost time: {end - start}s")
-- 
Gitee